Merge branch 'develop' into feature/gpu-port

2025-10-24 17:54:47 +01:00 · 2018-12-13 05:11:34 +00:00
parent adbdc4e65b c509bd3fe2
commit b57a4d32aa
647 changed files with 49155 additions and 11160 deletions
--- a/Grid/tensors/Tensor_Ta.h
+++ b/Grid/tensors/Tensor_Ta.h
@@ -0,0 +1,124 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_Ta.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_TA_H
+#define GRID_MATH_TA_H
+
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////// 
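+// Ta function for scalar, vector, matrix: scalar/vector recurse into their contents; matrix keeps the anti-Hermitian part and subtracts trace/N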
+/////////////////////////////////////////////// 
+/*
+  accelerator_inline ComplexF Ta( const ComplexF &arg){    return arg;}
+  accelerator_inline ComplexD Ta( const ComplexD &arg){    return arg;}
+  accelerator_inline RealF Ta( const RealF &arg){    return arg;}
+  accelerator_inline RealD Ta( const RealD &arg){    return arg;}
+*/
+
+template<class vtype> accelerator_inline iScalar<vtype> Ta(const iScalar<vtype>&r)
+{
+  iScalar<vtype> wrapped;               // iScalar carries no index of its own,
+  wrapped._internal = Ta(r._internal);  // so Ta is simply forwarded to the wrapped object
+  return wrapped;
+}
+template<class vtype,int N> accelerator_inline iVector<vtype,N> Ta(const iVector<vtype,N>&r)
+{
+  iVector<vtype,N> out;
+  for(int comp=0;comp<N;++comp){ // Ta is applied to each of the N components independently
+    out._internal[comp] = Ta(r._internal[comp]);
+  }
+  return out;
+}
+template<class vtype,int N> accelerator_inline iMatrix<vtype,N> Ta(const iMatrix<vtype,N> &arg)
+{
+  iMatrix<vtype,N> ret;
+  // Two steps: keep the anti-Hermitian part of arg, then subtract trace/N from it
+  double factor = (1.0/(double)N);
+  ret= (arg - adj(arg))*0.5;       // (arg - arg^dagger)/2
+  ret=ret - (trace(ret)*factor);   // matrix minus scalar; presumably acts on the diagonal only (the sub overloads are not in this chunk) - TODO confirm
+  return ret;
+}
+
+
+/////////////////////////////////////////////// 
+// ProjectOnGroup function for scalar, vector, matrix 
+// Projects on orthogonal, unitary group: the matrix overload orthonormalises the rows; the determinant is left as is
+/////////////////////////////////////////////// 
+
+
+template<class vtype> accelerator_inline iScalar<vtype> ProjectOnGroup(const iScalar<vtype>&r)
+{
+  iScalar<vtype> wrapped;                           // no index at this level:
+  wrapped._internal = ProjectOnGroup(r._internal);  // the projection is forwarded to the wrapped object
+  return wrapped;
+}
+template<class vtype,int N> accelerator_inline iVector<vtype,N> ProjectOnGroup(const iVector<vtype,N>&r)
+{
+  iVector<vtype,N> out;
+  for(int comp=0;comp<N;++comp){ // each component is projected on its own
+    out._internal[comp] = ProjectOnGroup(r._internal[comp]);
+  }
+  return out;
+}
+template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
+accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
+{
+  // need a check for the group type?
+  iMatrix<vtype,N> ret(arg);  // work on a copy: rows are normalised and made mutually orthogonal in place
+  vtype nrm;
+  vtype inner;
+  for(int c1=0;c1<N;c1++){
+    zeroit(inner);	// accumulate the squared norm of row c1
+    for(int c2=0;c2<N;c2++)
+      inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
+
+    nrm = rsqrt(inner);  // presumably 1/sqrt(inner); there is no guard against a zero row
+    for(int c2=0;c2<N;c2++)
+      ret._internal[c1][c2]*= nrm;
+    // remove from every later row b its component along the (now normalised) row c1
+    for (int b=c1+1; b<N; ++b){
+      decltype(ret._internal[b][b]*ret._internal[b][b]) pr;
+      zeroit(pr);
+      for(int c=0; c<N; ++c)
+	pr += conjugate(ret._internal[c1][c])*ret._internal[b][c];  // pr = sum_c conj(row c1)[c] * (row b)[c]
+	  
+      for(int c=0; c<N; ++c){
+	ret._internal[b][c] -= pr * ret._internal[c1][c];
+      }
+    }
+	  
+  }
+  // assuming the determinant is ok
+  return ret;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_arith.h
+++ b/Grid/tensors/Tensor_arith.h
@@ -0,0 +1,38 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_arith.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_ARITH_H
+#define GRID_MATH_ARITH_H
+
+#include "Tensor_arith_add.h"
+#include "Tensor_arith_sub.h"
+#include "Tensor_arith_mac.h"
+#include "Tensor_arith_mul.h"
+#include "Tensor_arith_scalar.h"
+
+#endif
+
--- a/Grid/tensors/Tensor_arith_add.h
+++ b/Grid/tensors/Tensor_arith_add.h
@@ -0,0 +1,142 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_arith_add.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_ARITH_ADD_H
+#define GRID_MATH_ARITH_ADD_H
+
+NAMESPACE_BEGIN(Grid);
+    
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////// ADD         ///////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// ADD is simple for now: straightforward templates that do not mix tensor shapes
+// Scalar +/- Scalar
+// Vector +/- Vector
+// Matrix +/- Matrix (plus Scalar + Matrix and Matrix + Scalar, where the scalar acts on the diagonal)
+template<class vtype,class ltype,class rtype> accelerator_inline void add(iScalar<vtype> * __restrict__ ret,
+									  const iScalar<ltype> * __restrict__ lhs,
+									  const iScalar<rtype> * __restrict__ rhs)
+{
+  add(&ret->_internal,&lhs->_internal,&rhs->_internal); // iScalar has no index: forward to add() of the contained type
+}
+template<class vtype,class ltype,class rtype,int N> accelerator_inline void add(iVector<vtype,N> * __restrict__ ret,
+									   const iVector<ltype,N> * __restrict__ lhs,
+									   const iVector<rtype,N> * __restrict__ rhs)
+{
+  // Component-wise sum, using operator+ of the contained types
+  for(int comp=0;comp<N;++comp){
+    ret->_internal[comp]=lhs->_internal[comp]+rhs->_internal[comp];
+  }
+}
+  
+template<class vtype,class ltype,class rtype, int N> accelerator_inline  void add(iMatrix<vtype,N> * __restrict__ ret,
+									     const iMatrix<ltype,N> * __restrict__ lhs,
+									     const iMatrix<rtype,N> * __restrict__ rhs)
+{
+  // Entry-by-entry sum, delegated to add() of the contained type
+  for(int col=0;col<N;++col){
+    for(int row=0;row<N;++row){
+      add(&ret->_internal[row][col],&lhs->_internal[row][col],&rhs->_internal[row][col]);
+    }}
+}
+// ret = lhs + rhs for a scalar lhs and a matrix rhs: the scalar acts as lhs*identity, i.e. it is added on the diagonal only
+template<class vtype,class ltype,class rtype, int N> accelerator_inline  void add(iMatrix<vtype,N> * __restrict__ ret,
+									     const iScalar<ltype>   * __restrict__ lhs,
+									     const iMatrix<rtype,N> * __restrict__ rhs)
+{
+  for(int c2=0;c2<N;c2++){
+    for(int c1=0;c1<N;c1++){
+      if ( c1==c2)
+	add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+      else
+	ret->_internal[c1][c2]=rhs->_internal[c1][c2]; // off-diagonal: copy the matrix operand (was wrongly indexing the scalar lhs)
+    }}
+}
+template<class vtype,class ltype,class rtype, int N> accelerator_inline  void add(iMatrix<vtype,N> * __restrict__ ret,
+									     const iMatrix<ltype,N> * __restrict__ lhs,
+									     const iScalar<rtype>   * __restrict__ rhs)
+{
+  // The scalar rhs is added on the diagonal only; every other entry is copied from lhs
+  for(int col=0;col<N;++col){
+    for(int row=0;row<N;++row){
+      if ( row!=col )
+	ret->_internal[row][col]=lhs->_internal[row][col];
+      else
+	add(&ret->_internal[row][col],&lhs->_internal[row][col],&rhs->_internal);
+    }}
+}
+
+
+// + operator for scalar, vector, matrix
+template<class ltype,class rtype>
+accelerator_inline auto operator + (const iScalar<ltype>& lhs,const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal + rhs._internal)>
+{
+  // The result wraps whatever type the contained objects yield when added
+  iScalar<decltype(lhs._internal+rhs._internal)> sum;
+  add(&sum,&lhs,&rhs);
+  return sum;
+}
+template<class ltype,class rtype,int N>
+accelerator_inline auto operator + (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]+rhs._internal[0]),N>
+{
+  // Component type of the result is the type of a sum of one component from each side
+  iVector<decltype(lhs._internal[0]+rhs._internal[0]),N> sum;
+  add(&sum,&lhs,&rhs);
+  return sum;
+}
+template<class ltype,class rtype,int N>
+accelerator_inline auto operator + (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N>
+{
+  // Entry type of the result is the type of a sum of one entry from each side
+  iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N> sum;
+  add(&sum,&lhs,&rhs);
+  return sum;
+}
+template<class ltype,class rtype,int N>
+accelerator_inline auto operator + (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N>
+{
+  // Scalar + matrix gives a matrix; the work is done by the matching add() overload
+  iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N> sum;
+  add(&sum,&lhs,&rhs);
+  return sum;
+}
+
+template<class ltype,class rtype,int N>
+accelerator_inline auto operator + (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N>
+{
+  // Matrix + scalar gives a matrix; the work is done by the matching add() overload
+  iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N> sum;
+  add(&sum,&lhs,&rhs);
+  return sum;
+}
+
+NAMESPACE_END(Grid);
+
+
+#endif
--- a/Grid/tensors/Tensor_arith_mac.h
+++ b/Grid/tensors/Tensor_arith_mac.h
@@ -0,0 +1,108 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_arith_mac.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_ARITH_MAC_H
+#define GRID_MATH_ARITH_MAC_H
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////// MAC         ///////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////////
+
+///////////////////////////
+// Legal multiplication table
+///////////////////////////
+// scal x scal = scal
+// mat x  mat  = mat
+// mat  x scal = mat
+// scal x mat  = mat
+// mat  x vec  = vec
+// vec  x scal = vec
+// scal x vec  = vec
+///////////////////////////
+template<class rtype,class vtype,class mtype>
+accelerator_inline  void mac(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
+{
+  mac(&ret->_internal,&lhs->_internal,&rhs->_internal); // iScalar has no index: forward to mac() of the contained type
+}
+template<class rrtype,class ltype,class rtype,int N>
+accelerator_inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
+  for(int c3=0;c3<N;c3++){ // c3 is the contracted index and is deliberately the outermost loop
+    for(int c1=0;c1<N;c1++){
+      for(int c2=0;c2<N;c2++){
+        mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]); // ret is never reset here; presumably mac() accumulates into it - TODO confirm
+      }}}
+  return;
+}
+
+template<class rrtype,class ltype,class rtype,int N>
+accelerator_inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
+  // Matrix times scalar: every entry of lhs is paired with the same scalar and handed to mac()
+  for(int row=0;row<N;++row){
+    for(int col=0;col<N;++col){
+      mac(&ret->_internal[row][col],&lhs->_internal[row][col],&rhs->_internal);
+    }}
+}
+template<class rrtype,class ltype,class rtype,int N>
+accelerator_inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iScalar<ltype> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
+  // Scalar times matrix: the same scalar is paired with every entry of rhs and handed to mac()
+  for(int row=0;row<N;++row){
+    for(int col=0;col<N;++col){
+      mac(&ret->_internal[row][col],&lhs->_internal,&rhs->_internal[row][col]);
+    }}
+}
+template<class rrtype,class ltype,class rtype,int N>
+accelerator_inline void mac(iVector<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iVector<rtype,N> * __restrict__ rhs)
+{
+  // Matrix times vector: each row of lhs is contracted with rhs, one term at a time through mac()
+  for(int row=0;row<N;++row){
+    for(int k=0;k<N;++k){
+      mac(&ret->_internal[row],&lhs->_internal[row][k],&rhs->_internal[k]);
+    }}
+}
+template<class rrtype,class ltype,class rtype,int N>
+accelerator_inline void mac(iVector<rrtype,N> * __restrict__ ret,const iScalar<ltype> * __restrict__ lhs,const iVector<rtype,N> * __restrict__ rhs)
+{
+  // Scalar times vector: the same scalar is paired with each component of rhs
+  for(int comp=0;comp<N;++comp){
+    mac(&ret->_internal[comp],&lhs->_internal,&rhs->_internal[comp]);
+  }
+}
+template<class rrtype,class ltype,class rtype,int N>
+accelerator_inline void mac(iVector<rrtype,N> * __restrict__ ret,const iVector<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs)
+{
+  // Vector times scalar: each component of lhs is paired with the same scalar
+  for(int comp=0;comp<N;++comp){
+    mac(&ret->_internal[comp],&lhs->_internal[comp],&rhs->_internal);
+  }
+}
+NAMESPACE_END(Grid);
+
+
+#endif
--- a/Grid/tensors/Tensor_arith_mul.h
+++ b/Grid/tensors/Tensor_arith_mul.h
@@ -0,0 +1,252 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_arith_mul.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_ARITH_MUL_H
+#define GRID_MATH_ARITH_MUL_H
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////// MUL         ///////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+    
+template<class rtype,class vtype,class mtype>
+accelerator_inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> * __restrict__ lhs,const iScalar<vtype> * __restrict__ rhs){
+  mult(&ret->_internal,&lhs->_internal,&rhs->_internal); // iScalar has no index: forward to mult() of the contained type
+}
+
+template<class rrtype,class ltype,class rtype,int N>
+accelerator_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
+  for(int c1=0;c1<N;c1++){ // first pass: the contracted-index-0 term goes through mult(), which presumably overwrites ret (no prior initialisation is done)
+    for(int c2=0;c2<N;c2++){
+      mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
+    }
+  }
+  for(int c1=0;c1<N;c1++){ // second pass: the remaining terms c3=1..N-1 are folded in with mac()
+    for(int c3=1;c3<N;c3++){
+      for(int c2=0;c2<N;c2++){
+	mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
+      }
+    }
+  }
+  return;
+}
+
+template<class rrtype,class ltype,class rtype,int N>
+accelerator_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
+  // Matrix times scalar: every entry of lhs is multiplied by the same scalar
+  for(int col=0;col<N;++col){
+    for(int row=0;row<N;++row){
+      mult(&ret->_internal[row][col],&lhs->_internal[row][col],&rhs->_internal);
+    }}
+}
+
+template<class rrtype,class ltype,class rtype, int N>
+accelerator_inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iScalar<ltype>   * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
+  // Scalar times matrix: the same scalar multiplies every entry of rhs
+  for(int col=0;col<N;++col){
+    for(int row=0;row<N;++row){
+      mult(&ret->_internal[row][col],&lhs->_internal,&rhs->_internal[row][col]);
+    }}
+}
+// Matrix left multiplies vector
+template<class rtype,class vtype,class mtype,int N>
+accelerator_inline void mult(iVector<rtype,N> * __restrict__ ret,const iMatrix<mtype,N> * __restrict__ lhs,const iVector<vtype,N> * __restrict__ rhs)
+{
+  // Per row: the k==0 term is written with mult(), the remaining terms are added with mac()
+  for(int row=0;row<N;++row){
+    mult(&ret->_internal[row],&lhs->_internal[row][0],&rhs->_internal[0]);
+    for(int k=1;k<N;++k){
+      mac(&ret->_internal[row],&lhs->_internal[row][k],&rhs->_internal[k]);
+    }
+  }
+}
+template<class rtype,class vtype,class mtype,int N>
+accelerator_inline void mult(iVector<rtype,N> * __restrict__ ret,
+			const iScalar<mtype>   * __restrict__ lhs,
+			const iVector<vtype,N> * __restrict__ rhs){
+  for(int comp=0;comp<N;++comp){ // scalar times vector: the same scalar multiplies every component
+    mult(&ret->_internal[comp],&lhs->_internal,&rhs->_internal[comp]);
+  }
+}
+template<class rtype,class vtype,class mtype,int N>
+accelerator_inline void mult(iVector<rtype,N> * __restrict__ ret,
+			const iVector<vtype,N> * __restrict__ lhs,
+			const iScalar<mtype> * __restrict__ rhs){
+  // ret = lhs * rhs, vector times scalar; operands are now named by position (they were previously labelled the wrong way round)
+  for(int c1=0;c1<N;c1++){
+    mult(&ret->_internal[c1],&lhs->_internal[c1],&rhs->_internal);
+  }}
+    
+
+
+template<class rtype,class vtype,class mtype,int N> accelerator_inline
+iVector<rtype,N> operator * (const iMatrix<mtype,N>& lhs,const iVector<vtype,N>& rhs)
+{ // NOTE(review): rtype occurs only in the return type, so it cannot be deduced for `m*v`; such expressions resolve to the decltype-based operator* overloads in this file
+  iVector<rtype,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+
+template<class rtype,class vtype,class mtype,int N> accelerator_inline
+iVector<rtype,N> operator * (const iScalar<mtype>& lhs,const iVector<vtype,N>& rhs)
+{ // NOTE(review): rtype occurs only in the return type, so it cannot be deduced for `s*v`; such expressions resolve to the decltype-based operator* overloads in this file
+  iVector<rtype,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+
+template<class rtype,class vtype,class mtype,int N> accelerator_inline
+iVector<rtype,N> operator * (const iVector<mtype,N>& lhs,const iScalar<vtype>& rhs)
+{ // NOTE(review): rtype occurs only in the return type, so it cannot be deduced for `v*s`; such expressions resolve to the decltype-based operator* overloads in this file
+  iVector<rtype,N> ret;
+  mult(&ret,&lhs,&rhs);
+  return ret;
+}
+
+//////////////////////////////////////////////////////////////////
+// Divide by scalar
+//////////////////////////////////////////////////////////////////
+template<class rtype,class vtype> accelerator_inline
+iScalar<rtype> operator / (const iScalar<rtype>& lhs,const iScalar<vtype>& rhs)
+{
+  iScalar<rtype> quotient; // the result keeps the left operand's contained type
+  quotient._internal = lhs._internal/rhs._internal;
+  return quotient;
+}
+template<class rtype,class vtype,int N> accelerator_inline
+iVector<rtype,N> operator / (const iVector<rtype,N>& lhs,const iScalar<vtype>& rhs)
+{
+  iVector<rtype,N> quotient;
+  for(int comp=0;comp<N;++comp){ // every component is divided by the same scalar
+    quotient._internal[comp] = lhs._internal[comp]/rhs._internal;
+  }
+  return quotient;
+}
+template<class rtype,class vtype,int N> accelerator_inline
+iMatrix<rtype,N> operator / (const iMatrix<rtype,N>& lhs,const iScalar<vtype>& rhs)
+{
+  iMatrix<rtype,N> quotient;
+  for(int row=0;row<N;++row){ // every entry is divided by the same scalar
+    for(int col=0;col<N;++col){
+      quotient._internal[row][col] = lhs._internal[row][col]/rhs._internal;
+    }}
+  return quotient;
+}
+    
+//////////////////////////////////////////////////////////////////
+// Glue operators to mult routines. Must resolve return type cleverly from typeof(internal)
+// since nesting matrix<scalar> x matrix<matrix>-> matrix<matrix>
+// while         matrix<scalar> x matrix<scalar>-> matrix<scalar>
+// so return type depends on argument types in nasty way.
+//////////////////////////////////////////////////////////////////
+// scal x scal = scal
+// mat x  mat  = mat
+// mat  x scal = mat
+// scal x mat  = mat
+// mat  x vec  = vec
+// vec  x scal = vec
+// scal x vec  = vec
+//
+// We can special case scalar_type ??
+template<class l,class r>
+accelerator_inline auto operator * (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(lhs._internal * rhs._internal)>
+{
+  // The result wraps whatever type the contained objects yield when multiplied
+  iScalar<decltype(lhs._internal*rhs._internal)> product;
+  mult(&product,&lhs,&rhs);
+  return product;
+}
+template<class l,class r,int N> accelerator_inline
+auto operator * (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal[0][0]),N>
+{
+  // Entry type of the result is the type of a product of one entry from each side
+  iMatrix<decltype(lhs._internal[0][0]*rhs._internal[0][0]),N> product;
+  mult(&product,&lhs,&rhs);
+  return product;
+}
+template<class l,class r, int N> accelerator_inline
+auto operator * (const iMatrix<r,N>& lhs,const iScalar<l>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal),N>
+{
+  // Matrix times scalar, entry by entry.
+  // Note: in this overload r names the matrix's contained type and l the scalar's.
+  iMatrix<decltype(lhs._internal[0][0]*rhs._internal),N> product;
+  for(int row=0;row<N;++row){
+    for(int col=0;col<N;++col){
+      mult(&product._internal[row][col],&lhs._internal[row][col],&rhs._internal);
+    }}
+  return product;
+}
+template<class l,class r,int N> accelerator_inline
+auto operator * (const iScalar<l>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal*rhs._internal[0][0]),N>
+{
+  // Scalar times matrix: the same scalar multiplies every entry of rhs
+  iMatrix<decltype(lhs._internal*rhs._internal[0][0]),N> product;
+  for(int row=0;row<N;++row){
+    for(int col=0;col<N;++col){
+      mult(&product._internal[row][col],&lhs._internal,&rhs._internal[row][col]);
+    }}
+  return product;
+}
+template<class l,class r,int N> accelerator_inline
+auto operator * (const iMatrix<l,N>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal[0][0]*rhs._internal[0]),N>
+{
+  // Matrix times vector; per row the k==0 term is written with mult(), the rest added with mac()
+  iVector<decltype(lhs._internal[0][0]*rhs._internal[0]),N> product;
+  for(int row=0;row<N;++row){
+    mult(&product._internal[row],&lhs._internal[row][0],&rhs._internal[0]);
+    for(int k=1;k<N;++k){
+      mac(&product._internal[row],&lhs._internal[row][k],&rhs._internal[k]);
+    }
+  }
+  return product;
+}
+template<class l,class r,int N> accelerator_inline
+auto operator * (const iScalar<l>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal*rhs._internal[0]),N>
+{
+  // Scalar times vector: the same scalar multiplies every component of rhs
+  iVector<decltype(lhs._internal*rhs._internal[0]),N> product;
+  for(int comp=0;comp<N;++comp){
+    mult(&product._internal[comp],&lhs._internal,&rhs._internal[comp]);
+  }
+  return product;
+}
+template<class l,class r,int N> accelerator_inline
+auto operator * (const iVector<l,N>& lhs,const iScalar<r>& rhs) -> iVector<decltype(lhs._internal[0]*rhs._internal),N>
+{
+  // Vector times scalar: every component of lhs is multiplied by the same scalar
+  iVector<decltype(lhs._internal[0]*rhs._internal),N> product;
+  for(int comp=0;comp<N;++comp){
+    mult(&product._internal[comp],&lhs._internal[comp],&rhs._internal);
+  }
+  return product;
+}
+
+NAMESPACE_END(Grid);
+
+
+#endif
--- a/Grid/tensors/Tensor_arith_scalar.h
+++ b/Grid/tensors/Tensor_arith_scalar.h
@@ -0,0 +1,287 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_arith_scalar.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_ARITH_SCALAR_H
+#define GRID_MATH_ARITH_SCALAR_H
+
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Must support native C++ types Integer, Complex, Real
+//////////////////////////////////////////////////////////////////////////////////////////
+
+// multiplication by fundamental scalar type
+template<class l> accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) 
+{
+  typename iScalar<l>::tensor_reduced srhs; srhs=rhs;
+  return lhs*srhs;
+}
+template<class l> accelerator_inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
+
+template<class l,int N> accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) 
+{
+  typename iVector<l,N>::tensor_reduced srhs; srhs=rhs;
+  return lhs*srhs;
+}
+template<class l,int N> accelerator_inline iVector<l,N> operator * (const typename iScalar<l>::scalar_type lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
+
+template<class l,int N> accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type &rhs) 
+{
+  typename iMatrix<l,N>::tensor_reduced srhs; srhs=rhs;
+  return lhs*srhs;
+}
+template<class l,int N> accelerator_inline iMatrix<l,N> operator * (const typename iScalar<l>::scalar_type & lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
+
+////////////////////////////////////////////////////////////////////
+// Double support; converted to "scalar_type" by assignment, then to tensor_reduced
+////////////////////////////////////////////////////////////////////
+template<class l> accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,double rhs) 
+{
+  typename iScalar<l>::scalar_type t; t=rhs;
+  typename iScalar<l>::tensor_reduced srhs;srhs=t;
+  return lhs*srhs;
+}
+template<class l> accelerator_inline iScalar<l> operator * (double lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
+
+template<class l,int N> accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,double rhs) 
+{
+  typename iScalar<l>::scalar_type t;t=rhs;
+  typename iScalar<l>::tensor_reduced srhs;srhs=t;
+  return lhs*srhs;
+}
+template<class l,int N> accelerator_inline iVector<l,N> operator * (double lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
+
+template<class l,int N> accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,double rhs) 
+{
+  typename iScalar<l>::scalar_type t;t=rhs;
+  typename iScalar<l>::tensor_reduced srhs;srhs=t;
+  return lhs*srhs;
+}
+template<class l,int N> accelerator_inline iMatrix<l,N> operator * (double lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
+
+////////////////////////////////////////////////////////////////////
+// Complex support; converted to "scalar_type" by assignment, then to tensor_reduced
+////////////////////////////////////////////////////////////////////
+template<class l> accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,ComplexD rhs) 
+{
+  typename iScalar<l>::scalar_type t;t=rhs;
+  typename iScalar<l>::tensor_reduced srhs;srhs=t;
+  
+  
+  return lhs*srhs;
+}
+template<class l> accelerator_inline iScalar<l> operator * (ComplexD lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
+
+template<class l,int N> accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,ComplexD rhs) 
+{
+  typename iScalar<l>::scalar_type t;t=rhs;
+  typename iScalar<l>::tensor_reduced srhs;srhs=t;
+  return lhs*srhs;
+}
+template<class l,int N> accelerator_inline iVector<l,N> operator * (ComplexD lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
+
+template<class l,int N> accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,ComplexD rhs) 
+{
+  typename iScalar<l>::scalar_type t;t=rhs;
+  typename iScalar<l>::tensor_reduced srhs;srhs=t;
+  return lhs*srhs;
+}
+template<class l,int N> accelerator_inline iMatrix<l,N> operator * (ComplexD lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
+
+////////////////////////////////////////////////////////////////////
+// Integer support; converted to "scalar_type" by assignment, then to tensor_reduced
+////////////////////////////////////////////////////////////////////
+template<class l> accelerator_inline iScalar<l> operator * (const iScalar<l>& lhs,Integer rhs) 
+{
+  typename iScalar<l>::scalar_type t;  t=rhs;
+  typename iScalar<l>::tensor_reduced srhs; srhs=t;
+  return lhs*srhs;
+}
+template<class l> accelerator_inline iScalar<l> operator * (Integer lhs,const iScalar<l>& rhs) {  return rhs*lhs; }
+
+template<class l,int N> accelerator_inline iVector<l,N> operator * (const iVector<l,N>& lhs,Integer rhs) 
+{
+  typename iScalar<l>::scalar_type t;t=rhs;
+  typename iScalar<l>::tensor_reduced srhs;srhs=t;
+  return lhs*srhs;
+}
+template<class l,int N> accelerator_inline iVector<l,N> operator * (Integer lhs,const iVector<l,N>& rhs) {  return rhs*lhs; }
+
+template<class l,int N> accelerator_inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,Integer rhs) 
+{
+  typename iScalar<l>::scalar_type t;t=rhs;
+  typename iScalar<l>::tensor_reduced srhs;srhs=t;
+  return lhs*srhs;
+}
+template<class l,int N> accelerator_inline iMatrix<l,N> operator * (Integer lhs,const iMatrix<l,N>& rhs) {  return rhs*lhs; }
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+// addition of a fundamental scalar type applies to matrix (along the diagonal) and scalar
+///////////////////////////////////////////////////////////////////////////////////////////////
+// iScalar + scalar_type: wrap the bare scalar as tensor_reduced so that the tensor '+' applies.
+template<class l> accelerator_inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) {
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = rhs;
+  return lhs + wrapped;
+}
+template<class l> accelerator_inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {  return rhs+lhs; } // scalar on the left: evaluated as rhs+lhs (operands swapped)
+
+// iMatrix + scalar_type: wrap the bare scalar as the matrix's tensor_reduced type, then add.
+template<class l,int N> accelerator_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) {
+  typename iMatrix<l,N>::tensor_reduced wrapped; wrapped = rhs;
+  return lhs + wrapped;
+}
+template<class l,int N> accelerator_inline iMatrix<l,N> operator + (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; } // scalar on the left: evaluated as rhs+lhs (operands swapped)
+
+////////////////////////////////////////////////////////////////////
+// Double support; cast to "scalar_type" through constructor
+////////////////////////////////////////////////////////////////////
+// iScalar + double: convert the addend to scalar_type, wrap it as tensor_reduced, then add.
+template<class l> accelerator_inline iScalar<l> operator + (const iScalar<l>& lhs,double rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = rhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return lhs + wrapped;
+}
+template<class l> accelerator_inline iScalar<l> operator + (double lhs,const iScalar<l>& rhs) {  return rhs+lhs; } // double on the left: evaluated as rhs+lhs (operands swapped)
+
+// iMatrix + double: convert the addend to scalar_type, wrap it as tensor_reduced, then add.
+template<class l,int N> accelerator_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,double rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = rhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return lhs + wrapped;
+}
+template<class l,int N> accelerator_inline iMatrix<l,N> operator + (double lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; } // double on the left: evaluated as rhs+lhs (operands swapped)
+
+
+// Integer support cast to scalar type through constructor
+
+
+// iScalar + Integer: convert the addend to scalar_type, wrap it as tensor_reduced, then add.
+template<class l> accelerator_inline iScalar<l> operator + (const iScalar<l>& lhs,Integer rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = rhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return lhs + wrapped;
+}
+
+template<class l> accelerator_inline iScalar<l> operator + (Integer lhs,const iScalar<l>& rhs) {  return rhs+lhs; } // Integer on the left: evaluated as rhs+lhs (operands swapped)
+
+// iMatrix + Integer: convert the addend to scalar_type, wrap it as tensor_reduced, then add.
+template<class l,int N> accelerator_inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,Integer rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = rhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return lhs + wrapped;
+}
+template<class l,int N> accelerator_inline iMatrix<l,N> operator + (Integer lhs,const iMatrix<l,N>& rhs) {  return rhs+lhs; } // Integer on the left: evaluated as rhs+lhs (operands swapped)
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+// subtraction of fundamental scalar type applies to matrix(down diag) and scalar
+///////////////////////////////////////////////////////////////////////////////////////////////
+// iScalar - scalar_type: wrap the bare scalar as tensor_reduced so that the tensor '-' applies.
+template<class l> accelerator_inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs) {
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = rhs;
+  return lhs - wrapped;
+}
+// scalar_type - iScalar: subtraction does not commute, so the wrapped scalar is kept as the left operand.
+template<class l> accelerator_inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) {
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = lhs;
+  return wrapped - rhs;
+}
+
+// iMatrix - scalar_type: wrap the bare scalar as tensor_reduced so that the tensor '-' applies.
+template<class l,int N> accelerator_inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs) {
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = rhs;
+  return lhs - wrapped;
+}
+// scalar_type - iMatrix: subtraction does not commute, so the wrapped scalar is kept as the left operand.
+template<class l,int N> accelerator_inline iMatrix<l,N> operator - (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) {
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = lhs;
+  return wrapped - rhs;
+}
+
+////////////////////////////////////////////////////////////////////
+// Double support; cast to "scalar_type" through constructor
+////////////////////////////////////////////////////////////////////
+// iScalar - double: convert the subtrahend to scalar_type, wrap it as tensor_reduced, then subtract.
+template<class l> accelerator_inline iScalar<l> operator - (const iScalar<l>& lhs,double rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = rhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return lhs - wrapped;
+}
+// double - iScalar: construct a scalar_type from the double, wrap it, and use it as the left operand.
+template<class l> accelerator_inline iScalar<l> operator - (double lhs,const iScalar<l>& rhs) {
+  typename iScalar<l>::scalar_type asScalar(lhs);
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return wrapped - rhs;
+}
+
+// iMatrix - double: convert the subtrahend to scalar_type, wrap it as tensor_reduced, then subtract.
+template<class l,int N> accelerator_inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,double rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = rhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return lhs - wrapped;
+}
+// double - iMatrix: construct a scalar_type from the double, wrap it, and use it as the left operand.
+template<class l,int N> accelerator_inline iMatrix<l,N> operator - (double lhs,const iMatrix<l,N>& rhs) {
+  typename iScalar<l>::scalar_type asScalar(lhs);
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return wrapped - rhs;
+}
+
+////////////////////////////////////////////////////////////////////
+// Integer support; cast to "scalar_type" through constructor
+////////////////////////////////////////////////////////////////////
+// iScalar - Integer: convert the subtrahend to scalar_type, wrap it as tensor_reduced, then subtract.
+template<class l> accelerator_inline iScalar<l> operator - (const iScalar<l>& lhs,Integer rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = rhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return lhs - wrapped;
+}
+// Integer - iScalar: convert the Integer to scalar_type, wrap it, and use it as the left operand.
+template<class l> accelerator_inline iScalar<l> operator - (Integer lhs,const iScalar<l>& rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = lhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return wrapped - rhs;
+}
+// iMatrix - Integer: convert the subtrahend to scalar_type, wrap it as tensor_reduced, then subtract.
+template<class l,int N> accelerator_inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,Integer rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = rhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return lhs - wrapped;
+}
+// Integer - iMatrix: convert the Integer to scalar_type, wrap it, and use it as the left operand.
+template<class l,int N> accelerator_inline iMatrix<l,N> operator - (Integer lhs,const iMatrix<l,N>& rhs) {
+  typename iScalar<l>::scalar_type asScalar; asScalar = lhs;
+  typename iScalar<l>::tensor_reduced wrapped; wrapped = asScalar;
+  return wrapped - rhs;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_arith_sub.h
+++ b/Grid/tensors/Tensor_arith_sub.h
@@ -0,0 +1,142 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_arith_sub.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_ARITH_SUB_H
+#define GRID_MATH_ARITH_SUB_H
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////// SUB         ///////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+    
+
+// SUB is simple for now; cannot mix types and straightforward template
+// Scalar +/- Scalar
+// Vector +/- Vector
+// Matrix +/- Matrix
+// Matrix +/- Scalar
+template<class vtype,class ltype,class rtype> accelerator_inline void sub(iScalar<vtype> * __restrict__ ret,
+								     const iScalar<ltype> * __restrict__ lhs,
+								     const iScalar<rtype> * __restrict__ rhs)
+{ // Scalar - Scalar: delegates to sub() on the three wrapped _internal members; the result is written through ret.
+  sub(&ret->_internal,&lhs->_internal,&rhs->_internal);
+}
+
+template<class vtype,class ltype,class rtype,int N> accelerator_inline void sub(iVector<vtype,N> * __restrict__ ret,
+									   const iVector<ltype,N> * __restrict__ lhs,
+									   const iVector<rtype,N> * __restrict__ rhs)
+{
+  // Vector - Vector: component idx of ret is lhs[idx] - rhs[idx], using the element type's own operator-.
+  for(int idx=0;idx<N;idx++){
+    ret->_internal[idx] = lhs->_internal[idx] - rhs->_internal[idx];
+  }
+}
+template<class vtype,class ltype,class rtype, int N> accelerator_inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+									    const iMatrix<ltype,N> * __restrict__ lhs,
+									    const iMatrix<rtype,N> * __restrict__ rhs){
+  // Matrix - Matrix: element-wise sub(); the second index runs in the outer loop, the first in the inner loop.
+  for(int j=0;j<N;j++){
+    for(int i=0;i<N;i++){
+      sub(&ret->_internal[i][j],&lhs->_internal[i][j],&rhs->_internal[i][j]);
+    }}
+}
+template<class vtype,class ltype,class rtype, int N> accelerator_inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+									    const iScalar<ltype> * __restrict__ lhs,
+									    const iMatrix<rtype,N> * __restrict__ rhs){
+  // Scalar - Matrix: the scalar only contributes on the diagonal, where the entry is lhs - rhs[i][i].
+  // Off the diagonal the entry is 0 - rhs[i][j], formed by zeroing the slot and then subtracting.
+  for(int j=0;j<N;j++){
+    for(int i=0;i<N;i++){
+      auto &slot = ret->_internal[i][j];
+      if ( i!=j ) {
+	slot=Zero();
+	slot=slot-rhs->_internal[i][j];
+      } else {
+	sub(&slot,&lhs->_internal,&rhs->_internal[i][j]);
+      }
+    }}
+}
+template<class vtype,class ltype,class rtype, int N> accelerator_inline void sub(iMatrix<vtype,N> * __restrict__ ret,
+									    const iMatrix<ltype,N> * __restrict__ lhs,
+									    const iScalar<rtype> * __restrict__ rhs){
+  // Matrix - Scalar: the scalar is subtracted on the diagonal only (via sub());
+  // off-diagonal entries of ret are plain copies of the corresponding lhs entries.
+  for(int j=0;j<N;j++){
+    for(int i=0;i<N;i++){
+      if ( i!=j ) ret->_internal[i][j]=lhs->_internal[i][j];
+      else        sub(&ret->_internal[i][j],&lhs->_internal[i][j],&rhs->_internal);
+    }}
+  return;
+}
+
+// - operator for scalar, vector, matrix
+// Scalar - Scalar: the result element type is whatever (ltype - rtype) yields; the work is done by sub().
+template<class ltype,class rtype> accelerator_inline auto
+operator - (const iScalar<ltype>& lhs, const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal - rhs._internal)>
+{
+  iScalar<decltype(lhs._internal-rhs._internal)> diff;
+  sub(&diff,&lhs,&rhs);
+  return diff;
+}
+// Vector - Vector: the result element type is deduced from element subtraction; the work is done by sub().
+template<class ltype,class rtype,int N>
+accelerator_inline auto operator - (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) ->iVector<decltype(lhs._internal[0]-rhs._internal[0]),N>
+{
+  iVector<decltype(lhs._internal[0]-rhs._internal[0]),N> diff;
+  sub(&diff,&lhs,&rhs);
+  return diff;
+}
+// Matrix - Matrix: the result element type is deduced from element subtraction; the work is done by sub().
+template<class ltype,class rtype,int N>
+accelerator_inline auto operator - (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) ->iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N>
+{
+  iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N> diff;
+  sub(&diff,&lhs,&rhs);
+  return diff;
+}
+// Scalar - Matrix: yields a matrix; forwards to the (iMatrix, iScalar, iMatrix) overload of sub().
+template<class ltype,class rtype,int N>
+accelerator_inline auto operator - (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs)->iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N>
+{
+  iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N> diff;
+  sub(&diff,&lhs,&rhs);
+  return diff;
+}
+// Matrix - Scalar: yields a matrix; forwards to the (iMatrix, iMatrix, iScalar) overload of sub().
+template<class ltype,class rtype,int N>
+accelerator_inline auto operator - (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs)->iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N>
+{
+  iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N> diff;
+  sub(&diff,&lhs,&rhs);
+  return diff;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_class.h
+++ b/Grid/tensors/Tensor_class.h
@@ -0,0 +1,448 @@
+/*************************************************************************************
+Grid physics library, www.github.com/paboyle/Grid
+Source file: ./lib/tensors/Tensor_class.h
+Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+See the full license in the file "LICENSE" in the top level distribution
+directory
+*************************************************************************************/
+			   /*  END LEGAL */
+#ifndef GRID_MATH_TENSORS_H
+#define GRID_MATH_TENSORS_H
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////
+// Scalar, Vector, Matrix objects.
+// These can be composed to form tensor products of internal indices.
+///////////////////////////////////////////////////
+
+// It is useful to NOT have any constructors
+// so that these classes assert "is_pod<class> == true"
+// because then the standard C++ valarray container eliminates fill overhead on
+// new allocation and
+// non-move copying.
+//
+// However note that doing this eliminates some syntactical sugar such as
+// calling the constructor explicitly or implicitly
+//
+class GridTensorBase {};  // empty tag class; none of the tensor classes declared in this header derive from it
+
+template <class vtype>
+class iScalar {  // Tensor wrapper holding exactly one element of type vtype.
+public:
+  vtype _internal;  // the single wrapped element
+
+  typedef vtype element;
+  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
+  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+  typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
+  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
+  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
+  typedef iScalar<tensor_reduced_v> tensor_reduced;
+  typedef iScalar<recurse_scalar_object> scalar_object;
+  // substitutes a real or complex version with same tensor structure
+  typedef iScalar<typename GridTypeMapper<vtype>::Complexified> Complexified;
+  typedef iScalar<typename GridTypeMapper<vtype>::Realified> Realified;
+
+  // get double precision version
+  typedef iScalar<typename GridTypeMapper<vtype>::DoublePrecision> DoublePrecision;
+  
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };  // one more than the wrapped type's level
+
+  // Scalar no action: the default constructor is defaulted
+  accelerator iScalar() = default;
+
+  friend accelerator_inline void zeroit(iScalar<vtype> &that){
+    zeroit(that._internal);
+  }
+
+  accelerator_inline iScalar(scalar_type s) : _internal(s){};  // recurse down and hit the constructor for vector_type
+
+  accelerator_inline iScalar(const Zero &z) { zeroit(*this); };
+
+  accelerator_inline iScalar<vtype> &operator=(const Zero &hero) {
+    zeroit(*this);  return *this;
+  }
+  friend accelerator_inline void vstream(iScalar<vtype> &out, const iScalar<vtype> &in) {
+    vstream(out._internal, in._internal);
+  }
+  friend accelerator_inline void vbroadcast(iScalar<vtype> &out,const iScalar<vtype> &in,int lane){
+    vbroadcast(out._internal,in._internal,lane);
+  }
+  friend accelerator_inline void prefetch(iScalar<vtype> &that) {
+    prefetch(that._internal);
+  }
+  friend accelerator_inline void permute(iScalar<vtype> &out, const iScalar<vtype> &in, int permutetype) {
+    permute(out._internal, in._internal, permutetype);
+  }
+  friend accelerator_inline void rotate(iScalar<vtype> &out,const iScalar<vtype> &in,int rot){
+    rotate(out._internal,in._internal,rot);
+  }
+  friend accelerator_inline void exchange(iScalar<vtype> &out1,iScalar<vtype> &out2,
+				     const iScalar<vtype> &in1,const iScalar<vtype> &in2,int type)
+  {
+    exchange(out1._internal,out2._internal,in1._internal, in2._internal,type);
+  }
+
+  // Unary negation
+  friend accelerator_inline iScalar<vtype> operator-(const iScalar<vtype> &r) {
+    iScalar<vtype> ret;
+    ret._internal = -r._internal;
+    return ret;
+  }
+  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
+  accelerator_inline iScalar<vtype> &operator*=(const iScalar<vtype> &r) {
+    *this = (*this) * r;
+    return *this;
+  }
+  accelerator_inline iScalar<vtype> &operator-=(const iScalar<vtype> &r) {
+    *this = (*this) - r;
+    return *this;
+  }
+  accelerator_inline iScalar<vtype> &operator+=(const iScalar<vtype> &r) {
+    *this = (*this) + r;
+    return *this;
+  }
+  accelerator_inline vtype &operator()(void) { return _internal; }
+  accelerator_inline const vtype &operator()(void) const { return _internal; }
+
+  // Type casts meta programmed, must be pure scalar to match TensorRemove
+  // (each conversion is gated by the IfComplex / IfReal / IfInteger and IfNotSimd template parameters)
+  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0> accelerator_inline
+  operator ComplexF() const {
+    return (TensorRemove(_internal));
+  }
+  template <class U = vtype, class V = scalar_type, IfComplex<V> = 0, IfNotSimd<U> = 0> accelerator_inline
+  operator ComplexD() const {
+    return (TensorRemove(_internal));
+  }
+  template <class U = vtype, class V = scalar_type, IfReal<V> = 0,IfNotSimd<U> = 0> accelerator_inline
+  operator RealD() const {
+    return TensorRemove(_internal);
+  }
+  template <class U = vtype, class V = scalar_type, IfInteger<V> = 0, IfNotSimd<U> = 0> accelerator_inline
+  operator Integer() const {
+    return Integer(TensorRemove(_internal));
+  }
+
+  // convert from a something to a scalar via constructor of something arg
+  // (only for T that is not a Grid tensor; note the result is returned by value, not by reference)
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
+  accelerator_inline iScalar<vtype> operator=(T arg) {
+    _internal = arg;
+    return *this;
+  }
+
+  // Convert elements: assign from an rvalue iScalar with a possibly different element type
+  template <class ttype>
+  accelerator_inline iScalar<vtype> operator=(iScalar<ttype> &&arg) {
+    _internal = arg._internal;
+    return *this;
+  }
+
+  // Host only: prints the element as S {<element>}
+  friend std::ostream &operator<<(std::ostream &stream,const iScalar<vtype> &o) {
+    stream << "S {" << o._internal << "}";
+    return stream;
+  };
+};
+
+///////////////////////////////////////////////////////////
+// Allows turning scalar<scalar<scalar<double>>> back into double.
+///////////////////////////////////////////////////////////
+template <class T>
+accelerator_inline typename std::enable_if<!isGridTensor<T>::value, T>::type
+TensorRemove(T arg) {
+  return arg;  // recursion terminator: T is not a Grid tensor (per isGridTensor), so it is returned unchanged
+}
+template <class vtype>
+accelerator_inline auto TensorRemove(iScalar<vtype> arg)
+  -> decltype(TensorRemove(arg._internal)) {
+  return TensorRemove(arg._internal);  // peel one iScalar layer and recurse on the wrapped element
+}
+
+template <class vtype, int N>
+class iVector {  // Tensor holding N elements of type vtype, stored as _internal[N].
+public:
+  vtype _internal[N];
+
+  typedef vtype element;
+  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
+  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+  typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
+  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
+  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
+  typedef iScalar<tensor_reduced_v> tensor_reduced;
+  typedef iVector<recurse_scalar_object, N> scalar_object;
+
+  // substitutes a real or complex version with same tensor structure
+  typedef iVector<typename GridTypeMapper<vtype>::Complexified, N> Complexified;
+  typedef iVector<typename GridTypeMapper<vtype>::Realified, N> Realified;
+
+  // get double precision version
+  typedef iVector<typename GridTypeMapper<vtype>::DoublePrecision, N> DoublePrecision;
+  // Assignment from a non-tensor value: every component is set to arg; the result is returned by value.
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
+  accelerator_inline auto operator=(T arg) -> iVector<vtype, N> {
+    zeroit(*this);
+    for (int i = 0; i < N; i++) _internal[i] = arg;
+    return *this;
+  }
+
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };  // one more than the element type's level
+  accelerator_inline iVector(const Zero &z) { zeroit(*this); };
+  accelerator iVector() = default;
+
+  accelerator_inline iVector<vtype, N> &operator=(const Zero &hero) {
+    zeroit(*this);
+    return *this;
+  }
+  friend accelerator_inline void zeroit(iVector<vtype, N> &that) {
+    for (int i = 0; i < N; i++) {
+      zeroit(that._internal[i]);
+    }
+  }
+  friend accelerator_inline void prefetch(iVector<vtype, N> &that) {
+    for (int i = 0; i < N; i++) prefetch(that._internal[i]);
+  }
+  friend accelerator_inline void vstream(iVector<vtype, N> &out, const iVector<vtype, N> &in) {
+    for (int i = 0; i < N; i++) {
+      vstream(out._internal[i], in._internal[i]);
+    }
+  }
+  friend accelerator_inline void vbroadcast(iVector<vtype,N> &out,const iVector<vtype,N> &in,int lane){
+    for(int i=0;i<N;i++){
+      vbroadcast(out._internal[i],in._internal[i],lane);
+    }
+  }
+  friend accelerator_inline void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
+    for(int i=0;i<N;i++){
+      permute(out._internal[i],in._internal[i],permutetype);
+    }
+  }
+  friend accelerator_inline void rotate(iVector<vtype,N> &out,const iVector<vtype,N> &in,int rot){
+    for(int i=0;i<N;i++){
+      rotate(out._internal[i],in._internal[i],rot);
+    }
+  }
+  friend accelerator_inline void exchange(iVector<vtype,N> &out1,iVector<vtype,N> &out2,
+				     const iVector<vtype,N> &in1,const iVector<vtype,N> &in2,int type){
+    for(int i=0;i<N;i++){
+      exchange(out1._internal[i],out2._internal[i],in1._internal[i], in2._internal[i],type);
+    }
+  }
+
+  // Unary negation, applied component by component
+  friend accelerator_inline iVector<vtype, N> operator-(const iVector<vtype, N> &r) {
+    iVector<vtype, N> ret;
+    for (int i = 0; i < N; i++) ret._internal[i] = -r._internal[i];
+    return ret;
+  }
+  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
+  accelerator_inline iVector<vtype, N> &operator*=(const iScalar<vtype> &r) {
+    *this = (*this) * r;
+    return *this;
+  }
+  accelerator_inline iVector<vtype, N> &operator-=(const iVector<vtype, N> &r) {
+    *this = (*this) - r;
+    return *this;
+  }
+  accelerator_inline iVector<vtype, N> &operator+=(const iVector<vtype, N> &r) {
+    *this = (*this) + r;
+    return *this;
+  }
+  accelerator_inline vtype &operator()(int i) { return _internal[i]; }
+  accelerator_inline const vtype &operator()(int i) const { return _internal[i]; }
+
+  // Host: prints V<N>{e0,e1,...} with commas between components and none after the last
+  friend std::ostream &operator<<(std::ostream &stream, const iVector<vtype, N> &o) {
+    stream << "V<" << N << ">{";
+    for (int i = 0; i < N; i++) {
+      stream << o._internal[i];
+      if (i < N - 1) stream << ",";
+    }
+    stream << "}";
+    return stream;
+  };
+};
+
+template <class vtype, int N>
+class iMatrix {  // N x N tensor of vtype elements, stored as _internal[N][N].
+public:
+  vtype _internal[N][N];
+
+  typedef vtype element;
+  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
+  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+  typedef typename GridTypeMapper<vtype>::vector_typeD vector_typeD;
+  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
+  typedef typename GridTypeMapper<vtype>::scalar_object recurse_scalar_object;
+
+  // substitutes a real or complex version with same tensor structure
+  typedef iMatrix<typename GridTypeMapper<vtype>::Complexified, N> Complexified;
+  typedef iMatrix<typename GridTypeMapper<vtype>::Realified, N> Realified;
+
+  // get double precision version
+  typedef iMatrix<typename GridTypeMapper<vtype>::DoublePrecision, N> DoublePrecision;
+  
+  // Tensor removal
+  typedef iScalar<tensor_reduced_v> tensor_reduced;
+  typedef iMatrix<recurse_scalar_object, N> scalar_object;
+
+  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };  // one more than the element type's level
+
+  accelerator_inline iMatrix(const Zero &z) { zeroit(*this); };
+  accelerator iMatrix() = default;
+
+  // Copy assignment: every element is copied with vstream()
+  accelerator_inline iMatrix &operator=(const iMatrix &rhs) {
+    for (int i = 0; i < N; i++)
+      for (int j = 0; j < N; j++) 
+	vstream(_internal[i][j], rhs._internal[i][j]);
+    return *this;
+  };
+  // Construct from a scalar_type through the non-tensor assignment below (s on the diagonal, zero elsewhere)
+  accelerator_inline iMatrix(scalar_type s) {
+    (*this) = s;
+  };  // recurse down and hit the constructor for vector_type
+
+  accelerator_inline iMatrix<vtype, N> &operator=(const Zero &hero) {
+    zeroit(*this);
+    return *this;
+  }
+  template <class T, typename std::enable_if<!isGridTensor<T>::value, T>::type * = nullptr>
+  accelerator_inline auto operator=(T arg) -> iMatrix<vtype, N> {
+    zeroit(*this);  // off-diagonal elements stay zero
+    for (int i = 0; i < N; i++) _internal[i][i] = arg;
+    return *this;
+  }
+
+  friend accelerator_inline void zeroit(iMatrix<vtype,N> &that){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	zeroit(that._internal[i][j]);
+    }}
+  }
+  friend accelerator_inline void prefetch(iMatrix<vtype,N> &that){
+    for(int i=0;i<N;i++) {
+      for(int j=0;j<N;j++) { 
+	prefetch(that._internal[i][j]);
+    }}
+  }
+  friend accelerator_inline void vstream(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	vstream(out._internal[i][j],in._internal[i][j]);
+    }}
+  }
+  friend accelerator_inline void vbroadcast(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int lane){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	vbroadcast(out._internal[i][j],in._internal[i][j],lane);
+    }}
+  }
+
+  friend accelerator_inline void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	permute(out._internal[i][j],in._internal[i][j],permutetype);
+    }}
+  }
+  friend accelerator_inline void rotate(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int rot){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	rotate(out._internal[i][j],in._internal[i][j],rot);
+    }}
+  }
+  friend accelerator_inline void exchange(iMatrix<vtype,N> &out1,iMatrix<vtype,N> &out2,
+					  const iMatrix<vtype,N> &in1,const iMatrix<vtype,N> &in2,int type){
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	exchange(out1._internal[i][j],out2._internal[i][j],in1._internal[i][j], in2._internal[i][j],type);
+    }}
+  }
+  
+  // Unary negation, applied element by element
+  friend accelerator_inline iMatrix<vtype, N> operator-(const iMatrix<vtype, N> &r) {
+    iMatrix<vtype, N> ret;
+    for (int i = 0; i < N; i++) {
+      for (int j = 0; j < N; j++) {
+	ret._internal[i][j] = -r._internal[i][j];
+    }}
+    return ret;
+  }
+  // *=,+=,-= operators inherit from corresponding "*,-,+" behaviour
+  template <class T>
+  accelerator_inline iMatrix<vtype, N> &operator*=(const T &r) {
+    *this = (*this) * r;
+    return *this;
+  }
+  template <class T>
+  accelerator_inline iMatrix<vtype, N> &operator-=(const T &r) {
+    *this = (*this) - r;
+    return *this;
+  }
+  template <class T>
+  accelerator_inline iMatrix<vtype, N> &operator+=(const T &r) {
+    *this = (*this) + r;
+    return *this;
+  }
+
+  // returns an lvalue reference
+  accelerator_inline vtype &operator()(int i, int j) { return _internal[i][j]; }
+  accelerator_inline const vtype &operator()(int i, int j) const {
+    return _internal[i][j];
+  }
+  // Host function only. Prints M<N>{{..}{..}}: each row is braced, rows are separated by "\n\t\t",
+  // and commas separate the elements within a row (none after the last element of a row).
+  friend std::ostream &operator<<(std::ostream &stream, const iMatrix<vtype, N> &o) {
+    stream << "M<" << N << ">{";
+    for (int i = 0; i < N; i++) {
+      stream << "{";
+      for (int j = 0; j < N; j++) {
+	stream << o._internal[i][j];
+	if (j < N - 1) stream << ",";  // separator is decided by the column index, not the row index
+      }
+      stream << "}";
+      if (i != N - 1) stream << "\n\t\t";
+    }
+    stream << "}";
+    return stream;
+  };
+
+};
+
+template <class v> accelerator_inline
+void vprefetch(const iScalar<v> &vv) {
+  vprefetch(vv._internal);  // forward to vprefetch for the wrapped element
+}
+template <class v, int N> accelerator_inline
+void vprefetch(const iVector<v, N> &vv) {
+  // Issue vprefetch for each of the N components, in index order.
+  for (int idx = 0; idx < N; ++idx)
+    vprefetch(vv._internal[idx]);
+}
+template <class v, int N> accelerator_inline
+void vprefetch(const iMatrix<v, N> &vv) {
+  // Issue vprefetch for all N*N elements; the first index runs in the outer loop.
+  for (int first = 0; first < N; ++first) {
+    for (int second = 0; second < N; ++second)
+      vprefetch(vv._internal[first][second]);
+  }
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_determinant.h
+++ b/Grid/tensors/Tensor_determinant.h
@@ -0,0 +1,73 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_determinant.h
+
+    Copyright (C) 2015
+
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_DET_H
+#define GRID_MATH_DET_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////// 
+// Determinant function for base types, scalar and matrix
+/////////////////////////////////////////////// 
+accelerator_inline ComplexF Determinant( const ComplexF &arg){    return arg;} // base case: a bare number is returned as its own determinant
+accelerator_inline ComplexD Determinant( const ComplexD &arg){    return arg;} // same, double-precision complex
+accelerator_inline RealF Determinant( const RealF &arg){    return arg;} // same, single-precision real
+accelerator_inline RealD Determinant( const RealD &arg){    return arg;} // same, double-precision real
+
+// iScalar is transparent to Determinant: recurse into _internal and re-wrap the result in an iScalar.
+template<class vtype> accelerator_inline auto Determinant(const iScalar<vtype>&r) -> iScalar<decltype(Determinant(r._internal))> {
+  iScalar<decltype(Determinant(r._internal))> wrapped;
+  wrapped._internal = Determinant(r._internal);
+  return wrapped;
+}
+
+template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
+accelerator_inline iScalar<vtype> Determinant(const iMatrix<vtype,N> &arg)
+{
+  // Determinant by elimination on a working copy: reduce to upper-triangular form, then
+  // multiply the diagonal. There is no pivoting and no zero-pivot check, so a zero
+  // diagonal entry ends up as the divisor in the ratio below.
+  iMatrix<vtype,N> work(arg);
+  iScalar<vtype> det = vtype(1.0);
+  for(int piv = 0; piv < N; piv++){
+    // Only the rows below the pivot row are updated.
+    for(int row = piv+1; row < N; row++){
+      vtype ratio = work._internal[row][piv]/work._internal[piv][piv];
+      for(int k = 0; k < N; k++){
+	work._internal[row][k] -= ratio * work._internal[piv][k];
+      }
+    }
+  }
+  // Product of the diagonal of the triangularised copy.
+  for(int d = 0; d < N; d++)
+    det *= work._internal[d][d];
+  return det;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_exp.h
+++ b/Grid/tensors/Tensor_exp.h
@@ -0,0 +1,141 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_exp.h
+
+    Copyright (C) 2015
+
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_EXP_H
+#define GRID_MATH_EXP_H
+
+#define DEFAULT_MAT_EXP 12
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////// 
+// Exponentiate function for scalar, vector, matrix
+/////////////////////////////////////////////// 
+
+
+// iScalar wrapper: exponentiate the wrapped value with the same alpha and Nexp, then re-wrap the result.
+template<class vtype> accelerator_inline iScalar<vtype> Exponentiate(const iScalar<vtype>&r, RealD alpha ,  Integer Nexp = DEFAULT_MAT_EXP) {
+  iScalar<vtype> wrapped;
+  wrapped._internal = Exponentiate(r._internal, alpha, Nexp);
+  return wrapped;
+}
+
+template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(const iVector<vtype,N>&r, RealD alpha ,  Integer Nexp = DEFAULT_MAT_EXP)
+{
+  iVector<vtype, N> out;
+  // iVector wrapper: each of the N components is exponentiated on its own, with alpha and Nexp forwarded
+  for (int comp = 0; comp < N; ++comp) out._internal[comp] = Exponentiate(r._internal[comp], alpha, Nexp);
+  return out;
+}
+
+
+
+// Specialisation: Cayley-Hamilton exponential for SU(3)
+template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr> 
+accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
+{
+  // Closed-form exp(alpha*arg) for 3x3 matrices: the result is assembled from 1, alpha*arg and (alpha*arg)^2
+  // with scalar coefficients f0, f1, f2, so no power series is summed and Nexp is not used in this overload.
+  // Stated to be 2x faster for SU(3) than the general implementation using Nexp=12.
+  // It computes exp( input matrix ) as given: the factor of i is supplied by the caller,
+  // i.e. the input matrix is expected to be anti-hermitian, NOT hermitian.
+  typedef iMatrix<vtype,3> mat;
+  typedef iScalar<vtype> scalar;
+  mat unit(1.0);
+  const Complex one_over_three = 1.0 / 3.0;
+  const Complex one_over_two = 1.0 / 2.0;
+  // Working scalars, all of type iScalar<vtype>.
+  scalar c0, c1, tmp, c0max, theta, u, w;
+  scalar xi0, u2, w2, cosw;
+  scalar fden, h0, h1, h2;
+  scalar e2iu, emiu, ixi0;
+  scalar f0, f1, f2;
+  scalar unity(1.0);
+  // Second and third powers of the scaled input alpha*arg.
+  mat iQ2 = arg*arg*alpha*alpha;
+  mat iQ3 = arg*iQ2*alpha;   
+  // sign in c0 from the conventions on the Ta
+  scalar imQ3, reQ2;
+  imQ3 = imag( trace(iQ3) );
+  reQ2 = real( trace(iQ2) );
+  c0 = -imQ3 * one_over_three;  
+  c1 = -reQ2 * one_over_two;
+
+  // Cayley Hamilton checks to machine precision, tested
+  tmp = c1 * one_over_three;
+  c0max = 2.0 * pow(tmp, 1.5);
+  // NOTE(review): c0max is used as a divisor with no guard; a zero input gives c1 == 0 and hence 0/0 here - confirm callers avoid it.
+  theta = acos(c0 / c0max) * one_over_three;
+  u = sqrt(tmp) * cos(theta);
+  w = sqrt(c1) * sin(theta);
+  // NOTE(review): sin(w)/w has no small-w branch, so w == 0 divides by zero - confirm whether a series fallback is needed.
+  xi0 = sin(w) / w;
+  u2 = u * u;
+  w2 = w * w;
+  cosw = cos(w);
+  // Phase factors (taking timesI(x) to mean i*x): ixi0 = i*xi0, emiu = exp(-i u), e2iu = exp(2 i u).
+  ixi0 = timesI(xi0);
+  emiu = cos(u) - timesI(sin(u));
+  e2iu = cos(2.0 * u) + timesI(sin(2.0 * u));
+  // NOTE(review): h0,h1,h2 / f0,f1,f2 look like the Morningstar-Peardon analytic exponential coefficients - confirm against the paper.
+  h0 = e2iu * (u2 - w2) +
+    emiu * ((8.0 * u2 * cosw) + (2.0 * u * (3.0 * u2 + w2) * ixi0));
+  h1 = e2iu * (2.0 * u) - emiu * ((2.0 * u * cosw) - (3.0 * u2 - w2) * ixi0);
+  h2 = e2iu - emiu * (cosw + (3.0 * u) * ixi0);
+  // Common denominator shared by the three coefficients; no guard for 9*u2 == w2.
+  fden = unity / (9.0 * u2 - w2);  // reals
+  f0 = h0 * fden;
+  f1 = h1 * fden;
+  f2 = h2 * fden;
+  // Combine: f0 * identity, timesMinusI(f1) * (alpha*arg), minus f2 * (alpha*arg)^2.
+  return (f0 * unit + timesMinusI(f1) * arg*alpha - f2 * iQ2);
+}
+
+
+
+// General exponential
+template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr> 
+accelerator_inline iMatrix<vtype,N> Exponentiate(const iMatrix<vtype,N> &arg, RealD alpha  , Integer Nexp = DEFAULT_MAT_EXP )
+{
+  // Truncated power series for exp(alpha*arg), keeping terms up to order Nexp.
+  // It is evaluated innermost-first (nested form), so each pass costs one
+  // scalar rescaling of the accumulator and one matrix product with arg.
+  // This is exp( input matrix ) as given: any factor of i comes from the caller,
+  // i.e. the input matrix is expected to be anti-hermitian, NOT hermitian.
+  typedef iMatrix<vtype,N> mat;
+  mat identity(1.0);
+  mat acc(identity);
+  for(int order=Nexp; order>=1; --order){
+    acc *= alpha/RealD(order);
+    acc = identity + acc*arg;
+  }
+  return acc;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@@ -0,0 +1,212 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_extract_merge.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once 
+
+#include <string.h>
+
+//#pragma GCC optimize("no-strict-aliasing")
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////////
+// Generic extract/merge/permute
+/////////////////////////////////////////////////////////////////
+
+template<class Tp> using ExtractPointerArray = AcceleratorVector<Tp *,GRID_MAX_SIMD>; // AcceleratorVector of pointers to Tp; parameter renamed from a reserved double-underscore name
+template<class Tp> using ExtractBuffer       = AcceleratorVector<Tp  ,GRID_MAX_SIMD>; // AcceleratorVector of Tp values; parameter renamed from a reserved double-underscore name
+
+//void extract(const vobj &vec,ExtractBuffer<typename vobj::scalar_object> &extracted);
+//void extract(const vobj &vec,ExtractPointerArray<sobj> &extracted, int offset);
+//void   merge(vobj &vec,ExtractBuffer<typename vobj::scalar_object> &extracted)
+//void   merge(vobj &vec,ExtractPointerArray<typename vobj::scalar_object> &extracted)
+
+////////////////////////////////////////////////////////////////////////
+// Extract to contiguous array scalar object
+////////////////////////////////////////////////////////////////////////
+template<class vobj,class sobj> accelerator
+void extract(const vobj &vec,ExtractBuffer<sobj> &extracted)
+{
+  // Copy SIMD lanes of vec into the scalar objects of extracted: lane i*stride of word w lands in word w of object i.
+  // Every value goes through an assignment from the vobj scalar type to the sobj scalar type on the way.
+  typedef typename GridTypeMapper<sobj>::scalar_type dst_scalar;
+  typedef typename GridTypeMapper<vobj>::scalar_type src_scalar;
+  typedef typename GridTypeMapper<vobj>::vector_type simd_type;
+  const int words =sizeof(vobj)/sizeof(simd_type);
+  const int Nsimd =simd_type::Nsimd();
+  const int Nextr =extracted.size();
+  const int stride=Nsimd/Nextr;
+  dst_scalar *dst = (dst_scalar *) &extracted[0];
+  src_scalar *src = (src_scalar *)&vec;
+  src_scalar lane_val;
+  dst_scalar out_val;
+  for(int w=0;w<words;w++){
+    for(int obj=0;obj<Nextr;obj++){
+      memcpy((char *)&lane_val,(char *)&src[w*Nsimd+obj*stride],sizeof(lane_val));
+      out_val = lane_val;
+      memcpy((char *)&dst[obj*words+w],(char *)&out_val,sizeof(out_val));
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////
+// Merge a contiguous array of scalar objects
+////////////////////////////////////////////////////////////////////////
+template<class vobj,class sobj> accelerator
+void   merge(vobj &vec,ExtractBuffer<sobj> &extracted)
+{
+  // Fill the SIMD lanes of vec from the scalar objects in extracted: word w of object i is written to
+  // the rep = Nsimd/Nextr consecutive lanes starting at i*rep, after assignment to the vobj scalar type.
+  typedef typename GridTypeMapper<sobj>::scalar_type src_scalar;
+  typedef typename GridTypeMapper<vobj>::scalar_type dst_scalar;
+  typedef typename GridTypeMapper<vobj>::vector_type simd_type;
+  const int words=sizeof(vobj)/sizeof(simd_type);
+  const int Nsimd=simd_type::Nsimd();
+  const int Nextr=extracted.size();
+  const int rep  =Nsimd/Nextr;
+  src_scalar *src = (src_scalar *)&extracted[0];
+  dst_scalar *dst = (dst_scalar *)&vec;
+  dst_scalar lane_val;
+  src_scalar in_val;
+  for(int w=0;w<words;w++){
+    for(int obj=0;obj<Nextr;obj++){
+      for(int copy=0;copy<rep;copy++){
+        memcpy((char *)&in_val,(char *)&src[obj*words+w],sizeof(in_val));
+        lane_val = in_val;
+        memcpy((char *)&dst[w*Nsimd+obj*rep+copy],(char *)&lane_val,sizeof(lane_val));
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////
+// Extract/Insert a single lane  
+////////////////////////////////////////////////////////////////////////
+template<class vobj> accelerator_inline
+typename vobj::scalar_object extractLane(int lane, const vobj & __restrict__ vec)
+{
+  // Pull one SIMD lane out of vec: word w of the result is element w*Nsimd+lane of vec,
+  // with both objects viewed as flat arrays of the extract_type mapped from the scalar type.
+  using scalar_type   = typename vobj::scalar_type;
+  using scalar_object = typename vobj::scalar_object;
+  using vector_type   = typename vobj::vector_type;
+  using extract_type  = typename ExtractTypeMap<scalar_type>::extract_type;
+  using lane_ptr      = extract_type *;
+  constexpr int words=sizeof(vobj)/sizeof(vector_type);
+  constexpr int Nsimd=vector_type::Nsimd();
+  scalar_object lane_obj;
+  lane_ptr __restrict__  out = (lane_ptr)&lane_obj; // Type pun
+  lane_ptr __restrict__  in  = (lane_ptr)&vec;
+  for(int w=0;w<words;++w){
+    out[w]=in[w*Nsimd+lane];
+  }
+  return lane_obj;
+}
+
+template<class vobj> accelerator_inline
+void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted)
+{
+  // Write one SIMD lane of vec: element w*Nsimd+lane of vec receives word w of extracted,
+  // with both objects viewed as flat arrays of the extract_type mapped from the scalar type.
+  using vector_type  = typename vobj::vector_type;
+  using scalar_type  = typename vector_type::scalar_type;
+  using extract_type = typename ExtractTypeMap<scalar_type>::extract_type;
+  using lane_ptr     = extract_type *;
+  constexpr int words=sizeof(vobj)/sizeof(vector_type);
+  constexpr int Nsimd=vector_type::Nsimd();
+  lane_ptr __restrict__ in  = (lane_ptr)&extracted;
+  lane_ptr __restrict__ out = (lane_ptr)&vec;
+  for(int w=0;w<words;++w){
+    out[w*Nsimd+lane]=in[w];
+  }
+}
+
+////////////////////////////////////////////////////////////////////////
+// Extract to a bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
+////////////////////////////////////////////////////////////////////////
+template<class vobj, class sobj> accelerator
+void extract(const vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
+{
+  // Copy SIMD lanes of vec into the objects extracted[i][offset]: lane i*stride of word w lands in word w of object i.
+  // Every value goes through an assignment from the vobj scalar type to the sobj scalar type on the way.
+  typedef typename GridTypeMapper<sobj>::scalar_type dst_scalar;
+  typedef typename GridTypeMapper<vobj>::scalar_type src_scalar;
+  typedef typename GridTypeMapper<vobj>::vector_type simd_type;
+  const int words =sizeof(vobj)/sizeof(simd_type);
+  const int Nsimd =simd_type::Nsimd();
+  const int Nextr =extracted.size();
+  const int stride=Nsimd/Nextr;
+  src_scalar *src = (src_scalar *)&vec;
+  src_scalar lane_val;
+  dst_scalar out_val;
+  for(int w=0;w<words;w++){
+    for(int obj=0;obj<Nextr;obj++){
+      dst_scalar *dst = (dst_scalar *)& extracted[obj][offset];
+      memcpy((char *)&lane_val,(char *)&src[w*Nsimd+obj*stride],sizeof(lane_val));
+      out_val = lane_val;
+      memcpy((char *)&dst[w],(char *)&out_val,sizeof(out_val)); // may do a precision conversion
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////
+// Merge bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
+////////////////////////////////////////////////////////////////////////
+template<class vobj, class sobj> accelerator
+void merge(vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
+{
+  // Fill the SIMD lanes of vec from the objects extracted[i][offset]: word w of object i is written to
+  // the rep = Nsimd/Nextr consecutive lanes starting at i*rep, after assignment to the vobj scalar type.
+  typedef typename GridTypeMapper<sobj>::scalar_type src_scalar;
+  typedef typename GridTypeMapper<vobj>::scalar_type dst_scalar;
+  typedef typename GridTypeMapper<vobj>::vector_type simd_type;
+  const int words=sizeof(vobj)/sizeof(simd_type);
+  const int Nsimd=simd_type::Nsimd();
+  const int Nextr=extracted.size();
+  const int rep  =Nsimd/Nextr;
+  dst_scalar *dst = (dst_scalar *)&vec;
+  dst_scalar lane_val;
+  src_scalar in_val;
+  for(int w=0;w<words;w++){
+    for(int obj=0;obj<Nextr;obj++){
+      src_scalar *src = (src_scalar *)& extracted[obj][offset];
+      for(int copy=0;copy<rep;copy++){
+        memcpy((char *)&in_val,(char *)&src[w],sizeof(in_val));
+        lane_val=in_val;
+        memcpy((char *)&dst[w*Nsimd+obj*rep+copy],(char *)&lane_val,sizeof(lane_val));
+      }
+    }
+  }
+}
+
+
+NAMESPACE_END(Grid);
+
--- a/Grid/tensors/Tensor_index.h
+++ b/Grid/tensors/Tensor_index.h
@@ -0,0 +1,411 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_index.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_TENSOR_INDEX_H
+#define GRID_TENSOR_INDEX_H
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Recursion for trace, transpose, peek, poke a specific index
+////////////////////////////////////////////////////////////////////////////////////////
+// Allow trace to recurse if vector, but never terminate on a vector
+// trace of a different index can distribute across the vector index in a replicated way
+// but we do not trace a vector index.
+
+NAMESPACE_BEGIN(Grid);
+
+/* Needed?
+   template<int Level> accelerator_inline ComplexF traceIndex(const ComplexF arg) { return arg;}
+   template<int Level> accelerator_inline ComplexD traceIndex(const ComplexD arg) { return arg;}
+   template<int Level> accelerator_inline RealF traceIndex(const RealF arg) { return arg;}
+   template<int Level> accelerator_inline RealD traceIndex(const RealD arg) { return arg;}
+*/
+template<int Level> 
+class TensorIndexRecursion {
+  // Generic (Level > 0) step: each member peels one iScalar/iVector/iMatrix layer and defers to TensorIndexRecursion<Level-1>.
+public:
+  // All members are static and take their tensor argument by value.
+  ////////////////////////////////////////////////////
+  // Type Queries: answered by Level-1 on a representative element ([0] or [0][0]) of this layer
+  ////////////////////////////////////////////////////
+  template<class vtype>       static accelerator_inline int indexRank(const iScalar<vtype> tmp)  { return TensorIndexRecursion<Level-1>::indexRank(tmp._internal);  }
+  template<class vtype,int N> static accelerator_inline int indexRank(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::indexRank(tmp._internal[0]);  }
+  template<class vtype,int N> static accelerator_inline int indexRank(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::indexRank(tmp._internal[0][0]);  }
+
+  template<class vtype>       static accelerator_inline int isScalar(const iScalar<vtype> tmp)  { return TensorIndexRecursion<Level-1>::isScalar(tmp._internal);  }
+  template<class vtype,int N> static accelerator_inline int isScalar(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isScalar(tmp._internal[0]);  }
+  template<class vtype,int N> static accelerator_inline int isScalar(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isScalar(tmp._internal[0][0]);  }
+
+  template<class vtype>       static accelerator_inline int isVector(const iScalar<vtype> tmp)  { return TensorIndexRecursion<Level-1>::isVector(tmp._internal);  }
+  template<class vtype,int N> static accelerator_inline int isVector(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isVector(tmp._internal[0]);  }
+  template<class vtype,int N> static accelerator_inline int isVector(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isVector(tmp._internal[0][0]);  }
+  
+  template<class vtype>       static accelerator_inline int isMatrix(const iScalar<vtype> tmp)  { return TensorIndexRecursion<Level-1>::isMatrix(tmp._internal);  }
+  template<class vtype,int N> static accelerator_inline int isMatrix(const iVector<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isMatrix(tmp._internal[0]);  }
+  template<class vtype,int N> static accelerator_inline int isMatrix(const iMatrix<vtype,N> tmp){ return TensorIndexRecursion<Level-1>::isMatrix(tmp._internal[0][0]);  }
+  ////////////////////////////////////////////////////
+  // Trace: applied element-wise at this level; the layer kind and N are kept around the inner result type
+  ////////////////////////////////////////////////////
+  template<class vtype>
+  static accelerator_inline auto traceIndex(const iScalar<vtype> arg) ->  iScalar<decltype(TensorIndexRecursion<Level-1>::traceIndex(arg._internal))> 
+  {
+    iScalar<decltype(TensorIndexRecursion<Level-1>::traceIndex(arg._internal))> ret;
+    ret._internal = TensorIndexRecursion<Level-1>::traceIndex(arg._internal);
+    return ret;
+  }
+  template<class vtype,int N>
+  static accelerator_inline auto traceIndex(const iVector<vtype,N> arg) ->  iVector<decltype(TensorIndexRecursion<Level-1>::traceIndex(arg._internal[0])),N> 
+  {
+    iVector<decltype(TensorIndexRecursion<Level-1>::traceIndex(arg._internal[0])),N> ret;
+    for(int i=0;i<N;i++){
+      ret._internal[i] = TensorIndexRecursion<Level-1>::traceIndex(arg._internal[i]);
+    }
+    return ret;
+  }
+  template<class vtype,int N>
+  static accelerator_inline auto traceIndex(const iMatrix<vtype,N> arg) ->  iMatrix<decltype(TensorIndexRecursion<Level-1>::traceIndex(arg._internal[0][0])),N> 
+  {
+    iMatrix<decltype(TensorIndexRecursion<Level-1>::traceIndex(arg._internal[0][0])),N> ret;
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	ret._internal[i][j] = TensorIndexRecursion<Level-1>::traceIndex(arg._internal[i][j]);
+      }}
+    return ret;
+  }
+  ////////////////////////////////////////////
+  // Recursion for peeking a specific index: one-index (i) and two-index (i,j) forms, forwarded element-wise
+  ////////////////////////////////////////////
+  template<class vtype>
+  static accelerator_inline auto peekIndex(const iScalar<vtype> arg,int i) ->  iScalar<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal,0))> 
+  {
+    iScalar<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal,0))> ret;
+    ret._internal = TensorIndexRecursion<Level-1>::peekIndex(arg._internal,i);
+    return ret;
+  }
+  template<class vtype>
+  static accelerator_inline auto peekIndex(const iScalar<vtype> arg,int i,int j) ->  iScalar<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal,0,0))> 
+  {
+    iScalar<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal,0,0))> ret;
+    ret._internal = TensorIndexRecursion<Level-1>::peekIndex(arg._internal,i,j);
+    return ret;
+  }
+  // iVector layer: ii / jj are the indices being peeked deeper down; the local loop index walks this layer.
+  template<class vtype,int N>
+  static accelerator_inline auto peekIndex(const iVector<vtype,N> arg,int ii) ->  iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal[0],0)),N> 
+  {
+    iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal[0],0)),N> ret;
+    for(int i=0;i<N;i++){
+      ret._internal[i] = TensorIndexRecursion<Level-1>::peekIndex(arg._internal[i],ii);
+    }
+    return ret;
+  }
+  template<class vtype,int N>
+  static accelerator_inline auto peekIndex(const iVector<vtype,N> arg,int ii,int jj) 
+    ->  iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal[0],0,0)),N> 
+  {
+    iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal[0],0,0)),N> ret;
+    for(int i=0;i<N;i++){
+      ret._internal[i] = TensorIndexRecursion<Level-1>::peekIndex(arg._internal[i],ii,jj);
+    }
+    return ret;
+  }
+  // iMatrix layer: same forwarding, over every (i,j) element of this layer.
+  template<class vtype,int N>
+  static accelerator_inline auto peekIndex(const iMatrix<vtype,N> arg,int ii) ->  iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal[0][0],0)),N> 
+  {
+    iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal[0][0],0)),N> ret;
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	ret._internal[i][j] = TensorIndexRecursion<Level-1>::peekIndex(arg._internal[i][j],ii);
+      }}
+    return ret;
+  }
+  template<class vtype,int N>
+  static accelerator_inline auto peekIndex(const iMatrix<vtype,N> arg,int ii,int jj) 
+    ->  iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal[0][0],0,0)),N> 
+  {
+    iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(arg._internal[0][0],0,0)),N> ret;
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	ret._internal[i][j] = TensorIndexRecursion<Level-1>::peekIndex(arg._internal[i][j],ii,jj);
+      }}
+    return ret;
+  }
+  ////////////////////////////////////////////
+  // Recursion for poking a specific index: arg is typed as this layer around what the Level-1 peekIndex returns for ret's element
+  ////////////////////////////////////////////
+  
+  template<class vtype> accelerator_inline static 
+  void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal,0))> &arg, int i)
+  {
+    TensorIndexRecursion<Level-1>::pokeIndex(ret._internal,arg._internal,i);
+  }
+  template<class vtype> accelerator_inline static 
+  void pokeIndex(iScalar<vtype> &ret, const iScalar<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal,0,0))> &arg, int i,int j)
+  {
+    TensorIndexRecursion<Level-1>::pokeIndex(ret._internal,arg._internal,i,j);
+  }
+  // iVector layer: here i / j are the indices poked deeper down and ii walks this layer (naming is swapped relative to peekIndex).
+  template<class vtype,int N> accelerator_inline static 
+  void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0],0)),N> &arg, int i)
+  {
+    for(int ii=0;ii<N;ii++){
+      TensorIndexRecursion<Level-1>::pokeIndex(ret._internal[ii],arg._internal[ii],i);
+    }
+  }
+  template<class vtype,int N> accelerator_inline static 
+  void pokeIndex(iVector<vtype,N> &ret, const iVector<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0],0,0)),N> &arg, int i,int j)
+  {
+    for(int ii=0;ii<N;ii++){
+      TensorIndexRecursion<Level-1>::pokeIndex(ret._internal[ii],arg._internal[ii],i,j);
+    }
+  }
+  // iMatrix layer: element (ii,jj) of arg is poked into element (ii,jj) of ret.
+  template<class vtype,int N> accelerator_inline static 
+  void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0][0],0)),N> &arg, int i)
+  {
+    for(int ii=0;ii<N;ii++){
+      for(int jj=0;jj<N;jj++){
+	TensorIndexRecursion<Level-1>::pokeIndex(ret._internal[ii][jj],arg._internal[ii][jj],i);
+      }}
+  }
+  template<class vtype,int N> accelerator_inline static 
+  void pokeIndex(iMatrix<vtype,N> &ret, const iMatrix<decltype(TensorIndexRecursion<Level-1>::peekIndex(ret._internal[0][0],0,0)),N> &arg, int i,int j)
+  {
+    for(int ii=0;ii<N;ii++){
+      for(int jj=0;jj<N;jj++){
+	TensorIndexRecursion<Level-1>::pokeIndex(ret._internal[ii][jj],arg._internal[ii][jj],i,j);
+      }}
+  }
+
+  ////////////////////////////////////////////
+  // Recursion for transposing a specific index: element-wise at this level, so the result type equals the argument type
+  ////////////////////////////////////////////
+  template<class vtype> accelerator_inline 
+  static auto transposeIndex(const iScalar<vtype> arg) ->  iScalar<vtype> 
+  {
+    iScalar<vtype> ret;
+    ret._internal = TensorIndexRecursion<Level-1>::transposeIndex(arg._internal);
+    return ret;
+  }
+  template<class vtype,int N> accelerator_inline 
+  static auto transposeIndex(const iVector<vtype,N> arg) ->  iVector<vtype,N> 
+  {
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++){
+      ret._internal[i] = TensorIndexRecursion<Level-1>::transposeIndex(arg._internal[i]);
+    }
+    return ret;
+  }
+  template<class vtype,int N> accelerator_inline 
+  static auto transposeIndex(const iMatrix<vtype,N> arg) ->  iMatrix<vtype,N> 
+  {
+    iMatrix<vtype,N> ret;
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	ret._internal[i][j] = TensorIndexRecursion<Level-1>::transposeIndex(arg._internal[i][j]);
+      }}
+    return ret;
+  }
+};
+
+////////////////////////////
+// strip const & reference qualifiers
+////////////////////////////
+#define RemoveCRV(a) typename std::remove_const<typename std::remove_reference<decltype(a)>::type>::type
+template<>
+class TensorIndexRecursion<0> {
+public:
+  ////////////////////////////////////////////////////
+  // Type Queries: Level 0 is where the recursion stops, so these report on the layer itself
+  ////////////////////////////////////////////////////
+  template<class vtype>       static accelerator_inline int indexRank(const iScalar<vtype> tmp)  { return 1; }
+  template<class vtype,int N> static accelerator_inline int indexRank(const iVector<vtype,N> tmp){ return N; }
+  template<class vtype,int N> static accelerator_inline int indexRank(const iMatrix<vtype,N> tmp){ return N; }
+  // isScalar / isVector / isMatrix: exactly one of the three is true for any given layer kind.
+  template<class vtype>       static accelerator_inline int isScalar(const iScalar<vtype> tmp)  { return true;}
+  template<class vtype,int N> static accelerator_inline int isScalar(const iVector<vtype,N> tmp){ return false;}
+  template<class vtype,int N> static accelerator_inline int isScalar(const iMatrix<vtype,N> tmp){ return false;}
+
+  template<class vtype>       static accelerator_inline int isVector(const iScalar<vtype> tmp)  { return false;}
+  template<class vtype,int N> static accelerator_inline int isVector(const iVector<vtype,N> tmp){ return true;}
+  template<class vtype,int N> static accelerator_inline int isVector(const iMatrix<vtype,N> tmp){ return false;}
+
+  template<class vtype>       static accelerator_inline int isMatrix(const iScalar<vtype> tmp)  { return false;}
+  template<class vtype,int N> static accelerator_inline int isMatrix(const iVector<vtype,N> tmp){ return false;}
+  template<class vtype,int N> static accelerator_inline int isMatrix(const iMatrix<vtype,N> tmp){ return true;}
+
+  /////////////////////////////////////////
+  // Ends recursion for trace (scalar/vector/matrix): copy for scalar, sum of entries for vector, sum of diagonal for matrix
+  /////////////////////////////////////////
+  template<class vtype> accelerator_inline 
+  static auto traceIndex(const iScalar<vtype> arg) ->  iScalar<RemoveCRV(arg._internal)>
+  {
+    iScalar<RemoveCRV(arg._internal)> ret;
+    ret._internal = arg._internal;
+    return ret;
+  }
+  template<class vtype,int N> accelerator_inline 
+  static auto traceIndex(const iVector<vtype,N> arg) ->  iScalar<RemoveCRV(arg._internal[0])>
+  {
+    iScalar<RemoveCRV(arg._internal[0])> ret;
+    ret._internal=Zero();
+    for(int i=0;i<N;i++){
+      ret._internal = ret._internal+ arg._internal[i];
+    }
+    return ret;
+  }
+  template<class vtype,int N> accelerator_inline 
+  static auto traceIndex(const iMatrix<vtype,N> arg) ->  iScalar<RemoveCRV(arg._internal[0][0])> 
+  {
+    iScalar<RemoveCRV(arg._internal[0][0])> ret;
+    zeroit(ret);
+    for(int i=0;i<N;i++){
+      ret._internal = ret._internal+arg._internal[i][i];
+    }
+    return ret;
+  }
+  /////////////////////////////////////////
+  // Ends recursion for transpose scalar/matrix ; no way to terminate on vector
+  /////////////////////////////////////////
+  template<class vtype> accelerator_inline 
+  static auto transposeIndex(const iScalar<vtype> arg) ->  iScalar<vtype>
+  {
+    iScalar<vtype> ret;
+    ret._internal = arg._internal;
+    return ret;
+  }
+  template<class vtype,int N> accelerator_inline 
+  static auto transposeIndex(const iMatrix<vtype,N> arg)  ->  iMatrix<vtype,N> 
+  {
+    iMatrix<vtype,N> ret;
+    // Swap row and column: entry (i,j) of the result is entry (j,i) of arg. Every entry is assigned, so no zeroing is needed.
+    for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	ret._internal[i][j] = arg._internal[j][i];
+      }}
+    return ret;
+  }
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  // End recursion for peeking a specific index; single index on vector, double index on matrix
+  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+  template<class vtype,int N> accelerator_inline 
+  static auto peekIndex(const iVector<vtype,N> arg,int ii) ->  iScalar<vtype> 
+  {
+    iScalar<vtype> ret;
+    ret._internal = arg._internal[ii];
+    return ret;
+  }
+  template<class vtype,int N> accelerator_inline 
+  static auto peekIndex(const iMatrix<vtype,N> arg,int ii,int jj) ->  iScalar<vtype>
+  {
+    iScalar<vtype> ret;
+    ret._internal = arg._internal[ii][jj];
+    return ret;
+  }
+  // Vector poke, one index: overwrite entry i of ret with the value wrapped in arg (i is not range-checked)
+  template<class vtype,int N> accelerator_inline static 
+  void pokeIndex(iVector<vtype,N> &ret, const iScalar<vtype> &arg,int i)
+  {
+    ret._internal[i] = arg._internal;
+  }
+  // Matrix poke two indices: overwrite entry (i,j) of ret with the value wrapped in arg (indices are not range-checked)
+  template<class vtype,int N> accelerator_inline static 
+  void pokeIndex(iMatrix<vtype,N> &ret, const iScalar<vtype> &arg,int i,int j)
+  {
+    ret._internal[i][j] = arg._internal;
+  }
+  
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+// External wrappers
+////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<int Level,class vtype> accelerator_inline int indexRank(void) {
+  // The recursion only needs an object of the right type to select overloads, so a default-constructed probe is passed.
+  vtype probe;
+  return TensorIndexRecursion<Level>::indexRank(probe);
+}
+template<int Level,class vtype> accelerator_inline int isScalar(void) {
+  // Reports whether the layer reached at depth Level of vtype is an iScalar; the probe object only selects overloads.
+  vtype probe;
+  return TensorIndexRecursion<Level>::isScalar(probe);
+}
+template<int Level,class vtype> accelerator_inline int isVector(void) {
+  // Reports whether the layer reached at depth Level of vtype is an iVector; the probe object only selects overloads.
+  vtype probe;
+  return TensorIndexRecursion<Level>::isVector(probe);
+}
+template<int Level,class vtype> accelerator_inline int isMatrix(void) {
+  // Reports whether the layer reached at depth Level of vtype is an iMatrix; the probe object only selects overloads.
+  vtype probe;
+  return TensorIndexRecursion<Level>::isMatrix(probe);
+}
+
+// Free-function entry point: trace over the tensor index at depth Level by delegating to TensorIndexRecursion<Level>.
+template<int Level,class vtype> accelerator_inline auto traceIndex (const vtype &arg) -> RemoveCRV(TensorIndexRecursion<Level>::traceIndex(arg)) {
+  RemoveCRV(TensorIndexRecursion<Level>::traceIndex(arg)) traced;
+  traced=TensorIndexRecursion<Level>::traceIndex(arg);
+  return traced;
+}
+// Free-function entry point: transpose the tensor index at depth Level by delegating to TensorIndexRecursion<Level>.
+template<int Level,class vtype> accelerator_inline auto transposeIndex (const vtype &arg) -> RemoveCRV(TensorIndexRecursion<Level>::transposeIndex(arg)) {
+  RemoveCRV(TensorIndexRecursion<Level>::transposeIndex(arg)) flipped;
+  flipped=TensorIndexRecursion<Level>::transposeIndex(arg);
+  return flipped;
+}
+
+// Free-function entry point: read component i of the tensor index at depth Level (single-index form).
+template<int Level,class vtype> accelerator_inline auto peekIndex (const vtype &arg,int i) -> RemoveCRV(TensorIndexRecursion<Level>::peekIndex(arg,0)) {
+  RemoveCRV(TensorIndexRecursion<Level>::peekIndex(arg,0)) picked;
+  picked=TensorIndexRecursion<Level>::peekIndex(arg,i);
+  return picked;
+}
+// Free-function entry point: read component (i,j) of the tensor index at depth Level (two-index form).
+template<int Level,class vtype> accelerator_inline auto peekIndex (const vtype &arg,int i,int j) -> RemoveCRV(TensorIndexRecursion<Level>::peekIndex(arg,0,0)) {
+  RemoveCRV(TensorIndexRecursion<Level>::peekIndex(arg,0,0)) picked;
+  picked=TensorIndexRecursion<Level>::peekIndex(arg,i,j);
+  return picked;
+}
+
+// Free-function entry point: write arg into component i of ret's index at depth Level; arg has the type peekIndex(ret,0) yields.
+template<int Level,class vtype> accelerator_inline
+void pokeIndex (vtype &ret,const decltype(TensorIndexRecursion<Level>::peekIndex(ret,0)) &arg,int i) {
+  TensorIndexRecursion<Level>::pokeIndex(ret,arg,i);
+}
+
+// Free-function entry point: write arg into component (i,j) of ret's index at depth Level; arg has the type peekIndex(ret,0,0) yields.
+template<int Level,class vtype> accelerator_inline
+void pokeIndex (vtype &ret,const decltype(TensorIndexRecursion<Level>::peekIndex(ret,0,0)) &arg,int i,int j) {
+  TensorIndexRecursion<Level>::pokeIndex(ret,arg,i,j);
+}
+
+
+#undef RemoveCRV
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_inner.h
+++ b/Grid/tensors/Tensor_inner.h
@@ -0,0 +1,139 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_inner.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_INNER_H
+#define GRID_MATH_INNER_H
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////////////////////////////////////////
+// innerProduct Scalar x Scalar -> Scalar
+// innerProduct Vector x Vector -> Scalar
+// innerProduct Matrix x Matrix -> Scalar
+///////////////////////////////////////////////////////////////////////////////////////
+// Squared norm of arg: the real part of innerProductD(arg,arg), returned as RealD.
+template<class sobj> accelerator_inline RealD norm2(const sobj &arg){
+  auto selfProduct = innerProductD(arg,arg);
+  RealD squaredNorm = real(selfProduct);
+  return squaredNorm;
+}
+//////////////////////////////////////
+// If single precision, promote to double and sum the two resulting double-precision halves
+//////////////////////////////////////
+
+// Fundamental scalar overloads: delegate to innerProduct and return the result
+// as the double-precision type (ComplexD / RealD) for both input precisions.
+accelerator_inline ComplexD innerProductD(const ComplexF &l,const ComplexF &r)
+{
+  return innerProduct(l,r);
+}
+accelerator_inline ComplexD innerProductD(const ComplexD &l,const ComplexD &r)
+{
+  return innerProduct(l,r);
+}
+accelerator_inline RealD    innerProductD(const RealD    &l,const RealD    &r)
+{
+  return innerProduct(l,r);
+}
+accelerator_inline RealD    innerProductD(const RealF    &l,const RealF    &r)
+{
+  return innerProduct(l,r);
+}
+
+// Double-precision SIMD overloads: already in the target precision, so simply
+// delegate to innerProduct.
+accelerator_inline vComplexD innerProductD(const vComplexD &l,const vComplexD &r)
+{
+  return innerProduct(l,r);
+}
+accelerator_inline vRealD    innerProductD(const vRealD    &l,const vRealD    &r)
+{
+  return innerProduct(l,r);
+}
+// Single-precision complex SIMD overload. StoD splits each vComplexF operand
+// into two vComplexD words (presumably a single-to-double conversion - confirm
+// against Optimization::PrecisionChange); the two partial inner products are
+// then added in double precision.
+accelerator_inline vComplexD innerProductD(const vComplexF &l,const vComplexF &r){
+  vComplexD l0,l1;
+  vComplexD r0,r1;
+  Optimization::PrecisionChange::StoD(l.v,l0.v,l1.v);
+  Optimization::PrecisionChange::StoD(r.v,r0.v,r1.v);
+  return innerProduct(l0,r0) + innerProduct(l1,r1);
+}
+// Single-precision real SIMD overload. StoD splits each vRealF operand into
+// two vRealD words (presumably a single-to-double conversion - confirm against
+// Optimization::PrecisionChange); the two partial inner products are then
+// added in double precision.
+accelerator_inline vRealD innerProductD(const vRealF &l,const vRealF &r){
+  vRealD l0,l1;
+  vRealD r0,r1;
+  Optimization::PrecisionChange::StoD(l.v,l0.v,l1.v);
+  Optimization::PrecisionChange::StoD(r.v,r0.v,r1.v);
+  return innerProduct(l0,r0) + innerProduct(l1,r1);
+}
+
+// Vector x Vector -> Scalar: accumulates innerProductD over corresponding
+// components into a zero-initialised iScalar.
+template<class l,class r,int N> accelerator_inline
+auto innerProductD (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProductD(lhs._internal[0],rhs._internal[0]))>
+{
+  typedef decltype(innerProductD(lhs._internal[0],rhs._internal[0])) elem_t;
+  iScalar<elem_t> acc;
+  zeroit(acc);
+  for(int idx=0; idx<N; ++idx){
+    acc._internal += innerProductD(lhs._internal[idx],rhs._internal[idx]);
+  }
+  return acc;
+}
+// Matrix x Matrix -> Scalar: accumulates innerProductD over all N*N pairs of
+// corresponding elements into a zero-initialised iScalar.
+template<class l,class r,int N> accelerator_inline
+auto innerProductD (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0]))>
+{
+  typedef decltype(innerProductD(lhs._internal[0][0],rhs._internal[0][0])) elem_t;
+  iScalar<elem_t> acc;
+  acc=Zero();
+  for(int row=0; row<N; ++row){
+    for(int col=0; col<N; ++col){
+      acc._internal+=innerProductD(lhs._internal[row][col],rhs._internal[row][col]);
+    }
+  }
+  return acc;
+}
+// Scalar x Scalar -> Scalar: unwraps both operands, applies innerProductD to
+// the wrapped values and re-wraps the result.
+template<class l,class r> accelerator_inline
+auto innerProductD (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProductD(lhs._internal,rhs._internal))>
+{
+  typedef decltype(innerProductD(lhs._internal,rhs._internal)) elem_t;
+  iScalar<elem_t> wrapped;
+  wrapped._internal = innerProductD(lhs._internal,rhs._internal);
+  return wrapped;
+}
+//////////////////////
+// Keep same precision
+//////////////////////
+// Vector x Vector -> Scalar at the operands' own precision: accumulates
+// innerProduct over corresponding components into a zero-initialised iScalar.
+template<class l,class r,int N> accelerator_inline
+auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
+{
+  typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) elem_t;
+  iScalar<elem_t> acc;
+  acc=Zero();
+  for(int idx=0; idx<N; ++idx){
+    acc._internal += innerProduct(lhs._internal[idx],rhs._internal[idx]);
+  }
+  return acc;
+}
+// Matrix x Matrix -> Scalar at the operands' own precision: accumulates
+// innerProduct over all N*N pairs of corresponding elements into a
+// zero-initialised iScalar.
+template<class l,class r,int N> accelerator_inline
+auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
+{
+  typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
+  iScalar<ret_t> ret;
+  ret=Zero();
+  for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+      ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
+    }
+  }
+  return ret;
+}
+// Scalar x Scalar -> Scalar at the operands' own precision: unwraps both
+// operands, applies innerProduct to the wrapped values and re-wraps the result.
+template<class l,class r> accelerator_inline
+auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
+{
+  typedef decltype(innerProduct(lhs._internal,rhs._internal)) elem_t;
+  iScalar<elem_t> wrapped;
+  wrapped._internal = innerProduct(lhs._internal,rhs._internal);
+  return wrapped;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_logical.h
+++ b/Grid/tensors/Tensor_logical.h
@@ -0,0 +1,92 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_logical.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#pragma once
+
+NAMESPACE_BEGIN(Grid);
+
+// LOGICAL_BINOP(Op) generates three overloads of `operator Op` for iScalar:
+//   iScalar Op iScalar : applies Op to the two wrapped _internal values;
+//   iScalar Op Integer : converts the Integer to scalar_type, then to
+//                        tensor_reduced, and reuses an iScalar Op overload;
+//   Integer Op iScalar : the same conversion applied to the left operand.
+// NOTE: as with any overloaded operator, the || and && instantiations evaluate
+// both operands; the built-in short-circuit behaviour does not apply to them.
+#define LOGICAL_BINOP(Op)						\
+  template<class v> accelerator_inline iScalar<v> operator Op (const iScalar<v>& lhs,const iScalar<v>& rhs) \
+  {									\
+    iScalar<v> ret;							\
+    ret._internal = lhs._internal Op rhs._internal ;			\
+    return ret;								\
+  }									\
+  template<class l> accelerator_inline iScalar<l> operator Op (const iScalar<l>& lhs,Integer rhs) \
+  {									\
+    typename iScalar<l>::scalar_type t; t=rhs;				\
+    typename iScalar<l>::tensor_reduced srhs; srhs=t;			\
+    return lhs Op srhs;							\
+  }									\
+  template<class l> accelerator_inline iScalar<l> operator Op (Integer lhs,const iScalar<l>& rhs) \
+  {									\
+    typename iScalar<l>::scalar_type t;t=lhs;				\
+    typename iScalar<l>::tensor_reduced slhs;slhs=t;			\
+    return slhs Op rhs;							\
+  }
+
+// Instantiate the overload set for bitwise and logical or/and.
+LOGICAL_BINOP(|);
+LOGICAL_BINOP(&);
+LOGICAL_BINOP(||);
+LOGICAL_BINOP(&&);
+
+// Equality of two iScalars: true exactly when the wrapped values compare equal.
+template <class T>
+strong_inline bool operator==(const iScalar<T> &t1, const iScalar<T> &t2)
+{
+  const bool same = (t1._internal == t2._internal);
+  return same;
+}
+
+// Equality of two iVectors: true only if every pair of corresponding
+// components compares equal.
+template <class T, int N>
+strong_inline bool operator==(const iVector<T, N> &t1, const iVector<T, N> &t2)
+{
+  // The index is a signed int so that it has the same type as N.
+  for (int i = 0; i < N; ++i)
+  {
+    // The first mismatch decides the answer; no later component is compared.
+    if (!(t1._internal[i] == t2._internal[i])) return false;
+  }
+
+  return true;
+}
+
+// Equality of two iMatrices: true only if every pair of corresponding
+// elements compares equal. Elements are visited in row-major order.
+template <class T, int N>
+strong_inline bool operator==(const iMatrix<T, N> &t1, const iMatrix<T, N> &t2)
+{
+  // The indices are signed ints so that they have the same type as N.
+  for (int i = 0; i < N; ++i)
+  for (int j = 0; j < N; ++j)
+  {
+    // The first mismatch decides the answer; no later element is compared.
+    if (!(t1._internal[i][j] == t2._internal[i][j])) return false;
+  }
+
+  return true;
+}
+
+NAMESPACE_END(Grid);
+
+
--- a/Grid/tensors/Tensor_outer.h
+++ b/Grid/tensors/Tensor_outer.h
@@ -0,0 +1,80 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_outer.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_OUTER_H
+#define GRID_MATH_OUTER_H
+
+NAMESPACE_BEGIN(Grid);
+
+///////////////////////////////////////////////////////////////////////////////////////
+// outerProduct Scalar x Scalar -> Scalar
+//              Vector x Vector -> Matrix
+///////////////////////////////////////////////////////////////////////////////////////
+
+// Fundamental scalar overloads. These are declared ahead of the tensor
+// templates on purpose: the templates name outerProduct on their element types
+// inside trailing return types, and for element types that do not live in
+// namespace Grid (NOTE(review): e.g. if ComplexF is std::complex<float> and
+// RealF is float - confirm) argument-dependent lookup cannot find overloads
+// declared later in this namespace.
+
+// Complex x Complex: left operand times the complex conjugate of the right.
+accelerator_inline ComplexF outerProduct(const ComplexF &l, const ComplexF& r)
+{
+  return l*conj(r);
+}
+accelerator_inline ComplexD outerProduct(const ComplexD &l, const ComplexD& r)
+{
+  return l*conj(r);
+}
+// Real x Real: plain product.
+accelerator_inline RealF outerProduct(const RealF &l, const RealF& r)
+{
+  return l*r;
+}
+accelerator_inline RealD outerProduct(const RealD &l, const RealD& r)
+{
+  return l*r;
+}
+
+// Vector x Vector -> Matrix: element (c1,c2) of the result is
+// outerProduct(lhs[c1], rhs[c2]).
+template<class l,class r,int N> accelerator_inline
+auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
+{
+  typedef decltype(outerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
+  iMatrix<ret_t,N> ret;
+  for(int c1=0;c1<N;c1++){
+    for(int c2=0;c2<N;c2++){
+      ret._internal[c1][c2] = outerProduct(lhs._internal[c1],rhs._internal[c2]);
+    }
+  }
+  return ret;
+}
+
+// Scalar x Scalar -> Scalar: unwraps both operands, applies outerProduct to
+// the wrapped values and re-wraps the result.
+template<class l,class r> accelerator_inline
+auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(outerProduct(lhs._internal,rhs._internal))>
+{
+  typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t;
+  iScalar<ret_t> ret;
+  ret._internal = outerProduct(lhs._internal,rhs._internal);
+  return ret;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_reality.h
+++ b/Grid/tensors/Tensor_reality.h
@@ -0,0 +1,236 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_reality.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_REALITY_H
+#define GRID_MATH_REALITY_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////// 
+// multiply by I; make recursive.
+/////////////////////////////////////////////// 
+// Value-returning timesI overloads. Each one builds a result object of the
+// same tensor shape and fills it element by element through the two-argument
+// (destination, source) form of timesI.
+template<class vtype> accelerator_inline iScalar<vtype> timesI(const iScalar<vtype>&r)
+{
+  iScalar<vtype> out;
+  timesI(out._internal,r._internal);
+  return out;
+}
+template<class vtype,int N> accelerator_inline iVector<vtype,N> timesI(const iVector<vtype,N>&r)
+{
+  iVector<vtype,N> out;
+  for(int idx=0; idx<N; ++idx){
+    timesI(out._internal[idx],r._internal[idx]);
+  }
+  return out;
+}
+template<class vtype,int N> accelerator_inline iMatrix<vtype,N> timesI(const iMatrix<vtype,N>&r)
+{
+  iMatrix<vtype,N> out;
+  for(int row=0; row<N; ++row){
+    for(int col=0; col<N; ++col){
+      timesI(out._internal[row][col],r._internal[row][col]);
+    }
+  }
+  return out;
+}
+
+// In-place timesI overloads: write into ret the element-wise timesI of r by
+// recursing into each wrapped element with the same (destination, source) form.
+template<class vtype> accelerator_inline void timesI(iScalar<vtype> &ret,const iScalar<vtype>&r)
+{
+  timesI(ret._internal,r._internal);
+}
+template<class vtype,int N> accelerator_inline void timesI(iVector<vtype,N> &ret,const iVector<vtype,N>&r)
+{
+  for(int idx=0; idx<N; ++idx){
+    timesI(ret._internal[idx],r._internal[idx]);
+  }
+}
+template<class vtype,int N> accelerator_inline void  timesI(iMatrix<vtype,N> &ret,const iMatrix<vtype,N>&r)
+{
+  for(int row=0; row<N; ++row){
+    for(int col=0; col<N; ++col){
+      timesI(ret._internal[row][col],r._internal[row][col]);
+    }
+  }
+}
+
+
+// Value-returning timesMinusI overloads. Each one builds a result object of
+// the same tensor shape and fills it element by element through the
+// two-argument (destination, source) form of timesMinusI.
+template<class vtype> accelerator_inline iScalar<vtype> timesMinusI(const iScalar<vtype>&r)
+{
+  iScalar<vtype> out;
+  timesMinusI(out._internal,r._internal);
+  return out;
+}
+template<class vtype,int N> accelerator_inline iVector<vtype,N> timesMinusI(const iVector<vtype,N>&r)
+{
+  iVector<vtype,N> out;
+  for(int idx=0; idx<N; ++idx){
+    timesMinusI(out._internal[idx],r._internal[idx]);
+  }
+  return out;
+}
+template<class vtype,int N> accelerator_inline iMatrix<vtype,N> timesMinusI(const iMatrix<vtype,N>&r)
+{
+  iMatrix<vtype,N> out;
+  for(int row=0; row<N; ++row){
+    for(int col=0; col<N; ++col){
+      timesMinusI(out._internal[row][col],r._internal[row][col]);
+    }
+  }
+  return out;
+}
+
+// In-place timesMinusI overloads: write into ret the element-wise timesMinusI
+// of r by recursing into each wrapped element with the same (destination,
+// source) form.
+template<class vtype>  accelerator_inline void timesMinusI(iScalar<vtype> &ret,const iScalar<vtype>&r)
+{
+  timesMinusI(ret._internal,r._internal);
+}
+template<class vtype,int N> accelerator_inline void timesMinusI(iVector<vtype,N> &ret,const iVector<vtype,N>&r)
+{
+  for(int idx=0; idx<N; ++idx){
+    timesMinusI(ret._internal[idx],r._internal[idx]);
+  }
+}
+template<class vtype,int N> accelerator_inline void  timesMinusI(iMatrix<vtype,N> &ret,const iMatrix<vtype,N>&r)
+{
+  for(int row=0; row<N; ++row){
+    for(int col=0; col<N; ++col){
+      timesMinusI(ret._internal[row][col],r._internal[row][col]);
+    }
+  }
+}
+
+
+/////////////////////////////////////////////// 
+// Conj function for scalar, vector, matrix
+/////////////////////////////////////////////// 
+// conjugate applied element-wise: the result has the same tensor shape as the
+// argument and every element is conjugate() of the corresponding source
+// element. Index positions are unchanged (no transpose).
+template<class vtype> accelerator_inline iScalar<vtype> conjugate(const iScalar<vtype>&r)
+{
+  iScalar<vtype> out;
+  out._internal = conjugate(r._internal);
+  return out;
+}
+template<class vtype,int N> accelerator_inline iVector<vtype,N> conjugate(const iVector<vtype,N>&r)
+{
+  iVector<vtype,N> out;
+  for(int idx=0; idx<N; ++idx){
+    out._internal[idx] = conjugate(r._internal[idx]);
+  }
+  return out;
+}
+template<class vtype,int N> accelerator_inline iMatrix<vtype,N> conjugate(const iMatrix<vtype,N>&r)
+{
+  iMatrix<vtype,N> out;
+  for(int row=0; row<N; ++row){
+    for(int col=0; col<N; ++col){
+      out._internal[row][col] = conjugate(r._internal[row][col]);
+    }
+  }
+  return out;
+}
+
+/////////////////////////////////////////////// 
+// Adj function for scalar, vector, matrix
+/////////////////////////////////////////////// 
+// adj applied recursively. For iScalar and iVector it is applied to each
+// element in place; for iMatrix the row and column indices are swapped as
+// well, so result(row,col) = adj(arg(col,row)).
+template<class vtype> accelerator_inline iScalar<vtype> adj(const iScalar<vtype>&r)
+{
+  iScalar<vtype> out;
+  out._internal = adj(r._internal);
+  return out;
+}
+template<class vtype,int N> accelerator_inline iVector<vtype,N> adj(const iVector<vtype,N>&r)
+{
+  iVector<vtype,N> out;
+  for(int idx=0; idx<N; ++idx){
+    out._internal[idx] = adj(r._internal[idx]);
+  }
+  return out;
+}
+template<class vtype,int N> accelerator_inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &arg)
+{
+  iMatrix<vtype,N> out;
+  for(int row=0; row<N; ++row){
+    for(int col=0; col<N; ++col){
+      out._internal[row][col]=adj(arg._internal[col][row]);
+    }
+  }
+  return out;
+}
+
+
+
+
+
+
+/////////////////////////////////////////////////////////////////
+// Can only take the real/imag part of scalar objects, since
+// lattice objects of different complex nature are non-conformable.
+/////////////////////////////////////////////////////////////////
+// real applied element-wise. The result keeps the tensor shape of the
+// argument; its element type is whatever real() returns for the source
+// element type.
+template<class itype> accelerator_inline auto real(const iScalar<itype> &z) -> iScalar<decltype(real(z._internal))>
+{
+  typedef decltype(real(z._internal)) part_t;
+  iScalar<part_t> out;
+  out._internal = real(z._internal);
+  return out;
+}
+template<class itype,int N> accelerator_inline auto real(const iMatrix<itype,N> &z) -> iMatrix<decltype(real(z._internal[0][0])),N>
+{
+  typedef decltype(real(z._internal[0][0])) part_t;
+  iMatrix<part_t,N> out;
+  for(int row=0; row<N; ++row){
+    for(int col=0; col<N; ++col){
+      out._internal[row][col] = real(z._internal[row][col]);
+    }
+  }
+  return out;
+}
+template<class itype,int N> accelerator_inline auto real(const iVector<itype,N> &z) -> iVector<decltype(real(z._internal[0])),N>
+{
+  typedef decltype(real(z._internal[0])) part_t;
+  iVector<part_t,N> out;
+  for(int idx=0; idx<N; ++idx){
+    out._internal[idx] = real(z._internal[idx]);
+  }
+  return out;
+}
+    
+// imag applied element-wise. The result keeps the tensor shape of the
+// argument; its element type is whatever imag() returns for the source
+// element type.
+template<class itype> accelerator_inline auto imag(const iScalar<itype> &z) -> iScalar<decltype(imag(z._internal))>
+{
+  typedef decltype(imag(z._internal)) part_t;
+  iScalar<part_t> out;
+  out._internal = imag(z._internal);
+  return out;
+}
+template<class itype,int N> accelerator_inline auto imag(const iMatrix<itype,N> &z) -> iMatrix<decltype(imag(z._internal[0][0])),N>
+{
+  typedef decltype(imag(z._internal[0][0])) part_t;
+  iMatrix<part_t,N> out;
+  for(int row=0; row<N; ++row){
+    for(int col=0; col<N; ++col){
+      out._internal[row][col] = imag(z._internal[row][col]);
+    }
+  }
+  return out;
+}
+template<class itype,int N> accelerator_inline auto imag(const iVector<itype,N> &z) -> iVector<decltype(imag(z._internal[0])),N>
+{
+  typedef decltype(imag(z._internal[0])) part_t;
+  iVector<part_t,N> out;
+  for(int idx=0; idx<N; ++idx){
+    out._internal[idx] = imag(z._internal[idx]);
+  }
+  return out;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_trace.h
+++ b/Grid/tensors/Tensor_trace.h
@@ -0,0 +1,75 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_trace.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_TRACE_H
+#define GRID_MATH_TRACE_H
+
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////////
+// Traces: both all indices and a specific index. Indices must be
+// either scalar or matrix
+/////////////////////////////////////////////////////////////////
+
+// Fundamental scalar overloads: the trace of a plain number is the number
+// itself. These terminate the recursion of the tensor overloads.
+accelerator_inline ComplexF trace( const ComplexF &arg)
+{
+  return arg;
+}
+accelerator_inline ComplexD trace( const ComplexD &arg)
+{
+  return arg;
+}
+accelerator_inline RealF trace( const RealF &arg)
+{
+  return arg;
+}
+accelerator_inline RealD trace( const RealD &arg)
+{
+  return arg;
+}
+
+// Trace of an iMatrix: the sum over the diagonal of trace() of each diagonal
+// element, so nested tensor levels are traced as well.
+template<class vtype,int N>
+accelerator_inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
+{
+  typedef decltype( trace(arg._internal[0][0] )) elem_t;
+  iScalar<elem_t> sum;
+  zeroit(sum._internal);
+  for(int diag=0; diag<N; ++diag){
+    sum._internal=sum._internal+trace(arg._internal[diag][diag]);
+  }
+  return sum;
+}
+
+// Trace of an iScalar: traces the wrapped value and re-wraps the result.
+template<class vtype>
+accelerator_inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._internal))>
+{
+  typedef decltype(trace(arg._internal)) elem_t;
+  iScalar<elem_t> wrapped;
+  wrapped._internal=trace(arg._internal);
+  return wrapped;
+}
+
+// Trace of an iVector: the vector index itself is not summed; trace() is
+// applied to each component and the results are returned as an iVector.
+template<class vtype,int N>
+accelerator_inline auto trace(const iVector<vtype,N> &arg) -> iVector<decltype(trace(arg._internal[0])),N>
+{
+  typedef decltype(trace(arg._internal[0])) elem_t;
+  iVector<elem_t,N> out;
+  for(int idx=0; idx<N; ++idx){
+    out._internal[idx]=trace(arg._internal[idx]);
+  }
+  return out;
+}
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_traits.h
+++ b/Grid/tensors/Tensor_traits.h
@@ -0,0 +1,296 @@
+/*************************************************************************************
+    Grid physics library, www.github.com/paboyle/Grid 
+    Source file: ./lib/tensors/Tensor_traits.h
+    Copyright (C) 2015
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: Christopher Kelly <ckelly@phys.columbia.edu>
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_TRAITS_H
+#define GRID_MATH_TRAITS_H
+
+#include <type_traits>
+
+NAMESPACE_BEGIN(Grid);
+
+//////////////////////////////////////////////////////////////////////////////////
+// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
+// Use of a helper class like this allows us to template specialise and "dress"
+// other classes such as RealD == double, ComplexD == std::complex<double> with these
+// traits.
+//
+// It is possible that we could do this more elegantly if I introduced a 
+// queryable trait in iScalar, iMatrix and iVector and used the query on vtype in 
+// place of the type mapper?
+//
+// Not sure how to do this, but probably could be done with a research effort
+// to study C++11's type_traits.h file. (std::enable_if<isGridTensorType<vtype> >)
+//
+//////////////////////////////////////////////////////////////////////////////////
+  
+// Primary template: every trait is forwarded to the nested name of the same
+// spelling inside T, so T must publish all of these member types plus a
+// TensorLevel constant.
+template <class T> class GridTypeMapper {
+public:
+  using scalar_type     = typename T::scalar_type;
+  using vector_type     = typename T::vector_type;
+  using vector_typeD    = typename T::vector_typeD;
+  using tensor_reduced  = typename T::tensor_reduced;
+  using scalar_object   = typename T::scalar_object;
+  using Complexified    = typename T::Complexified;
+  using Realified       = typename T::Realified;
+  using DoublePrecision = typename T::DoublePrecision;
+  enum { TensorLevel = T::TensorLevel };
+};
+
+//////////////////////////////////////////////////////////////////////////////////
+// Recursion stops with these template specialisations
+//////////////////////////////////////////////////////////////////////////////////
+// RealF: scalar and vector type are both RealF; the double-precision
+// counterparts are RealD and the complexified type is ComplexF.
+template<> class GridTypeMapper<RealF> {
+public:
+  typedef RealF scalar_type;
+  typedef RealF vector_type;
+  typedef RealD vector_typeD;
+  typedef RealF tensor_reduced ;
+  typedef RealF scalar_object;
+  typedef ComplexF Complexified;
+  typedef RealF Realified;
+  typedef RealD DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+// RealD: already double precision, so every trait except Complexified
+// (ComplexD) maps back to RealD.
+template<> class GridTypeMapper<RealD> {
+public:
+  typedef RealD scalar_type;
+  typedef RealD vector_type;
+  typedef RealD vector_typeD;
+  typedef RealD tensor_reduced;
+  typedef RealD scalar_object;
+  typedef ComplexD Complexified;
+  typedef RealD Realified;
+  typedef RealD DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+// ComplexF: Realified is RealF; the double-precision counterparts are ComplexD.
+template<> class GridTypeMapper<ComplexF> {
+public:
+  typedef ComplexF scalar_type;
+  typedef ComplexF vector_type;
+  typedef ComplexD vector_typeD;
+  typedef ComplexF tensor_reduced;
+  typedef ComplexF scalar_object;
+  typedef ComplexF Complexified;
+  typedef RealF Realified;
+  typedef ComplexD DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+// ComplexD: already double precision; Realified is RealD.
+template<> class GridTypeMapper<ComplexD> {
+public:
+  typedef ComplexD scalar_type;
+  typedef ComplexD vector_type;
+  typedef ComplexD vector_typeD;
+  typedef ComplexD tensor_reduced;
+  typedef ComplexD scalar_object;
+  typedef ComplexD Complexified;
+  typedef RealD Realified;
+  typedef ComplexD DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+// Integer: there is no complex, real or double-precision counterpart, so
+// those three traits are void.
+template<> class GridTypeMapper<Integer> {
+public:
+  typedef Integer scalar_type;
+  typedef Integer vector_type;
+  typedef Integer vector_typeD;
+  typedef Integer tensor_reduced;
+  typedef Integer scalar_object;
+  typedef void Complexified;
+  typedef void Realified;
+  typedef void DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+
+// vRealF: SIMD type whose scalar_type / scalar_object is RealF; the
+// double-precision counterpart is vRealD and the complexified type is vComplexF.
+template<> class GridTypeMapper<vRealF> {
+public:
+  typedef RealF  scalar_type;
+  typedef vRealF vector_type;
+  typedef vRealD vector_typeD;
+  typedef vRealF tensor_reduced;
+  typedef RealF  scalar_object;
+  typedef vComplexF Complexified;
+  typedef vRealF Realified;
+  typedef vRealD DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+// vRealD: SIMD type whose scalar_type / scalar_object is RealD; already double precision.
+template<> class GridTypeMapper<vRealD> {
+public:
+  typedef RealD  scalar_type;
+  typedef vRealD vector_type;
+  typedef vRealD vector_typeD;
+  typedef vRealD tensor_reduced;
+  typedef RealD  scalar_object;
+  typedef vComplexD Complexified;
+  typedef vRealD Realified;
+  typedef vRealD DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+// vComplexH: note that scalar_type / scalar_object is ComplexF here, and the
+// Realified type is vRealH.
+template<> class GridTypeMapper<vComplexH> {
+public:
+  typedef ComplexF  scalar_type;
+  typedef vComplexH vector_type;
+  typedef vComplexD vector_typeD;
+  typedef vComplexH tensor_reduced;
+  typedef ComplexF  scalar_object;
+  typedef vComplexH Complexified;
+  typedef vRealH Realified;
+  typedef vComplexD DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+// vComplexF: SIMD type whose scalar_type / scalar_object is ComplexF;
+// Realified is vRealF and the double-precision counterpart is vComplexD.
+template<> class GridTypeMapper<vComplexF> {
+public:
+  typedef ComplexF  scalar_type;
+  typedef vComplexF vector_type;
+  typedef vComplexD vector_typeD;
+  typedef vComplexF tensor_reduced;
+  typedef ComplexF  scalar_object;
+  typedef vComplexF Complexified;
+  typedef vRealF Realified;
+  typedef vComplexD DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+// vComplexD: SIMD type whose scalar_type / scalar_object is ComplexD; already
+// double precision, Realified is vRealD.
+template<> class GridTypeMapper<vComplexD> {
+public:
+  typedef ComplexD  scalar_type;
+  typedef vComplexD vector_type;
+  typedef vComplexD vector_typeD;
+  typedef vComplexD tensor_reduced;
+  typedef ComplexD  scalar_object;
+  typedef vComplexD Complexified;
+  typedef vRealD Realified;
+  typedef vComplexD DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+// vInteger: SIMD type whose scalar_type / scalar_object is Integer; the
+// complex, real and double-precision traits are void.
+template<> class GridTypeMapper<vInteger> {
+public:
+  typedef  Integer scalar_type;
+  typedef vInteger vector_type;
+  typedef vInteger vector_typeD;
+  typedef vInteger tensor_reduced;
+  typedef  Integer scalar_object;
+  typedef void Complexified;
+  typedef void Realified;
+  typedef void DoublePrecision;
+  enum { TensorLevel = 0 };
+};
+
+// First some of my own traits
+// isGridTensor<T>: `value` is true for every type by default and `notvalue`
+// is its negation. The explicit specialisations below switch it to false for
+// the fundamental and SIMD types listed, so any type NOT listed here is
+// reported as a tensor.
+// NOTE(review): vComplexH has a GridTypeMapper specialisation but no
+// isGridTensor specialisation, so it falls through to the default (true) -
+// confirm whether that is intended.
+template<typename T> struct isGridTensor {
+  static const bool value = true;
+  static const bool notvalue = false;
+};
+template<> struct isGridTensor<int > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<RealD > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<RealF > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<ComplexD > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<ComplexF > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<Integer > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<vRealD > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<vRealF > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<vComplexD > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<vComplexF > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+template<> struct isGridTensor<vInteger > {
+  static const bool value = false;
+  static const bool notvalue = true;
+};
+
+// Match the index: `value` is true when Level equals T::TensorLevel and
+// `notvalue` is true when it does not. T must therefore expose a TensorLevel
+// constant.
+template<typename T,int Level> struct matchGridTensorIndex {
+  static const bool value = (Level==T::TensorLevel);
+  static const bool notvalue = (Level!=T::TensorLevel);
+};
+// What is the vtype: isComplex<T>::value is true only for the two types
+// specialised below (ComplexF and ComplexD) and false for every other T,
+// including - as written here - the SIMD complex types.
+template<typename T> struct isComplex {
+  static const bool value = false;
+};
+template<> struct isComplex<ComplexF> {
+  static const bool value = true;
+};
+template<> struct isComplex<ComplexD> {
+  static const bool value = true;
+};
+
+//Get the SIMD vector type from a Grid tensor or Lattice<Tensor>
+//This primary template is the identity mapping (type == T); the Lattice<Tensor>
+//case is presumably handled by a specialisation defined elsewhere - TODO confirm.
+template<typename T>
+struct getVectorType{
+  typedef T type;
+};
+  
+//Query if a tensor or Lattice<Tensor> is SIMD vector or scalar
+//Works by overload resolution on test<T>(0): the first overload (returning char)
+//only exists when GridTypeMapper's scalar_type and vector_type differ for the
+//underlying vector type; otherwise only the variadic fallback (returning double)
+//is viable. `value` compares the size of the selected return type with sizeof(char).
+template<typename T>
+class isSIMDvectorized{
+  template<typename U>
+  static typename std::enable_if< !std::is_same< typename GridTypeMapper<typename getVectorType<U>::type>::scalar_type,   
+						 typename GridTypeMapper<typename getVectorType<U>::type>::vector_type>::value, char>::type test(void *);
+
+  // Fallback chosen when the overload above is removed by enable_if.
+  template<typename U>
+  static double test(...);
+  
+public:
+  enum {value = sizeof(test<T>(0)) == sizeof(char) };
+};
+  
+//Get the precision of a Lattice, tensor or scalar type in units of sizeof(float):
+//value is sizeof(real_scalar_type)/sizeof(float).
+template<typename T>
+class getPrecision{
+public:
+  //get the vector_obj (i.e. a grid Tensor) if it's a Lattice<vobj>, do nothing otherwise (i.e. if fundamental or grid Tensor)
+  using vector_obj = typename getVectorType<T>::type;
+  //get the associated scalar type. Works on fundamental and tensor types
+  using scalar_type = typename GridTypeMapper<vector_obj>::scalar_type;
+  //remove any std::complex wrapper, should get us to the fundamental type
+  using real_scalar_type = typename GridTypeMapper<scalar_type>::Realified;
+
+  enum { value = sizeof(real_scalar_type)/sizeof(float) };
+};
+
+NAMESPACE_END(Grid);
+
+
+#endif
+
--- a/Grid/tensors/Tensor_transpose.h
+++ b/Grid/tensors/Tensor_transpose.h
@@ -0,0 +1,130 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_transpose.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_TRANSPOSE_H
+#define GRID_MATH_TRANSPOSE_H
+
+NAMESPACE_BEGIN(Grid);
+
+/////////////////////////////////////////////////////////////////
+// Transpose all indices. Plain real/complex scalars are returned unchanged (by value); they are
+// taken by const reference so that const objects and temporaries can be passed as well.
+/////////////////////////////////////////////////////////////////
+accelerator_inline ComplexD transpose(const ComplexD &rhs){  return rhs;}
+accelerator_inline ComplexF transpose(const ComplexF &rhs){  return rhs;}
+accelerator_inline RealD transpose(const RealD &rhs){  return rhs;}
+accelerator_inline RealF transpose(const RealF &rhs){  return rhs;}
+
+// Transpose of an iMatrix whose elements are Grid tensors (isGridTensor<vtype>::value):
+// entry (i,j) of the result is the transpose of entry (j,i) of the argument.
+template<class vtype,int N>
+accelerator_inline typename std::enable_if<isGridTensor<vtype>::value, iMatrix<vtype,N> >::type
+transpose(iMatrix<vtype,N> arg) {
+  iMatrix<vtype,N> flipped;
+  for(int row=0;row<N;row++)
+    for(int col=0;col<N;col++)
+      flipped._internal[row][col] = transpose(arg._internal[col][row]); // recurses into the element type
+  return flipped;
+}
+// Transpose of an iMatrix whose elements are not Grid tensors (isGridTensor<vtype>::notvalue):
+// entry (i,j) of the result is a plain copy of entry (j,i) of the argument.
+template<class vtype,int N>
+accelerator_inline typename std::enable_if<isGridTensor<vtype>::notvalue, iMatrix<vtype,N> >::type
+transpose(iMatrix<vtype,N> arg) {
+  iMatrix<vtype,N> flipped;
+  for(int row=0;row<N;row++)
+    for(int col=0;col<N;col++)
+      flipped._internal[row][col] = arg._internal[col][row]; // leaf element: copied as is, recursion stops
+  return flipped;
+}
+
+// Transpose of an iScalar wrapping a Grid tensor: the single _internal element is itself transposed.
+template<class vtype>
+accelerator_inline typename std::enable_if<isGridTensor<vtype>::value, iScalar<vtype> >::type
+transpose(iScalar<vtype> arg) {
+  iScalar<vtype> wrapped;
+  wrapped._internal = transpose(arg._internal); // recurses into the element type
+  return wrapped;
+}
+
+// Transpose of an iScalar wrapping a non-tensor element: the _internal element is copied unchanged.
+template<class vtype>
+accelerator_inline typename std::enable_if<isGridTensor<vtype>::notvalue, iScalar<vtype> >::type
+transpose(iScalar<vtype> arg) {
+  iScalar<vtype> wrapped;
+  wrapped._internal = arg._internal; // leaf element: recursion stops here
+  return wrapped;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////////////////
+// Transpose a specific index; instructive to compare this style of recursion termination
+// to that of adj; which is easier?
+////////////////////////////////////////////////////////////////////////////////////////////
+#if 0
+// (compiled out by the enclosing #if 0) iMatrix whose TensorLevel equals Level: swap its two indices here.
+template<int Level,class vtype,int N> accelerator_inline
+typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::value, iMatrix<vtype,N> >::type
+transposeIndex (const iMatrix<vtype,N> &arg)
+{
+  iMatrix<vtype,N> flipped;
+  for(int row=0;row<N;row++)
+    for(int col=0;col<N;col++)
+      flipped._internal[row][col] = arg._internal[col][row];
+  return flipped;
+}
+// (compiled out by the enclosing #if 0) iMatrix whose TensorLevel differs from Level: the matrix
+// layout is kept and transposeIndex<Level> is applied to each element in place.
+template<int Level,class vtype,int N> accelerator_inline
+typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue, iMatrix<vtype,N> >::type
+transposeIndex (const iMatrix<vtype,N> &arg)
+{
+  iMatrix<vtype,N> mapped;
+  for(int row=0;row<N;row++)
+    for(int col=0;col<N;col++)
+      mapped._internal[row][col] = transposeIndex<Level>(arg._internal[row][col]);
+  return mapped;
+}
+// (compiled out by the enclosing #if 0) iScalar whose TensorLevel differs from Level: forward to the wrapped element.
+template<int Level,class vtype> accelerator_inline
+typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue, iScalar<vtype> >::type
+transposeIndex (const iScalar<vtype> &arg) {
+  iScalar<vtype> wrapped;
+  wrapped._internal = transposeIndex<Level>(arg._internal);
+  return wrapped;
+}
+// (compiled out by the enclosing #if 0) iScalar whose TensorLevel equals Level: returned unchanged.
+template<int Level,class vtype> accelerator_inline
+typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::value, iScalar<vtype> >::type
+transposeIndex (const iScalar<vtype> &arg) {
+  return arg;
+}
+#endif
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensor_unary.h
+++ b/Grid/tensors/Tensor_unary.h
@@ -0,0 +1,155 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/tensors/Tensor_unary.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_TENSOR_UNARY_H
+#define GRID_TENSOR_UNARY_H
+
+NAMESPACE_BEGIN(Grid);
+
+// UNARY(func): define func for iScalar, iVector and iMatrix arguments. Each overload returns a
+// tensor of the same type whose _internal entries are func applied to the argument's entries.
+#define UNARY(func) \
+  template<class obj> accelerator_inline iScalar<obj> func(const iScalar<obj> &z) \
+  { \
+    iScalar<obj> out; \
+    out._internal = func(z._internal); \
+    return out; \
+  } \
+  template<class obj,int N> accelerator_inline iVector<obj,N> func(const iVector<obj,N> &z) \
+  { \
+    iVector<obj,N> out; \
+    for(int i=0;i<N;i++) \
+      out._internal[i] = func(z._internal[i]); \
+    return out; \
+  } \
+  template<class obj,int N> accelerator_inline iMatrix<obj,N> func(const iMatrix<obj,N> &z) \
+  { \
+    iMatrix<obj,N> out; \
+    for(int i=0;i<N;i++) \
+      for(int j=0;j<N;j++) \
+        out._internal[i][j] = func(z._internal[i][j]); \
+    return out; \
+  }
+
+
+// BINARY_RSCALAR(func,scal): define func(tensor, scal y) for iScalar, iVector and iMatrix.
+// Each overload calls func(element, y) on every _internal entry, passing y on unchanged.
+#define BINARY_RSCALAR(func,scal) \
+  template<class obj> accelerator_inline iScalar<obj> func(const iScalar<obj> &z,scal y) \
+  { \
+    iScalar<obj> out; \
+    out._internal = func(z._internal,y); \
+    return out; \
+  } \
+  template<class obj,int N> accelerator_inline iVector<obj,N> func(const iVector<obj,N> &z,scal y) \
+  { \
+    iVector<obj,N> out; \
+    for(int i=0;i<N;i++) \
+      out._internal[i] = func(z._internal[i],y); \
+    return out; \
+  } \
+  template<class obj,int N> accelerator_inline iMatrix<obj,N> func(const iMatrix<obj,N> &z, scal y) \
+  { \
+    iMatrix<obj,N> out; \
+    for(int i=0;i<N;i++) \
+      for(int j=0;j<N;j++) \
+        out._internal[i][j] = func(z._internal[i][j],y); \
+    return out; \
+  }
+
+UNARY(sqrt); // this and each UNARY(f) below defines f for iScalar, iVector and iMatrix by applying f to every _internal entry
+UNARY(rsqrt);
+UNARY(sin);
+UNARY(cos);
+UNARY(asin);
+UNARY(acos);
+UNARY(log);
+UNARY(exp);
+UNARY(abs);
+UNARY(Not);
+
+
+// toReal for iScalar: returns the iScalar's Realified type holding toReal of the wrapped element.
+template<class obj> accelerator_inline typename iScalar<obj>::Realified toReal(const iScalar<obj> &z) {
+  typename iScalar<obj>::Realified out;
+  out._internal = toReal(z._internal);
+  return out;
+}
+// toReal for iVector: applies toReal to each of the N entries; the result is the vector's Realified type.
+template<class obj,int N> accelerator_inline typename iVector<obj,N>::Realified toReal(const iVector<obj,N> &z)
+{
+  typename iVector<obj,N>::Realified out;
+  for(int i=0;i<N;i++)
+    out._internal[i] = toReal(z._internal[i]);
+  return out;
+}
+// toReal for iMatrix: applies toReal to each of the NxN entries; the result is the matrix's Realified type.
+template<class obj,int N> accelerator_inline typename iMatrix<obj,N>::Realified toReal(const iMatrix<obj,N> &z)
+{
+  typename iMatrix<obj,N>::Realified out;
+  for(int i=0;i<N;i++)
+    for(int j=0;j<N;j++)
+      out._internal[i][j] = toReal(z._internal[i][j]);
+  return out;
+}
+
+// toComplex for iScalar: returns the iScalar's Complexified type holding toComplex of the wrapped element.
+template<class obj> accelerator_inline typename iScalar<obj>::Complexified toComplex(const iScalar<obj> &z) {
+  typename iScalar<obj>::Complexified out;
+  out._internal = toComplex(z._internal);
+  return out;
+}
+// toComplex for iVector: applies toComplex to each of the N entries; the result is the vector's Complexified type.
+template<class obj,int N> accelerator_inline typename iVector<obj,N>::Complexified toComplex(const iVector<obj,N> &z)
+{
+  typename iVector<obj,N>::Complexified out;
+  for(int i=0;i<N;i++)
+    out._internal[i] = toComplex(z._internal[i]);
+  return out;
+}
+// toComplex for iMatrix: applies toComplex to each of the NxN entries; the result is the matrix's Complexified type.
+template<class obj,int N> accelerator_inline typename iMatrix<obj,N>::Complexified toComplex(const iMatrix<obj,N> &z)
+{
+  typename iMatrix<obj,N>::Complexified out;
+  for(int i=0;i<N;i++)
+    for(int j=0;j<N;j++)
+      out._internal[i][j] = toComplex(z._internal[i][j]);
+  return out;
+}
+
+BINARY_RSCALAR(div,Integer); // defines div(tensor, Integer y) for iScalar/iVector/iMatrix, applied entry by entry
+BINARY_RSCALAR(mod,Integer); // same pattern for mod with an Integer right-hand argument
+BINARY_RSCALAR(pow,RealD); // same pattern for pow with a RealD right-hand argument
+
+#undef UNARY // the generator macros are removed again so they are not visible after this point
+#undef BINARY_RSCALAR
+
+NAMESPACE_END(Grid);
+
+#endif
--- a/Grid/tensors/Tensors.h
+++ b/Grid/tensors/Tensors.h
@@ -0,0 +1,51 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Tensors.h
+
+    Copyright (C) 2015
+
+Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: neo <cossu@post.kek.jp>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+*************************************************************************************/
+/*  END LEGAL */
+#ifndef GRID_MATH_H
+#define GRID_MATH_H
+
+#include <Grid/tensors/Tensor_traits.h>
+#include <Grid/tensors/Tensor_class.h>
+#include <Grid/tensors/Tensor_arith.h>
+#include <Grid/tensors/Tensor_inner.h>
+#include <Grid/tensors/Tensor_outer.h>
+#include <Grid/tensors/Tensor_transpose.h>
+#include <Grid/tensors/Tensor_trace.h>
+#include <Grid/tensors/Tensor_index.h>
+#include <Grid/tensors/Tensor_Ta.h>
+#include <Grid/tensors/Tensor_determinant.h>
+#include <Grid/tensors/Tensor_exp.h>
+//#include <Grid/tensors/Tensor_peek.h>
+//#include <Grid/tensors/Tensor_poke.h>
+#include <Grid/tensors/Tensor_reality.h>
+#include <Grid/tensors/Tensor_unary.h>
+#include <Grid/tensors/Tensor_extract_merge.h>
+#include <Grid/tensors/Tensor_logical.h>
+
+#endif