Merge branch 'master' of https://github.com/paboyle/Grid

Conflicts: lib/tensors/Tensor_trace.h
2026-05-13 05:34:30 +01:00 · 2015-06-04 12:17:00 +01:00
parent 5aa8bf77db 7e47b0c6eb
commit 1e4eca8321
29 changed files with 2580 additions and 2351 deletions
@@ -5,6 +5,7 @@
 *.obj
 *~
 errs
+*#

 # Precompiled Headers
 *.gch
@@ -48,3 +49,33 @@ config.status
 /stamp-h1
 /config.sub
 /config.guess
+
+
+# Packages #
+############
+# it's better to unpack these files and commit the raw source
+# git has its own built in compression methods
+*.7z
+*.dmg
+*.gz
+*.iso
+*.jar
+*.rar
+*.tar
+*.zip
+ 
+# Logs and databases #
+######################
+*.log
+*.sql
+*.sqlite
+ 
+# OS generated files #
+######################
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
@@ -1 +1 @@
-/opt/local/share/automake-1.15/INSTALL
+/usr/share/automake-1.14/INSTALL
@@ -20,7 +20,7 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
 for most programmers.

 The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported.
+Presently SSE4 (128 bit), AVX and AVX2 (256 bit), and IMCI and AVX512 (512 bit) targets are supported.

 These are presented as 

@@ -46,3 +46,5 @@ are examples:
     ./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
     
     
+For developers:
+Use reconfigure_script in the scripts/ directory to create the autotools environment.
@@ -66,6 +66,9 @@ Insert/Extract

 * Support for ILDG

+* Support different boundary conditions (finite temp, chem. potential ... )
+
+* Support different fermion representations? 

 Actions -- coherent framework for implementing actions and their forces.

@@ -3,7 +3,7 @@
 #
 # Project Grid package  
 # 
-# Time-stamp: <2015-05-26 17:18:54 neo>
+# Time-stamp: <2015-05-27 18:51:47 neo>

 AC_PREREQ([2.63])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
@@ -27,7 +27,7 @@ AC_PROG_CXX
 AC_OPENMP
 AC_PROG_RANLIB
 AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
-
+AX_EXT

 # Checks for libraries.
 #AX_GCC_VAR_ATTRIBUTE(aligned)
@@ -69,26 +69,44 @@ Info at: http://www.mpfr.org/)])



-AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE|AVX|AVX2|AVX512|MIC],\
+AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|MIC],\
 	[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, MIC])],\
 	[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])

+supported=no
+
 case ${ac_SIMD} in
     SSE4)
       echo Configuring for SSE4
+       if test x"$ax_cv_support_ssse3_ext" = x"yes"; then  dnl minimal support for SSE4
       AC_DEFINE([SSE4],[1],[SSE4] )
+       supported=yes
+       else
+	AC_MSG_WARN([Your processor does not support SSE4 instructions])
+       fi
     ;;
     AVX)
       echo Configuring for AVX
+       if test x"$ax_cv_support_avx_ext" = x"yes"; then  dnl minimal support for AVX
       AC_DEFINE([AVX1],[1],[AVX] )
+       supported=yes			  
+       else
+       	AC_MSG_WARN([Your processor does not support AVX instructions])
+       fi
     ;;
     AVX2)
       echo Configuring for AVX2
+       if test x"$ax_cv_support_avx2_ext" = x"yes"; then  dnl minimal support for AVX2
       AC_DEFINE([AVX2],[1],[AVX2] )
+       supported=yes
+       else
+       AC_MSG_WARN([Your processor does not support AVX2 instructions])
+       fi
     ;;
     AVX512|MIC)
       echo Configuring for AVX512 and MIC
       AC_DEFINE([AVX512],[1],[AVX512] )
+       supported="cross compilation"
     ;;
     *)
     AC_MSG_ERROR([${ac_SIMD} unsupported --enable-simd option]); 
@@ -129,7 +147,9 @@ then
 AC_CONFIG_FILES([docs/doxy.cfg])
 fi

-
+echo
+echo Creating configuration files
+echo :::::::::::::::::::::::::::::::::::::::::::
 AC_CONFIG_FILES(Makefile)
 AC_CONFIG_FILES(lib/Makefile)
 AC_CONFIG_FILES(tests/Makefile)
@@ -150,9 +170,9 @@ The following features are enabled:
 - os (target)                   : $target_os
 - build DOXYGEN documentation   : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
 - graphs and diagrams           : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
-
+- Supported SIMD flags          : $SIMD_FLAGS
 ----------------------------------------------------------
- enabled simd support          : ${ac_SIMD}
+- enabled simd support          : ${ac_SIMD}   (supported: $supported )
 - communications type           : ${ac_COMMS}


@@ -1,5 +1,5 @@
-/* lib/Grid_config.h.  Generated from Grid_config.h.in by configure.  */
-/* lib/Grid_config.h.in.  Generated from configure.ac by autoheader.  */
+/* lib/GridConfig.h.  Generated from GridConfig.h.in by configure.  */
+/* lib/GridConfig.h.in.  Generated from configure.ac by autoheader.  */

 /* AVX */
 /* #undef AVX1 */
@@ -16,6 +16,15 @@
 /* GRID_COMMS_NONE */
 #define GRID_COMMS_NONE 1

+/* Support Altivec instructions */
+/* #undef HAVE_ALTIVEC */
+
+/* Support AVX (Advanced Vector Extensions) instructions */
+/* #undef HAVE_AVX */
+
+/* Support AVX2 (Advanced Vector Extensions 2) instructions */
+/* #undef HAVE_AVX2 */
+
 /* define if the compiler supports basic C++11 syntax */
 /* #undef HAVE_CXX11 */

@@ -30,6 +39,9 @@
 /* Define to 1 if you have the <endian.h> header file. */
 #define HAVE_ENDIAN_H 1

+/* Support FMA3 (Fused Multiply-Add) instructions */
+/* #undef HAVE_FMA */
+
 /* Define to 1 if you have the `gettimeofday' function. */
 #define HAVE_GETTIMEOFDAY 1

@@ -54,9 +66,30 @@
 /* Define to 1 if you have the <memory.h> header file. */
 #define HAVE_MEMORY_H 1

+/* Support mmx instructions */
+#define HAVE_MMX /**/
+
 /* Define to 1 if you have the <mm_malloc.h> header file. */
 #define HAVE_MM_MALLOC_H 1

+/* Support SSE (Streaming SIMD Extensions) instructions */
+#define HAVE_SSE /**/
+
+/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
+#define HAVE_SSE2 /**/
+
+/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
+#define HAVE_SSE3 /**/
+
+/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
+#define HAVE_SSE4_1 /**/
+
+/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
+#define HAVE_SSE4_2 /**/
+
+/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
+#define HAVE_SSSE3 /**/
+
 /* Define to 1 if you have the <stdint.h> header file. */
 #define HAVE_STDINT_H 1

@@ -15,6 +15,15 @@
 /* GRID_COMMS_NONE */
 #undef GRID_COMMS_NONE

+/* Support Altivec instructions */
+#undef HAVE_ALTIVEC
+
+/* Support AVX (Advanced Vector Extensions) instructions */
+#undef HAVE_AVX
+
+/* Support AVX2 (Advanced Vector Extensions 2) instructions */
+#undef HAVE_AVX2
+
 /* define if the compiler supports basic C++11 syntax */
 #undef HAVE_CXX11

@@ -29,6 +38,9 @@
 /* Define to 1 if you have the <endian.h> header file. */
 #undef HAVE_ENDIAN_H

+/* Support FMA3 (Fused Multiply-Add) instructions */
+#undef HAVE_FMA
+
 /* Define to 1 if you have the `gettimeofday' function. */
 #undef HAVE_GETTIMEOFDAY

@@ -53,9 +65,30 @@
 /* Define to 1 if you have the <memory.h> header file. */
 #undef HAVE_MEMORY_H

+/* Support mmx instructions */
+#undef HAVE_MMX
+
 /* Define to 1 if you have the <mm_malloc.h> header file. */
 #undef HAVE_MM_MALLOC_H

+/* Support SSE (Streaming SIMD Extensions) instructions */
+#undef HAVE_SSE
+
+/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
+#undef HAVE_SSE2
+
+/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
+#undef HAVE_SSE3
+
+/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
+#undef HAVE_SSE4_1
+
+/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
+#undef HAVE_SSE4_2
+
+/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
+#undef HAVE_SSSE3
+
 /* Define to 1 if you have the <stdint.h> header file. */
 #undef HAVE_STDINT_H

@@ -1,4 +1,4 @@

-HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/Dirac.h ./qcd/LinalgUtils.h ./qcd/QCD.h ./qcd/SpaceTimeGrid.h ./qcd/TwoSpinor.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h 
./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./Tensors.h ./Threads.h
+HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vRealF.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_poke.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_peek.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/SpaceTimeGrid.h ./qcd/LinalgUtils.h ./qcd/TwoSpinor.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h 
./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Threads.h ./Comparison.h ./Grid.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./stencil/Lebesgue.h

-CCFILES=./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/Dirac.cc ./qcd/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
+CCFILES=./qcd/SpaceTimeGrid.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/Dirac.cc ./GridInit.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
@@ -8,6 +8,7 @@
 #include <tensors/Tensor_outer.h>
 #include <tensors/Tensor_transpose.h>
 #include <tensors/Tensor_trace.h>
+#include <tensors/Tensor_Ta.h>
 #include <tensors/Tensor_peek.h>
 #include <tensors/Tensor_poke.h>
 #include <tensors/Tensor_reality.h>
@@ -48,5 +48,16 @@ PARALLEL_FOR_LOOP
    }


+    template<class vobj> inline auto Ta(const Lattice<vobj> &z) -> Lattice<decltype(Ta(z._odata[0]))>
+    { // Site-wise Ta: applies the tensor-level Ta() to every outer site of z; result is built on z's grid.
+      Lattice<decltype(Ta(z._odata[0]))> result(z._grid);
+PARALLEL_FOR_LOOP
+      for(int site=0; site<z._grid->oSites(); site++){
+        result._odata[site] = Ta(z._odata[site]);
+      }
+      return result;
+    }
+
+
 }
 #endif
@@ -4,7 +4,7 @@

  Using intrinsics
 */
-// Time-stamp: <2015-05-27 12:07:15 neo>
+// Time-stamp: <2015-05-29 14:13:30 neo>
 //----------------------------------------------------------------------

 #include <immintrin.h>
@@ -261,13 +261,7 @@ namespace Optimization {
    }
    // Complex double
    inline __m256d operator()(__m256d in){
-      return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));//untested
-      /*
-	// original 
-	//      addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
-	__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),_mm256_shuffle_pd(in,in,0x5));
-	return _mm256_shuffle_pd(tmp,tmp,0x5);
-      */
+      return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));
    }
    // do not define for integer input
  };
@@ -2,7 +2,7 @@
 /*! @file Grid_vector_types.h
  @brief Defines templated class Grid_simd to deal with inner vector types
 */
-// Time-stamp: <2015-05-27 12:04:06 neo>
+// Time-stamp: <2015-05-29 14:19:48 neo>
 //---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES
@@ -55,7 +55,6 @@ namespace Grid {
  // general forms to allow for vsplat syntax
  // need explicit declaration of types when used since
  // clang cannot automatically determine the output type sometimes
-  // use decltype?
  template < class Out, class Input1, class Input2, class Operation > 
    Out binary(Input1 src_1, Input2 src_2, Operation op){
    return op(src_1, src_2);
@@ -1 +1 @@
-timestamp for lib/Grid_config.h
+timestamp for lib/GridConfig.h
@@ -0,0 +1,43 @@
+#ifndef GRID_MATH_TA_H
+#define GRID_MATH_TA_H
+namespace Grid {
+
+  /////////////////////////////////////////////// 
+  // Ta overloads for scalar, vector, matrix. Base cases first: plain real/complex numbers are returned unchanged.
+  /////////////////////////////////////////////// 
+  inline ComplexF Ta( const ComplexF &arg){    return arg;}
+  inline ComplexD Ta( const ComplexD &arg){    return arg;}
+  inline RealF Ta( const RealF &arg){    return arg;}
+  inline RealD Ta( const RealD &arg){    return arg;}
+
+
+  template<class vtype> inline iScalar<vtype> Ta(const iScalar<vtype>&r)
+    { // Ta of a scalar wrapper: forward to Ta() of the wrapped value and re-wrap the result.
+      iScalar<vtype> wrapped;
+      wrapped._internal = Ta(r._internal);
+      return wrapped;
+    }
+  template<class vtype,int N> inline iVector<vtype,N> Ta(const iVector<vtype,N>&r)
+    { // Ta of a vector: apply Ta() independently to each of the N components.
+      iVector<vtype,N> out;
+      for(int n=0; n<N; n++){
+        out._internal[n] = Ta(r._internal[n]);
+      }
+      return out;
+    }
+  template<class vtype,int N> inline iMatrix<vtype,N> Ta(const iMatrix<vtype,N> &arg)
+    { // Returns 0.5*(arg[c1][c2] - adj(arg[c2][c1])) with trace/N subtracted. NOTE(review): presumably the traceless anti-Hermitian part - confirm.
+      iMatrix<vtype,N> ret(arg); // start from a copy of arg; each entry is overwritten exactly once in the loop
+      double factor = (1/(double)N); // 1/N, scales the trace subtracted at the end
+      for(int c1=0;c1<N;c1++){
+	for(int c2=0;c2<N;c2++){
+	  ret._internal[c1][c2]= (ret._internal[c1][c2] - adj(arg._internal[c2][c1])); // right-hand side reads the untouched arg, so update order does not matter
+	  ret._internal[c1][c2] *= 0.5;
+	}}
+      // The loop above is the element-wise spelling of (assumed equivalent, TODO confirm): ret = (ret - adj(arg))*0.5;
+      ret -= trace(ret)*factor; // subtract trace(ret)/N; presumably applied through the matrix-minus-scalar rule - verify
+      return ret;
+    }
+
+}
+#endif
@@ -45,7 +45,10 @@ namespace Grid {
  {
    for(int c2=0;c2<N;c2++){
      for(int c1=0;c1<N;c1++){
-        add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+	if ( c1==c2) // diagonal: scalar (lhs) plus the matrix element (rhs)
+	  add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
+	else // off-diagonal: the scalar contributes nothing, so copy the matrix operand rhs (lhs is the un-indexed scalar)
+	  ret->_internal[c1][c2]=rhs->_internal[c1][c2];
      }}
    return;
  }
@@ -44,7 +44,7 @@ template<class vtype,class ltype,class rtype, int N> strong_inline void sub(iMat
                                                                     const iMatrix<rtype,N> * __restrict__ rhs){
    for(int c2=0;c2<N;c2++){
    for(int c1=0;c1<N;c1++){
-        if ( c1!=c2) {
+        if ( c1==c2) {
            sub(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
        } else {
            // Fails -- need unary minus. Catalogue other unops?
@@ -60,7 +60,7 @@ template<class vtype,class ltype,class rtype, int N> strong_inline void sub(iMat
                                                                     const iScalar<rtype> * __restrict__ rhs){
    for(int c2=0;c2<N;c2++){
    for(int c1=0;c1<N;c1++){
-        if ( c1!=c2)
+        if ( c1==c2)
            sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
        else
            ret->_internal[c1][c2]=lhs->_internal[c1][c2];
@@ -2,10 +2,6 @@
 #define GRID_MATH_REALITY_H
 namespace Grid {

-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-    /////////////////////////////////////////// CONJ         ///////////////////////////////////////////
-    ///////////////////////////////////////////////////////////////////////////////////////////////////
-
 /////////////////////////////////////////////// 
 // multiply by I; make recursive.
 /////////////////////////////////////////////// 
@@ -151,6 +147,9 @@ template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &



+
+
+
 /////////////////////////////////////////////////////////////////
 // Can only take the real/imag part of scalar objects, since
 // lattice objects of different complex nature are non-conformable.
@@ -75,7 +75,7 @@ auto traceIndex(const iMatrix<vtype,N> &arg) ->  iMatrix<decltype(traceIndex<Lev
 // Allow to recurse if vector, but never terminate on a vector
 // trace of a different index can distribute across the vector index in a replicated way
 // but we do not trace a vector index.
-template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline 
+ template<int Level,class vtype,int N,typename std::enable_if< iVector<vtype, N>::TensorLevel != Level >::type * =nullptr> inline 
 auto traceIndex(const iVector<vtype,N> &arg) ->  iVector<decltype(traceIndex<Level>(arg._internal[0])),N> 
 {
  iVector<decltype(traceIndex<Level>(arg._internal[0])),N> ret;
@@ -0,0 +1,72 @@
+# ===========================================================================
+#   http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS])
+#
+# DESCRIPTION
+#
+#   Check whether the given FLAG works with the current language's compiler
+#   or gives an error.  (Warnings, however, are ignored)
+#
+#   ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
+#   success/failure.
+#
+#   If EXTRA-FLAGS is defined, it is added to the current language's default
+#   flags (e.g. CFLAGS) when the check is done.  The check is thus made with
+#   the flags: "CFLAGS EXTRA-FLAGS FLAG".  This can for example be used to
+#   force the compiler to issue an error when a bad flag is given.
+#
+#   NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
+#   macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
+#   Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 2
+
+AC_DEFUN([AX_CHECK_COMPILE_FLAG],
+[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX
+AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl cache variable name is built from the language abbreviation, the extra flags and the flag under test
+AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
+  ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
+  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
+    [AS_VAR_SET(CACHEVAR,[yes])],
+    [AS_VAR_SET(CACHEVAR,[no])])
+  _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
+AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
+  [m4_default([$2], :)],
+  [m4_default([$3], :)])
+AS_VAR_POPDEF([CACHEVAR])dnl release the CACHEVAR alias pushed above
+])dnl AX_CHECK_COMPILE_FLAG
@@ -0,0 +1,288 @@
+# ===========================================================================
+#          http://www.gnu.org/software/autoconf-archive/ax_ext.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_EXT
+#
+# DESCRIPTION
+#
+#   Find supported SIMD extensions by requesting cpuid. When an SIMD
+#   extension is found, the -m"simdextensionname" is added to SIMD_FLAGS if
+#   compiler supports it. For example, if "sse2" is available, then "-msse2"
+#   is added to SIMD_FLAGS.
+#
+#   This macro calls:
+#
+#     AC_SUBST(SIMD_FLAGS)
+#
+#   And defines:
+#
+#     HAVE_ALTIVEC / HAVE_MMX / HAVE_SSE / HAVE_SSE2 / HAVE_SSE3 / HAVE_SSSE3 / HAVE_SSE4_1 / HAVE_SSE4_2 / HAVE_AVX / HAVE_AVX2 / HAVE_FMA
+#
+# LICENSE
+#
+#   Copyright (c) 2007 Christophe Tournayre <turn3r@users.sourceforge.net>
+#   Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 13
+
+AC_DEFUN([AX_EXT],
+[
+  AC_REQUIRE([AC_CANONICAL_HOST])
+
+  case $host_cpu in
+    powerpc*)
+      AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext],
+          [
+            if test `/usr/sbin/sysctl -a 2>/dev/null| grep -c hw.optional.altivec` != 0; then
+                if test `/usr/sbin/sysctl -n hw.optional.altivec` = 1; then
+                  ax_cv_have_altivec_ext=yes
+                fi
+            fi
+          ])
+
+          if test "$ax_cv_have_altivec_ext" = yes; then
+            AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions])
+            AX_CHECK_COMPILE_FLAG(-faltivec, [SIMD_FLAGS="$SIMD_FLAGS -faltivec"], [])
+          fi
+    ;;
+
+
+    i[[3456]]86*|x86_64*|amd64*)
+
+      AC_REQUIRE([AX_GCC_X86_CPUID])
+      AC_REQUIRE([AX_GCC_X86_AVX_XGETBV])
+
+      AX_GCC_X86_CPUID(0x00000001)
+      ecx=0
+      edx=0
+      ebx=0
+      if test "$ax_cv_gcc_x86_cpuid_0x00000001" != "unknown";
+      then
+        ecx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3`
+        edx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4`
+      fi
+
+      AX_GCC_X86_CPUID(0x00000007)
+      if test "$ax_cv_gcc_x86_cpuid_0x00000007" != "unknown";
+      then
+        ebx=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 2`
+      fi
+
+      AC_CACHE_CHECK([whether mmx is supported], [ax_cv_have_mmx_ext],
+      [
+        ax_cv_have_mmx_ext=no
+        if test "$((0x$edx>>23&0x01))" = 1; then
+          ax_cv_have_mmx_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse is supported], [ax_cv_have_sse_ext],
+      [
+        ax_cv_have_sse_ext=no
+        if test "$((0x$edx>>25&0x01))" = 1; then
+          ax_cv_have_sse_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse2 is supported], [ax_cv_have_sse2_ext],
+      [
+        ax_cv_have_sse2_ext=no
+        if test "$((0x$edx>>26&0x01))" = 1; then
+          ax_cv_have_sse2_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse3 is supported], [ax_cv_have_sse3_ext],
+      [
+        ax_cv_have_sse3_ext=no
+        if test "$((0x$ecx&0x01))" = 1; then
+          ax_cv_have_sse3_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether ssse3 is supported], [ax_cv_have_ssse3_ext],
+      [
+        ax_cv_have_ssse3_ext=no
+        if test "$((0x$ecx>>9&0x01))" = 1; then
+          ax_cv_have_ssse3_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse4.1 is supported], [ax_cv_have_sse41_ext],
+      [
+        ax_cv_have_sse41_ext=no
+        if test "$((0x$ecx>>19&0x01))" = 1; then
+          ax_cv_have_sse41_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether sse4.2 is supported], [ax_cv_have_sse42_ext],
+      [
+        ax_cv_have_sse42_ext=no
+        if test "$((0x$ecx>>20&0x01))" = 1; then
+          ax_cv_have_sse42_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether avx is supported by processor], [ax_cv_have_avx_cpu_ext],
+      [
+        ax_cv_have_avx_cpu_ext=no
+        if test "$((0x$ecx>>28&0x01))" = 1; then
+          ax_cv_have_avx_cpu_ext=yes
+        fi
+      ])
+
+      AC_CACHE_CHECK([whether avx2 is supported by processor], [ax_cv_have_avx2_cpu_ext],
+      [
+        ax_cv_have_avx2_cpu_ext=no
+        if test "$((0x$ebx>>5&0x01))" = 1; then
+          ax_cv_have_avx2_cpu_ext=yes
+        fi
+      ])
+
+
+      AC_CACHE_CHECK([whether fma is supported by processor], [ax_cv_have_fma_cpu_ext],
+      [
+        ax_cv_have_fma_cpu_ext=no
+        if test "$((0x$ecx>>12&0x01))" = 1; then
+          ax_cv_have_fma_cpu_ext=yes
+        fi
+      ])
+
+
+      if test x"$ax_cv_have_avx_cpu_ext" = x"yes"; then
+        AX_GCC_X86_AVX_XGETBV(0x00000000)
+
+        xgetbv_eax="0"
+        if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then
+          xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1`
+        fi
+
+        AC_CACHE_CHECK([whether avx is supported by operating system], [ax_cv_have_avx_ext],
+        [
+          ax_cv_have_avx_ext=no
+
+          if test "$((0x$ecx>>27&0x01))" = 1; then
+            if test "$((0x$xgetbv_eax&0x6))" = 6; then
+              ax_cv_have_avx_ext=yes
+            fi
+          fi
+        ])
+        if test x"$ax_cv_have_avx_ext" = x"no"; then
+          AC_MSG_WARN([Your processor supports AVX, but your operating system doesn't])
+        fi
+      fi
+
+      if test "$ax_cv_have_mmx_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mmmx, ax_cv_support_mmx_ext=yes, [])
+        if test x"$ax_cv_support_mmx_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -mmmx"
+          AC_DEFINE(HAVE_MMX,,[Support mmx instructions])
+        else
+          AC_MSG_WARN([Your processor supports mmx instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse, ax_cv_support_sse_ext=yes, [])
+        if test x"$ax_cv_support_sse_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse"
+          AC_DEFINE(HAVE_SSE,,[Support SSE (Streaming SIMD Extensions) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse2_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse2, ax_cv_support_sse2_ext=yes, [])
+        if test x"$ax_cv_support_sse2_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse2"
+          AC_DEFINE(HAVE_SSE2,,[Support SSE2 (Streaming SIMD Extensions 2) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse2 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse3_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse3, ax_cv_support_sse3_ext=yes, [])
+        if test x"$ax_cv_support_sse3_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse3"
+          AC_DEFINE(HAVE_SSE3,,[Support SSE3 (Streaming SIMD Extensions 3) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse3 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_ssse3_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mssse3, ax_cv_support_ssse3_ext=yes, [])
+        if test x"$ax_cv_support_ssse3_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -mssse3"
+          AC_DEFINE(HAVE_SSSE3,,[Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions])
+        else
+          AC_MSG_WARN([Your processor supports ssse3 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse41_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse4.1, ax_cv_support_sse41_ext=yes, [])
+        if test x"$ax_cv_support_sse41_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse4.1"
+          AC_DEFINE(HAVE_SSE4_1,,[Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse4.1 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_sse42_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-msse4.2, ax_cv_support_sse42_ext=yes, [])
+        if test x"$ax_cv_support_sse42_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -msse4.2"
+          AC_DEFINE(HAVE_SSE4_2,,[Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions])
+        else
+          AC_MSG_WARN([Your processor supports sse4.2 instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+      if test "$ax_cv_have_avx_ext" = yes; then
+        AX_CHECK_COMPILE_FLAG(-mavx, ax_cv_support_avx_ext=yes, [])
+        if test x"$ax_cv_support_avx_ext" = x"yes"; then
+          SIMD_FLAGS="$SIMD_FLAGS -mavx"
+          AC_DEFINE(HAVE_AVX,,[Support AVX (Advanced Vector Extensions) instructions])
+        else
+          AC_MSG_WARN([Your processor supports avx instructions but not your compiler, can you try another compiler?])
+        fi
+      fi
+
+       if test "$ax_cv_have_avx2_cpu_ext" = yes && test "$ax_cv_have_avx_ext" = yes; then # CPU reports AVX2 and the OS AVX check passed
+         AX_CHECK_COMPILE_FLAG(-mavx2, ax_cv_support_avx2_ext=yes, [])
+         if test x"$ax_cv_support_avx2_ext" = x"yes"; then
+           SIMD_FLAGS="$SIMD_FLAGS -mavx2"
+           AC_DEFINE(HAVE_AVX2,,[Support AVX2 (Advanced Vector Extensions 2) instructions])
+         else
+           AC_MSG_WARN([Your processor supports avx2 instructions but not your compiler, can you try another compiler?])
+         fi
+       fi
+       # FMA3 is gated the same way as AVX2: the CPU feature bit plus the operating system AVX support probed earlier.
+       if test "$ax_cv_have_fma_cpu_ext" = yes && test "$ax_cv_have_avx_ext" = yes; then
+         AX_CHECK_COMPILE_FLAG(-mfma, ax_cv_support_fma_ext=yes, [])
+         if test x"$ax_cv_support_fma_ext" = x"yes"; then
+           SIMD_FLAGS="$SIMD_FLAGS -mfma"
+           AC_DEFINE(HAVE_FMA,,[Support FMA3 (Fused Multiply-Add) instructions])
+         else
+           AC_MSG_WARN([Your processor supports fma instructions but not your compiler, can you try another compiler?])
+         fi
+       fi
+
+  ;;
+  esac
+
+  AC_SUBST(SIMD_FLAGS)
+])
@@ -0,0 +1,79 @@
+# ===========================================================================
+#   http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_avx_xgetbv.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_GCC_X86_AVX_XGETBV
+#
+# DESCRIPTION
+#
+#   On later x86 processors with AVX SIMD support, with gcc or a compiler
+#   that has a compatible syntax for inline assembly instructions, run a
+#   small program that executes the xgetbv instruction with input OP. This
+#   can be used to detect if the OS supports AVX instruction usage.
+#
+#   On output, the values of the eax and edx registers are stored as
+#   hexadecimal strings as "eax:edx" in the cache variable
+#   ax_cv_gcc_x86_avx_xgetbv_OP.
+#
+#   If the xgetbv instruction fails (because you are running a
+#   cross-compiler, or because you are not using gcc, or because you are on
+#   a processor that doesn't have this instruction),
+#   ax_cv_gcc_x86_avx_xgetbv_OP is set to the string "unknown".
+#
+#   This macro mainly exists to be used in AX_EXT.
+#
+# LICENSE
+#
+#   Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com>
+#
+#   This program is free software: you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation, either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 1
+
+# AX_GCC_X86_AVX_XGETBV(OP)
+# -------------------------
+# Build and run a small C program that executes xgetbv with ecx = OP and
+# cache the resulting "eax:edx" hexadecimal string in
+# ax_cv_gcc_x86_avx_xgetbv_OP.  The cache variable is set to "unknown" when
+# the program cannot be compiled or run, and when cross-compiling.
+AC_DEFUN([AX_GCC_X86_AVX_XGETBV],
+[AC_REQUIRE([AC_PROG_CC])
+AC_LANG_PUSH([C])
+AC_CACHE_CHECK(for x86-AVX xgetbv $1 output, ax_cv_gcc_x86_avx_xgetbv_$1,
+ [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
+     int op = $1, eax, edx;
+     FILE *f;
+      /* Opcodes for xgetbv: input in ecx, result in eax and edx. */
+      __asm__(".byte 0x0f, 0x01, 0xd0"
+        : "=a" (eax), "=d" (edx)
+        : "c" (op));
+     /* Hand the register values back to configure through a scratch file. */
+     f = fopen("conftest_xgetbv", "w"); if (!f) return 1;
+     fprintf(f, "%x:%x\n", eax, edx);
+     fclose(f);
+     return 0;
+])],
+     [ax_cv_gcc_x86_avx_xgetbv_$1=`cat conftest_xgetbv`; rm -f conftest_xgetbv],
+     [ax_cv_gcc_x86_avx_xgetbv_$1=unknown; rm -f conftest_xgetbv],
+     [ax_cv_gcc_x86_avx_xgetbv_$1=unknown])])
+AC_LANG_POP([C])
+])
@@ -0,0 +1,45 @@
+dnl @synopsis AX_GCC_X86_CPUID(OP)
+dnl
+dnl @summary run x86 cpuid instruction OP using gcc inline assembler
+dnl
+dnl On Pentium and later x86 processors, with gcc or a compiler that
+dnl has a compatible syntax for inline assembly instructions, run a
+dnl small program that executes the cpuid instruction with input OP.
+dnl This can be used to detect the CPU type.
+dnl
+dnl On output, the values of the eax, ebx, ecx, and edx registers are
+dnl stored as hexadecimal strings as "eax:ebx:ecx:edx" in the cache
+dnl variable ax_cv_gcc_x86_cpuid_OP.
+dnl
+dnl If the cpuid instruction fails (because you are running a
+dnl cross-compiler, or because you are not using gcc, or because you
+dnl are on a processor that doesn't have this instruction),
+dnl ax_cv_gcc_x86_cpuid_OP is set to the string "unknown".
+dnl
+dnl This macro mainly exists to be used in AX_GCC_ARCHFLAG.
+dnl
+dnl @category Misc
+dnl @author Steven G. Johnson <stevenj@alum.mit.edu> and Matteo Frigo.
+dnl @version 2005-05-30
+dnl @license GPLWithACException
+
+dnl AX_GCC_X86_CPUID(OP): build and run a small C program that executes
+dnl cpuid with eax = OP and ecx = 0, and cache the "eax:ebx:ecx:edx"
+dnl hexadecimal string in ax_cv_gcc_x86_cpuid_OP.  The cache variable is set
+dnl to "unknown" when the program cannot be compiled or run, and when
+dnl cross-compiling.
+AC_DEFUN([AX_GCC_X86_CPUID],
+[AC_REQUIRE([AC_PROG_CC])
+AC_LANG_PUSH([C])
+AC_CACHE_CHECK(for x86 cpuid $1 output, ax_cv_gcc_x86_cpuid_$1,
+ [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
+     int op = $1, eax, ebx, ecx, edx;
+     FILE *f;
+      /* Some cpuid leaves take a sub-leaf index in ecx, so load it with
+         zero rather than leaving the register contents undefined. */
+      __asm__("cpuid"
+        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+        : "a" (op), "c" (0));
+     /* Hand the register values back to configure through a scratch file. */
+     f = fopen("conftest_cpuid", "w"); if (!f) return 1;
+     fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx);
+     fclose(f);
+     return 0;
+])],
+     [ax_cv_gcc_x86_cpuid_$1=`cat conftest_cpuid`; rm -f conftest_cpuid],
+     [ax_cv_gcc_x86_cpuid_$1=unknown; rm -f conftest_cpuid],
+     [ax_cv_gcc_x86_cpuid_$1=unknown])])
+AC_LANG_POP([C])
+])
@@ -54,3 +54,5 @@ echo ${BNAME}_SOURCES=$f  >> Make.inc
 echo ${BNAME}_LDADD=-lGrid>> Make.inc
 echo >> Make.inc
 done
+
+cd ..
@@ -56,6 +56,7 @@ int main (int argc, char ** argv)
    GridCartesian           Fine(latt_size,simd_layout,mpi_layout);
    GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
    GridParallelRNG       FineRNG(&Fine);
+    GridSerialRNG       SerialRNG;
    FineRNG.SeedRandomDevice();

    LatticeColourMatrix Foo(&Fine);
@@ -83,6 +84,9 @@ int main (int argc, char ** argv)
    LatticeSpinMatrix       sMat(&Fine);
    LatticeSpinColourMatrix scMat(&Fine);
    
+    LatticeLorentzColourMatrix lcMat(&Fine);
+
+
    LatticeComplex scalar(&Fine);
    LatticeReal    rscalar(&Fine);
    LatticeReal    iscalar(&Fine);
@@ -99,12 +103,15 @@ int main (int argc, char ** argv)
    random(FineRNG,cMat);
    random(FineRNG,sMat);
    random(FineRNG,scMat);
+    random(FineRNG,lcMat);
    random(FineRNG,cVec);
    random(FineRNG,sVec);
    random(FineRNG,scVec);

+
    fflush(stdout);
    
+    TComplex tr = trace(cmat);

     
    cVec = cMat * cVec;  // LatticeColourVector     = LatticeColourMatrix     * LatticeColourVector
@@ -116,7 +123,9 @@ int main (int argc, char ** argv)
    cMat = outerProduct(cVec,cVec);
    scalar = localInnerProduct(cVec,cVec);

- 
+    cMat = Ta(cMat);  //traceless antihermitian
+
+
    scalar += scalar;
    scalar -= scalar;
    scalar *= scalar;
@@ -206,7 +215,13 @@ int main (int argc, char ** argv)
    scm=transpose(scm);
    scm=transposeIndex<1>(scm);
    
+   
+    //random(SerialRNG, cm);
+    //std::cout << cm << std::endl;

+    cm = Ta(cm);
+    //TComplex tracecm= trace(cm);      
+    //std::cout << cm << "  "<< tracecm << std::endl;


 //    Foo = Foo+scalar; // LatticeColourMatrix+Scalar
@@ -219,6 +234,10 @@ int main (int argc, char ** argv)
    LatticeComplex trscMat(&Fine);
    trscMat = trace(scMat); // Trace

+    // LatticeComplex trlcMat(&Fine);
+    // trlcMat = trace(lcMat); // Trace involving iVector - now generates error
+    
+
    { // Peek-ology and Poke-ology, with a little app-ology
      TComplex      c;
      ColourMatrix c_m;