From b8eef54fa743476693fd1a3056caebe4063f0d4a Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 24 Apr 2015 18:20:03 +0100
Subject: [PATCH] First implementation of Dirac matrices as a Gamma class.

---
 lib/Grid_simd.h                               |   7 +-
 lib/math/Grid_math_arith_add.h                |   7 -
 lib/math/Grid_math_reality.h                  |  59 ++-
 lib/math/Grid_math_tensors.h                  |  23 +
 lib/qcd/Grid_qcd.h                            |   3 +
 lib/qcd/Grid_qcd_dirac.h                      | 393 ++++++++++++++++++
 lib/simd/Grid_vComplexD.h                     |  66 ++-
 lib/simd/Grid_vComplexF.h                     |  45 +-
 tests/Grid_gamma.cc                           |  95 +++++
 .../{test_Grid_stencil.cc => Grid_stencil.cc} |   0
 tests/Makefile.am                             |   9 +-
 11 files changed, 683 insertions(+), 24 deletions(-)
 create mode 100644 lib/qcd/Grid_qcd_dirac.h
 create mode 100644 tests/Grid_gamma.cc
 rename tests/{test_Grid_stencil.cc => Grid_stencil.cc} (100%)

diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h
index 17755fe0..d8788e33 100644
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@@ -61,7 +61,12 @@ namespace Grid {
     inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
     inline ComplexF adj(const ComplexF& r ){ return(conj(r)); }
     //conj already supported for complex
-
+
+    inline ComplexF timesI(const ComplexF r)     { return(r*ComplexF(0.0,1.0));}
+    inline ComplexF timesMinusI(const ComplexF r){ return(r*ComplexF(0.0,-1.0));}
+    inline ComplexD timesI(const ComplexD r)     { return(r*ComplexD(0.0,1.0));}
+    inline ComplexD timesMinusI(const ComplexD r){ return(r*ComplexD(0.0,-1.0));}
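+    // Worked example (sketch): for z = a+ib these helpers encode
+    //   timesI(z)      = -b+ia,  e.g. timesI(ComplexF(1.0,2.0))      == ComplexF(-2.0,1.0)
+    //   timesMinusI(z) =  b-ia,  e.g. timesMinusI(ComplexF(1.0,2.0)) == ComplexF(2.0,-1.0)
+    // The vComplexF/vComplexD overloads added below implement the same identity lane-wise.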
+
     inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
     inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
     inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
diff --git a/lib/math/Grid_math_arith_add.h b/lib/math/Grid_math_arith_add.h
index 7e62e07b..93449402 100644
--- a/lib/math/Grid_math_arith_add.h
+++ b/lib/math/Grid_math_arith_add.h
@@ -63,13 +63,6 @@ namespace Grid {
         return;
     }
 
-    // Need to figure multi-precision.
-    template<class Mytype>  Mytype timesI(Mytype &r)
-    {
-      iScalar<Complex> i;
-      i._internal = Complex(0,1);
-      return r*i;
-    }
 
     // + operator for scalar, vector, matrix
     template<class ltype,class rtype>
diff --git a/lib/math/Grid_math_reality.h b/lib/math/Grid_math_reality.h
index 1f676c11..1bf34f2a 100644
--- a/lib/math/Grid_math_reality.h
+++ b/lib/math/Grid_math_reality.h
@@ -5,8 +5,63 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////  CONJ  ///////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////
-
+
+#ifdef GRID_WARN_SUBOPTIMAL
+#warning "Optimisation alert: switch to the two-argument form to avoid the copy back in performance-critical timesI"
+#endif
+///////////////////////////////////////////////
+// multiply by I; make recursive.
+///////////////////////////////////////////////
+template<class vtype> inline iScalar<vtype> timesI(const iScalar<vtype>&r)
+{
+    iScalar<vtype> ret;
+    ret._internal = timesI(r._internal);
+    return ret;
+}
+template<class vtype,int N> inline iVector<vtype,N> timesI(const iVector<vtype,N>&r)
+{
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++){
+        ret._internal[i] = timesI(r._internal[i]);
+    }
+    return ret;
+}
+template<class vtype,int N> inline iMatrix<vtype,N> timesI(const iMatrix<vtype,N>&r)
+{
+    iMatrix<vtype,N> ret;
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+        ret._internal[i][j] = timesI(r._internal[i][j]);
+    }}
+    return ret;
+}
+
+template<class vtype> inline iScalar<vtype> timesMinusI(const iScalar<vtype>&r)
+{
+    iScalar<vtype> ret;
+    ret._internal = timesMinusI(r._internal);
+    return ret;
+}
+template<class vtype,int N> inline iVector<vtype,N> timesMinusI(const iVector<vtype,N>&r)
+{
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++){
+        ret._internal[i] = timesMinusI(r._internal[i]);
+    }
+    return ret;
+}
+template<class vtype,int N> inline iMatrix<vtype,N> timesMinusI(const iMatrix<vtype,N>&r)
+{
+    iMatrix<vtype,N> ret;
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+        ret._internal[i][j] = timesMinusI(r._internal[i][j]);
+    }}
+    return ret;
+}
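+// Example (sketch): the recursion above bottoms out at the scalar overloads
+// added to Grid_simd.h in this patch, so for a nested type such as
+//   iVector<iScalar<ComplexF>,4> v;
+// timesI(v) peels off the iVector and iScalar layers and finally applies
+// timesI(const ComplexF) to each element.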
+
 template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
 {
     iScalar<vtype> ret;
@@ -31,7 +86,9 @@ template<class vtype,int N> inline iMatrix<vtype,N> conj(const iMatrix<vtype,N>&
     return ret;
 }
 
+///////////////////////////////////////////////
 // Adj function for scalar, vector, matrix
+///////////////////////////////////////////////
 template<class vtype> inline iScalar<vtype> adj(const iScalar<vtype>&r)
 {
     iScalar<vtype> ret;
diff --git a/lib/math/Grid_math_tensors.h b/lib/math/Grid_math_tensors.h
index 379e61d2..8bca6805 100644
--- a/lib/math/Grid_math_tensors.h
+++ b/lib/math/Grid_math_tensors.h
@@ -72,6 +72,13 @@ public:
     return _internal;
   }
 
+  inline const vtype & operator ()(void) const {
+    return _internal;
+  }
+  //  inline vtype && operator ()(void) {
+  //    return _internal;
+  //  }
+
   operator ComplexD () const { return(TensorRemove(_internal)); };
   operator RealD () const { return(real(TensorRemove(_internal))); }
@@ -156,6 +163,12 @@ public:
   inline vtype & operator ()(int i) {
     return _internal[i];
   }
+  inline const vtype & operator ()(int i) const {
+    return _internal[i];
+  }
+  //  inline vtype && operator ()(int i) {
+  //    return _internal[i];
+  //  }
 };
 
 template<class vtype,int N> class iMatrix
@@ -235,12 +248,22 @@ public:
     *this = (*this)+r;
     return *this;
   }
+
+  // returns an lvalue reference
   inline vtype & operator ()(int i,int j) {
     return _internal[i][j];
   }
+  inline const vtype & operator ()(int i,int j) const {
+    return _internal[i][j];
+  }
+  //  inline vtype && operator ()(int i,int j) {
+  //    return _internal[i][j];
+  //  }
 };
+
 template<class vobj> inline void extract(const vobj &vec,std::vector<typename vobj::scalar_type *> &extracted)
 {
diff --git a/lib/qcd/Grid_qcd.h b/lib/qcd/Grid_qcd.h
index b30b7a67..a5c4320e 100644
--- a/lib/qcd/Grid_qcd.h
+++ b/lib/qcd/Grid_qcd.h
@@ -143,4 +143,7 @@ namespace QCD {
 }   //namespace QCD
 } // Grid
+
+#include <qcd/Grid_qcd_dirac.h>
+
 #endif
diff --git a/lib/qcd/Grid_qcd_dirac.h b/lib/qcd/Grid_qcd_dirac.h
new file mode 100644
index 00000000..86e3870d
--- /dev/null
+++ b/lib/qcd/Grid_qcd_dirac.h
@@ -0,0 +1,393 @@
+#ifndef GRID_QCD_DIRAC_H
+#define GRID_QCD_DIRAC_H
+namespace Grid{
+
+namespace QCD {
+
+  class Gamma {
+
+  public:
+
+    const int Ns=4;
+
+    enum GammaMatrix {
+      Identity,
+      GammaX,
+      GammaY,
+      GammaZ,
+      GammaT,
+      Gamma5,
+      //      GammaXGamma5,
+      //      GammaYGamma5,
+      //      GammaZGamma5,
+      //      GammaTGamma5,
+      //      SigmaXY,
+      //      SigmaXZ,
+      //      SigmaYZ,
+      //      SigmaXT,
+      //      SigmaYT,
+      //      SigmaZT,
+      MinusIdentity,
+      MinusGammaX,
+      MinusGammaY,
+      MinusGammaZ,
+      MinusGammaT,
+      MinusGamma5
+      //      MinusGammaXGamma5, easiest to form by composition
+      //      MinusGammaYGamma5, as performance is not critical for these
+      //      MinusGammaZGamma5,
+      //      MinusGammaTGamma5,
+      //      MinusSigmaXY,
+      //      MinusSigmaXZ,
+      //      MinusSigmaYZ,
+      //      MinusSigmaXT,
+      //      MinusSigmaYT,
+      //      MinusSigmaZT
+    };
+
+    Gamma (GammaMatrix g) { _g=g; }
+
+    GammaMatrix _g;
+
+  };
+
+  /* Gx
+   *  0 0  0  i
+   *  0 0  i  0
+   *  0 -i 0  0
+   * -i 0  0  0
+   */
+  template<class vtype> inline void rmultMinusGammaX(iMatrix<vtype,Ns> &ret,const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(i,0) = timesI(rhs(i,3));
+      ret(i,1) = timesI(rhs(i,2));
+      ret(i,2) = timesMinusI(rhs(i,1));
+      ret(i,3) = timesMinusI(rhs(i,0));
+    }
+  };
+  template<class vtype> inline void rmultGammaX(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(i,0) = timesMinusI(rhs(i,3));
+      ret(i,1) = timesMinusI(rhs(i,2));
+      ret(i,2) = timesI(rhs(i,1));
+      ret(i,3) = timesI(rhs(i,0));
+    }
+  };
+  template<class vtype> inline void multGammaX(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret._internal[0] = timesI(rhs._internal[3]);
+    ret._internal[1] = timesI(rhs._internal[2]);
+    ret._internal[2] = timesMinusI(rhs._internal[1]);
+    ret._internal[3] = timesMinusI(rhs._internal[0]);
+  };
+  template<class vtype> inline void multMinusGammaX(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = timesMinusI(rhs(3));
+    ret(1) = timesMinusI(rhs(2));
+    ret(2) = timesI(rhs(1));
+    ret(3) = timesI(rhs(0));
+  };
+  template<class vtype> inline void multGammaX(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = timesI(rhs(3,i));
+      ret(1,i) = timesI(rhs(2,i));
+      ret(2,i) = timesMinusI(rhs(1,i));
+      ret(3,i) = timesMinusI(rhs(0,i));
+    }
+  };
+  template<class vtype> inline void multMinusGammaX(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = timesMinusI(rhs(3,i));
+      ret(1,i) = timesMinusI(rhs(2,i));
+      ret(2,i) = timesI(rhs(1,i));
+      ret(3,i) = timesI(rhs(0,i));
+    }
+  };
+
+  /*Gy
+   *  0 0  0 -1  [0] -+ [3]
+   *  0 0  1  0  [1] +- [2]
+   *  0 1  0  0
+   * -1 0  0  0
+   */
+  template<class vtype> inline void multGammaY(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = -rhs(3);
+    ret(1) =  rhs(2);
+    ret(2) =  rhs(1);
+    ret(3) = -rhs(0);
+  };
+  template<class vtype> inline void multMinusGammaY(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) =  rhs(3);
+    ret(1) = -rhs(2);
+    ret(2) = -rhs(1);
+    ret(3) =  rhs(0);
+  };
+  template<class vtype> inline void multGammaY(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = -rhs(3,i);
+      ret(1,i) =  rhs(2,i);
+      ret(2,i) =  rhs(1,i);
+      ret(3,i) = -rhs(0,i);
+    }
+  };
+  template<class vtype> inline void multMinusGammaY(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) =  rhs(3,i);
+      ret(1,i) = -rhs(2,i);
+      ret(2,i) = -rhs(1,i);
+      ret(3,i) =  rhs(0,i);
+    }
+  };
+
+  /*Gz
+   *  0 0  i  0  [0]+-i[2]
+   *  0 0  0 -i  [1]-+i[3]
+   * -i 0  0  0
+   *  0 i  0  0
+   */
+  template<class vtype> inline void multGammaZ(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = timesI(rhs(2));
+    ret(1) = timesMinusI(rhs(3));
+    ret(2) = timesMinusI(rhs(0));
+    ret(3) = timesI(rhs(1));
+  };
+  template<class vtype> inline void multMinusGammaZ(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = timesMinusI(rhs(2));
+    ret(1) = timesI(rhs(3));
+    ret(2) = timesI(rhs(0));
+    ret(3) = timesMinusI(rhs(1));
+  };
+  template<class vtype> inline void multGammaZ(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = timesI(rhs(2,i));
+      ret(1,i) = timesMinusI(rhs(3,i));
+      ret(2,i) = timesMinusI(rhs(0,i));
+      ret(3,i) = timesI(rhs(1,i));
+    }
+  };
+  template<class vtype> inline void multMinusGammaZ(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = timesMinusI(rhs(2,i));
+      ret(1,i) = timesI(rhs(3,i));
+      ret(2,i) = timesI(rhs(0,i));
+      ret(3,i) = timesMinusI(rhs(1,i));
+    }
+  };
+
+  /*Gt
+   *  0 0  1  0 [0]+-[2]
+   *  0 0  0  1 [1]+-[3]
+   *  1 0  0  0
+   *  0 1  0  0
+   */
+  template<class vtype> inline void multGammaT(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = rhs(2);
+    ret(1) = rhs(3);
+    ret(2) = rhs(0);
+    ret(3) = rhs(1);
+  };
+  template<class vtype> inline void multMinusGammaT(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) =-rhs(2);
+    ret(1) =-rhs(3);
+    ret(2) =-rhs(0);
+    ret(3) =-rhs(1);
+  };
+  template<class vtype> inline void multGammaT(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = rhs(2,i);
+      ret(1,i) = rhs(3,i);
+      ret(2,i) = rhs(0,i);
+      ret(3,i) = rhs(1,i);
+    }
+  };
+  template<class vtype> inline void multMinusGammaT(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) =-rhs(2,i);
+      ret(1,i) =-rhs(3,i);
+      ret(2,i) =-rhs(0,i);
+      ret(3,i) =-rhs(1,i);
+    }
+  };
+
+  /*G5
+   *  1 0  0  0 [0]+-[2]
+   *  0 1  0  0 [1]+-[3]
+   *  0 0 -1  0
+   *  0 0  0 -1
+   */
+  template<class vtype> inline void multGamma5(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = rhs(0);
+    ret(1) = rhs(1);
+    ret(2) =-rhs(2);
+    ret(3) =-rhs(3);
+  };
+  template<class vtype> inline void multMinusGamma5(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) =-rhs(0);
+    ret(1) =-rhs(1);
+    ret(2) = rhs(2);
+    ret(3) = rhs(3);
+  };
+  template<class vtype> inline void multGamma5(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = rhs(0,i);
+      ret(1,i) = rhs(1,i);
+      ret(2,i) =-rhs(2,i);
+      ret(3,i) =-rhs(3,i);
+    }
+  };
+  template<class vtype> inline void multMinusGamma5(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) =-rhs(0,i);
+      ret(1,i) =-rhs(1,i);
+      ret(2,i) = rhs(2,i);
+      ret(3,i) = rhs(3,i);
+    }
+  };
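+  ////////////////////////////////////////////////////////
+  // Convention sketch: multGamma*(ret,rhs) forms Gamma*rhs and so acts on the
+  // row (left spin) index, while rmultGamma*(ret,rhs) forms rhs*Gamma and acts
+  // on the column index. E.g. from the Gx table above,
+  //   (Gx*rhs)(0,j) = i*rhs(3,j)   whereas   (rhs*Gx)(i,0) = -i*rhs(i,3).
+  ////////////////////////////////////////////////////////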
+
+  template<class vtype> inline auto operator * ( const Gamma &G,const iScalar<vtype> &arg) ->
+    typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinIndex>::notvalue,iScalar<vtype> >::type
+  {
+    iScalar<vtype> ret;
+    ret._internal=G*arg._internal;
+    return ret;
+  }
+  template<class vtype,int N> inline auto operator * ( const Gamma &G,const iVector<vtype,N> &arg) ->
+    typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinIndex>::notvalue,iVector<vtype,N> >::type
+  {
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++){  // element-wise recursion: C arrays cannot be assigned wholesale
+      ret._internal[i]=G*arg._internal[i];
+    }
+    return ret;
+  }
+  template<class vtype,int N> inline auto operator * ( const Gamma &G,const iMatrix<vtype,N> &arg) ->
+    typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinIndex>::notvalue,iMatrix<vtype,N> >::type
+  {
+    iMatrix<vtype,N> ret;
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      ret._internal[i][j]=G*arg._internal[i][j];
+    }}
+    return ret;
+  }
+  ////////////////////////////////////////////////////////
+  // When we hit the spin index this matches and we stop
+  ////////////////////////////////////////////////////////
+  template<class vtype,int N> inline auto operator * ( const Gamma &G,const iMatrix<vtype,N> &arg) ->
+    typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinIndex>::value,iMatrix<vtype,N> >::type
+  {
+    iMatrix<vtype,N> ret;
+    switch (G._g) {
+    case Gamma::Identity:
+      ret = arg;
+      break;
+    case Gamma::MinusIdentity:
+      ret = -arg;
+      break;
+    case Gamma::GammaX:
+      multGammaX(ret,arg);
+      break;
+    case Gamma::MinusGammaX:
+      multMinusGammaX(ret,arg);
+      break;
+    case Gamma::GammaY:
+      multGammaY(ret,arg);
+      break;
+    case Gamma::MinusGammaY:
+      multMinusGammaY(ret,arg);
+      break;
+    case Gamma::GammaZ:
+      multGammaZ(ret,arg);
+      break;
+    case Gamma::MinusGammaZ:
+      multMinusGammaZ(ret,arg);
+      break;
+    case Gamma::GammaT:
+      multGammaT(ret,arg);
+      break;
+    case Gamma::MinusGammaT:
+      multMinusGammaT(ret,arg);
+      break;
+    case Gamma::Gamma5:
+      multGamma5(ret,arg);
+      break;
+    case Gamma::MinusGamma5:
+      multMinusGamma5(ret,arg);
+      break;
+    default:
+      assert(0);
+      break;
+    }
+    return ret;
+  }
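+  // Usage sketch: the commented-out products in the GammaMatrix enum
+  // (GammaXGamma5, SigmaXY, ...) are intended to be formed by composition,
+  // since performance is not critical for them, e.g.
+  //   SpinMatrix m;
+  //   SpinMatrix gx_m   = Gamma(Gamma::GammaX)*m;
+  //   SpinMatrix g5gx_m = Gamma(Gamma::Gamma5)*(Gamma(Gamma::GammaX)*m);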
+
+  /* Output from test
+./Grid_gamma
+Identity((1,0),(0,0),(0,0),(0,0))
+        ((0,0),(1,0),(0,0),(0,0))
+        ((0,0),(0,0),(1,0),(0,0))
+        ((0,0),(0,0),(0,0),(1,0)) OK
+
+GammaX  ((0,0),(0,0),(0,0),(0,1))
+        ((0,0),(0,0),(0,1),(0,0))
+        ((0,0),(0,-1),(0,0),(0,0))
+        ((0,-1),(0,0),(0,0),(0,0)) OK
+   * Gx
+   *  0 0  0  i
+   *  0 0  i  0
+   *  0 -i 0  0
+   * -i 0  0  0
+
+GammaY  ((-0,-0),(-0,-0),(-0,-0),(-1,-0))
+        ((0,0),(0,0),(1,0),(0,0))
+        ((0,0),(1,0),(0,0),(0,0))  OK
+        ((-1,-0),(-0,-0),(-0,-0),(-0,-0))
+   *Gy
+   *  0 0  0 -1  [0] -+ [3]
+   *  0 0  1  0  [1] +- [2]
+   *  0 1  0  0
+   * -1 0  0  0
+
+GammaZ  ((0,0),(0,0),(0,1),(0,0))
+        ((0,0),(0,0),(0,0),(0,-1))
+        ((0,-1),(0,0),(0,0),(0,0))
+        ((0,0),(0,1),(0,0),(0,0)) OK
+   *  0 0  i  0  [0]+-i[2]
+   *  0 0  0 -i  [1]-+i[3]
+   * -i 0  0  0
+   *  0 i  0  0
+
+GammaT  ((0,0),(0,0),(1,0),(0,0))
+        ((0,0),(0,0),(0,0),(1,0)) OK
+        ((1,0),(0,0),(0,0),(0,0))
+        ((0,0),(1,0),(0,0),(0,0))
+   *  0 0  1  0 [0]+-[2]
+   *  0 0  0  1 [1]+-[3]
+   *  1 0  0  0
+   *  0 1  0  0
+
+Gamma5  ((1,0),(0,0),(0,0),(0,0))
+        ((0,0),(1,0),(0,0),(0,0))
+        ((-0,-0),(-0,-0),(-1,-0),(-0,-0))
+        ((-0,-0),(-0,-0),(-0,-0),(-1,-0))
+   *  1 0  0  0 [0]+-[2]
+   *  0 1  0  0 [1]+-[3] OK
+   *  0 0 -1  0
+   *  0 0  0 -1
+   */
+
+} //namespace QCD
+} // Grid
+#endif
diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h
index 313eef4a..402d303c 100644
--- a/lib/simd/Grid_vComplexD.h
+++ b/lib/simd/Grid_vComplexD.h
@@ -101,13 +101,13 @@ namespace Grid {
       IF IMM0[2] = 0
       THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
       IF IMM0[3] = 0
-      THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI;
+      THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // 0x5 r<->i ; 0xC unchanged
     */
 #if defined (AVX1)|| defined (AVX2)
     zvec ymm0,ymm1,ymm2;
     ymm0 = _mm256_shuffle_pd(a.v,a.v,0x0);   // ymm0 <- ar ar, ar,ar  b'00,00
     ymm0 = _mm256_mul_pd(ymm0,b.v);          // ymm0 <- ar bi, ar br
-    ymm1 = _mm256_shuffle_pd(b.v,b.v,0x5);   // ymm1 <- br,bi         b'01,01
+    ymm1 = _mm256_shuffle_pd(b.v,b.v,0x5);   // ymm1 <- br,bi         b'01,01
     ymm2 = _mm256_shuffle_pd(a.v,a.v,0xF);   // ymm2 <- ai,ai         b'11,11
     ymm1 = _mm256_mul_pd(ymm1,ymm2);         // ymm1 <- br ai, ai bi
     ret.v= _mm256_addsub_pd(ymm0,ymm1);
@@ -140,7 +140,7 @@ namespace Grid {
      *   publisher = {ACM},
      *   address = {New York, NY, USA},
      *   keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
-     * }
+     * }
      */
     zvec vzero,ymm0,ymm1,real,imag;
     vzero = _mm512_setzero();
@@ -252,23 +252,71 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
     vComplexD ret ; vzero(ret);
 #if defined (AVX1)|| defined (AVX2)
     // addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
-    __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
-    ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
+    // NB: addsub negates the even (real) lanes, so a bare addsub against zero
+    // gives -conj(z); the swap-addsub-swap sequence is needed to negate only
+    // the imaginary lanes.
+    __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5)); // (re,im) -> (-im,re)
+    ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);                                   // (-im,re) -> (re,-im)
 #endif
 #ifdef SSE4
     ret.v = _mm_addsub_pd(ret.v,in.v);
 #endif
 #ifdef AVX512
-    // Xeon does not have fmaddsub or addsub
-    // with mask 0xa (1010), v[0] -v[1] v[2] -v[3] ....
-    ret.v = _mm512_mask_sub_pd(in.v, 0xaaaa,ret.v, in.v);
-
+    ret.v = _mm512_mask_sub_pd(in.v, 0xaa,ret.v, in.v); // 8 doubles -> __mmask8
 #endif
 #ifdef QPX
     assert(0);
 #endif
     return ret;
 }
+
+  friend inline vComplexD timesI(const vComplexD &in){
+    vComplexD ret; vzero(ret);
+#if defined (AVX1)|| defined (AVX2)
+    __m256d tmp = _mm256_shuffle_pd(in.v,in.v,0x5);        // (re,im) -> (im,re)
+    ret.v       = _mm256_addsub_pd(ret.v,tmp);             // (im,re) -> (-im,re) = i*(re+i*im)
+#endif
+#ifdef SSE4
+    __m128d tmp = _mm_shuffle_pd(in.v,in.v,0x1);           // (re,im) -> (im,re)
+    ret.v       = _mm_addsub_pd(ret.v,tmp);                // (im,re) -> (-im,re)
+#endif
+#ifdef AVX512
+    ret.v = _mm512_mask_sub_pd(in.v,0xaa,ret.v,in.v);      // (re,im) -> (re,-im)
+    ret.v = _mm512_swizzle_pd(ret.v, _MM_SWIZ_REG_CDAB);   // (re,-im) -> (-im,re)
+#endif
+#ifdef QPX
+    assert(0);
+#endif
+    return ret;
+  }
+  friend inline vComplexD timesMinusI(const vComplexD &in){
+    vComplexD ret; vzero(ret);
+#if defined (AVX1)|| defined (AVX2)
+    __m256d tmp = _mm256_addsub_pd(ret.v,in.v);            // (re,im) -> (-re,im)
+    ret.v       = _mm256_shuffle_pd(tmp,tmp,0x5);          // (-re,im) -> (im,-re) = -i*(re+i*im)
+#endif
+#ifdef SSE4
+    __m128d tmp = _mm_addsub_pd(ret.v,in.v);               // (re,im) -> (-re,im)
+    ret.v       = _mm_shuffle_pd(tmp,tmp,0x1);             // (-re,im) -> (im,-re)
+#endif
+#ifdef AVX512
+    __m512d tmp = _mm512_swizzle_pd(in.v, _MM_SWIZ_REG_CDAB); // (re,im) -> (im,re)
+    ret.v = _mm512_mask_sub_pd(tmp,0xaa,ret.v,tmp);           // (im,re) -> (im,-re)
+#endif
+#ifdef QPX
+    assert(0);
+#endif
+    return ret;
+  }
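+  // Worked lane example (sketch): complex numbers are packed real-first, and
+  // addsub subtracts in the even lanes and adds in the odd lanes, so with
+  // acc = 0: addsub(acc,z) = (-re,+im). Hence
+  //   timesI:      swap first (re,im)->(im,re), then addsub -> (-im,re) = i*z
+  //   timesMinusI: addsub first -> (-re,im), then swap      -> (im,-re) = -i*z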
+
   // REDUCE FIXME must be a cleaner implementation
   friend inline ComplexD Reduce(const vComplexD & in)
   {
diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h
index d931964c..4c31ad04 100644
--- a/lib/simd/Grid_vComplexF.h
+++ b/lib/simd/Grid_vComplexF.h
@@ -329,9 +329,10 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
     friend inline vComplexF conj(const vComplexF &in){
       vComplexF ret ; vzero(ret);
 #if defined (AVX1)|| defined (AVX2)
-      cvec tmp;
-      tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
-      ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
+      // NB: addsub negates the even (real) lanes, so a bare addsub against zero
+      // gives -conj(z); the swap-addsub-swap sequence is needed here too.
+      cvec tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // (re,im) -> (-im,re)
+      ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));                                // (-im,re) -> (re,-im)
 #endif
 #ifdef SSE4
       ret.v = _mm_addsub_ps(ret.v,in.v);
 #endif
@@ -344,6 +345,44 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 #endif
       return ret;
     }
+    friend inline vComplexF timesI(const vComplexF &in){
+      vComplexF ret; vzero(ret);
+#if defined (AVX1)|| defined (AVX2)
+      cvec tmp = _mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1)); // (re,im) -> (im,re)
+      ret.v    = _mm256_addsub_ps(ret.v,tmp);                       // (im,re) -> (-im,re) = i*(re+i*im)
+#endif
+#ifdef SSE4
+      cvec tmp = _mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));    // (re,im) -> (im,re)
+      ret.v    = _mm_addsub_ps(ret.v,tmp);                          // (im,re) -> (-im,re)
+#endif
+#ifdef AVX512
+      ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v);           // (re,im) -> (re,-im)
+      ret.v = _mm512_swizzle_ps(ret.v, _MM_SWIZ_REG_CDAB);          // (re,-im) -> (-im,re)
+#endif
+#ifdef QPX
+      assert(0);
+#endif
+      return ret;
+    }
+    friend inline vComplexF timesMinusI(const vComplexF &in){
+      vComplexF ret; vzero(ret);
+#if defined (AVX1)|| defined (AVX2)
+      cvec tmp = _mm256_addsub_ps(ret.v,in.v);                      // (re,im) -> (-re,im)
+      ret.v    = _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));   // (-re,im) -> (im,-re) = -i*(re+i*im)
+#endif
+#ifdef SSE4
+      cvec tmp = _mm_addsub_ps(ret.v,in.v);                         // (re,im) -> (-re,im)
+      ret.v    = _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));      // (-re,im) -> (im,-re)
+#endif
+#ifdef AVX512
+      cvec tmp = _mm512_swizzle_ps(in.v, _MM_SWIZ_REG_CDAB);        // (re,im) -> (im,re)
+      ret.v = _mm512_mask_sub_ps(tmp,0xaaaa,ret.v,tmp);             // (im,re) -> (im,-re)
+#endif
+#ifdef QPX
+      assert(0);
+#endif
+      return ret;
+    }
     // Unary negation
     friend inline vComplexF operator -(const vComplexF &r) {
diff --git a/tests/Grid_gamma.cc b/tests/Grid_gamma.cc
new file mode 100644
index 00000000..9c213352
--- /dev/null
+++ b/tests/Grid_gamma.cc
@@ -0,0 +1,95 @@
+#include <Grid.h>
+#include
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> simd_layout({1,1,2,2});
+  std::vector<int> mpi_layout ({1,1,1,1});
+  std::vector<int> latt_size  ({8,8,8,8});
+
+  GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+  GridRNG  RNG(&Grid);
+
+  SpinMatrix ident=zero;
+  SpinMatrix ll=zero;
+  SpinMatrix rr=zero;
+  SpinMatrix result;
+
+  for(int a=0;a