From b8eef54fa743476693fd1a3056caebe4063f0d4a Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Fri, 24 Apr 2015 18:20:03 +0100
Subject: [PATCH] First implementation of Dirac matrices as a Gamma class.

---
 lib/Grid_simd.h                               |   7 +-
 lib/math/Grid_math_arith_add.h                |   7 -
 lib/math/Grid_math_reality.h                  |  59 ++-
 lib/math/Grid_math_tensors.h                  |  23 +
 lib/qcd/Grid_qcd.h                            |   3 +
 lib/qcd/Grid_qcd_dirac.h                      | 393 ++++++++++++++++++
 lib/simd/Grid_vComplexD.h                     |  66 ++-
 lib/simd/Grid_vComplexF.h                     |  45 +-
 tests/Grid_gamma.cc                           |  95 +++++
 .../{test_Grid_stencil.cc => Grid_stencil.cc} |   0
 tests/Makefile.am                             |   9 +-
 11 files changed, 683 insertions(+), 24 deletions(-)
 create mode 100644 lib/qcd/Grid_qcd_dirac.h
 create mode 100644 tests/Grid_gamma.cc
 rename tests/{test_Grid_stencil.cc => Grid_stencil.cc} (100%)

diff --git a/lib/Grid_simd.h b/lib/Grid_simd.h
index 17755fe0..d8788e33 100644
--- a/lib/Grid_simd.h
+++ b/lib/Grid_simd.h
@@ -61,7 +61,12 @@ namespace Grid {
     inline void add (ComplexF * __restrict__ y,const ComplexF * __restrict__ l,const ComplexF *__restrict__ r){ *y = (*l) + (*r); }
     inline ComplexF adj(const ComplexF& r ){ return(conj(r)); }
     //conj already supported for complex
-
+
+    inline ComplexF timesI(const ComplexF r)     { return(r*ComplexF(0.0,1.0));}
+    inline ComplexF timesMinusI(const ComplexF r){ return(r*ComplexF(0.0,-1.0));}
+    inline ComplexD timesI(const ComplexD r)     { return(r*ComplexD(0.0,1.0));}
+    inline ComplexD timesMinusI(const ComplexD r){ return(r*ComplexD(0.0,-1.0));}
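+    // Worked example (sketch): for z = a+ib these helpers encode
+    //   timesI(z)      = -b+ia,  e.g. timesI(ComplexF(1.0,2.0))      == ComplexF(-2.0,1.0)
+    //   timesMinusI(z) =  b-ia,  e.g. timesMinusI(ComplexF(1.0,2.0)) == ComplexF(2.0,-1.0)
+    // The vComplexF/vComplexD overloads added below implement the same identity lane-wise.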
+
     inline void mac (RealD * __restrict__ y,const RealD * __restrict__ a,const RealD *__restrict__ x){ *y = (*a) * (*x)+(*y);}
     inline void mult(RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) * (*r);}
     inline void sub (RealD * __restrict__ y,const RealD * __restrict__ l,const RealD *__restrict__ r){ *y = (*l) - (*r);}
diff --git a/lib/math/Grid_math_arith_add.h b/lib/math/Grid_math_arith_add.h
index 7e62e07b..93449402 100644
--- a/lib/math/Grid_math_arith_add.h
+++ b/lib/math/Grid_math_arith_add.h
@@ -63,13 +63,6 @@ namespace Grid {
         return;
     }
 
-    // Need to figure multi-precision.
-    template<class Mytype>  Mytype timesI(Mytype &r)
-    {
-      iScalar<Complex> i;
-      i._internal = Complex(0,1);
-      return r*i;
-    }
 
     // + operator for scalar, vector, matrix
     template<class ltype,class rtype>
diff --git a/lib/math/Grid_math_reality.h b/lib/math/Grid_math_reality.h
index 1f676c11..1bf34f2a 100644
--- a/lib/math/Grid_math_reality.h
+++ b/lib/math/Grid_math_reality.h
@@ -5,8 +5,63 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////////////////  CONJ  ///////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////
-
+
+#ifdef GRID_WARN_SUBOPTIMAL
+#warning "Optimisation alert: switch to the two-argument form to avoid the copy back in performance-critical timesI"
+#endif
+///////////////////////////////////////////////
+// multiply by I; make recursive.
+///////////////////////////////////////////////
+template<class vtype> inline iScalar<vtype> timesI(const iScalar<vtype>&r)
+{
+    iScalar<vtype> ret;
+    ret._internal = timesI(r._internal);
+    return ret;
+}
+template<class vtype,int N> inline iVector<vtype,N> timesI(const iVector<vtype,N>&r)
+{
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++){
+        ret._internal[i] = timesI(r._internal[i]);
+    }
+    return ret;
+}
+template<class vtype,int N> inline iMatrix<vtype,N> timesI(const iMatrix<vtype,N>&r)
+{
+    iMatrix<vtype,N> ret;
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+        ret._internal[i][j] = timesI(r._internal[i][j]);
+    }}
+    return ret;
+}
+
+template<class vtype> inline iScalar<vtype> timesMinusI(const iScalar<vtype>&r)
+{
+    iScalar<vtype> ret;
+    ret._internal = timesMinusI(r._internal);
+    return ret;
+}
+template<class vtype,int N> inline iVector<vtype,N> timesMinusI(const iVector<vtype,N>&r)
+{
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++){
+        ret._internal[i] = timesMinusI(r._internal[i]);
+    }
+    return ret;
+}
+template<class vtype,int N> inline iMatrix<vtype,N> timesMinusI(const iMatrix<vtype,N>&r)
+{
+    iMatrix<vtype,N> ret;
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+        ret._internal[i][j] = timesMinusI(r._internal[i][j]);
+    }}
+    return ret;
+}
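+// Example (sketch): the recursion above bottoms out at the scalar overloads
+// added to Grid_simd.h in this patch, so for a nested type such as
+//   iVector<iScalar<ComplexF>,4> v;
+// timesI(v) peels off the iVector and iScalar layers and finally applies
+// timesI(const ComplexF) to each element.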
+
 template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
 {
     iScalar<vtype> ret;
@@ -31,7 +86,9 @@ template<class vtype,int N> inline iMatrix<vtype,N> conj(const iMatrix<vtype,N>&
     return ret;
 }
 
+///////////////////////////////////////////////
 // Adj function for scalar, vector, matrix
+///////////////////////////////////////////////
 template<class vtype> inline iScalar<vtype> adj(const iScalar<vtype>&r)
 {
     iScalar<vtype> ret;
diff --git a/lib/math/Grid_math_tensors.h b/lib/math/Grid_math_tensors.h
index 379e61d2..8bca6805 100644
--- a/lib/math/Grid_math_tensors.h
+++ b/lib/math/Grid_math_tensors.h
@@ -72,6 +72,13 @@ public:
     return _internal;
   }
 
+  inline const vtype & operator ()(void) const {
+    return _internal;
+  }
+  //  inline vtype && operator ()(void) {
+  //    return _internal;
+  //  }
+
   operator ComplexD () const { return(TensorRemove(_internal)); };
   operator RealD () const { return(real(TensorRemove(_internal))); }
@@ -156,6 +163,12 @@ public:
   inline vtype & operator ()(int i) {
     return _internal[i];
   }
+  inline const vtype & operator ()(int i) const {
+    return _internal[i];
+  }
+  //  inline vtype && operator ()(int i) {
+  //    return _internal[i];
+  //  }
 };
 
 template<class vtype,int N> class iMatrix
@@ -235,12 +248,22 @@ public:
     *this = (*this)+r;
     return *this;
   }
+
+  // returns an lvalue reference
   inline vtype & operator ()(int i,int j) {
     return _internal[i][j];
   }
+  inline const vtype & operator ()(int i,int j) const {
+    return _internal[i][j];
+  }
+  //  inline vtype && operator ()(int i,int j) {
+  //    return _internal[i][j];
+  //  }
 };
+
 template<class vobj> inline void extract(const vobj &vec,std::vector<typename vobj::scalar_type *> &extracted)
 {
diff --git a/lib/qcd/Grid_qcd.h b/lib/qcd/Grid_qcd.h
index b30b7a67..a5c4320e 100644
--- a/lib/qcd/Grid_qcd.h
+++ b/lib/qcd/Grid_qcd.h
@@ -143,4 +143,7 @@ namespace QCD {
 }   //namespace QCD
 } // Grid
+
+#include <qcd/Grid_qcd_dirac.h>
+
 #endif
diff --git a/lib/qcd/Grid_qcd_dirac.h b/lib/qcd/Grid_qcd_dirac.h
new file mode 100644
index 00000000..86e3870d
--- /dev/null
+++ b/lib/qcd/Grid_qcd_dirac.h
@@ -0,0 +1,393 @@
+#ifndef GRID_QCD_DIRAC_H
+#define GRID_QCD_DIRAC_H
+namespace Grid{
+
+namespace QCD {
+
+  class Gamma {
+
+  public:
+
+    const int Ns=4;
+
+    enum GammaMatrix {
+      Identity,
+      GammaX,
+      GammaY,
+      GammaZ,
+      GammaT,
+      Gamma5,
+      //      GammaXGamma5,
+      //      GammaYGamma5,
+      //      GammaZGamma5,
+      //      GammaTGamma5,
+      //      SigmaXY,
+      //      SigmaXZ,
+      //      SigmaYZ,
+      //      SigmaXT,
+      //      SigmaYT,
+      //      SigmaZT,
+      MinusIdentity,
+      MinusGammaX,
+      MinusGammaY,
+      MinusGammaZ,
+      MinusGammaT,
+      MinusGamma5
+      //      MinusGammaXGamma5, easiest to form by composition
+      //      MinusGammaYGamma5, as performance is not critical for these
+      //      MinusGammaZGamma5,
+      //      MinusGammaTGamma5,
+      //      MinusSigmaXY,
+      //      MinusSigmaXZ,
+      //      MinusSigmaYZ,
+      //      MinusSigmaXT,
+      //      MinusSigmaYT,
+      //      MinusSigmaZT
+    };
+
+    Gamma (GammaMatrix g) { _g=g; }
+
+    GammaMatrix _g;
+
+  };
+
+  /* Gx
+   *  0 0  0  i
+   *  0 0  i  0
+   *  0 -i 0  0
+   * -i 0  0  0
+   */
+  template<class vtype> inline void rmultMinusGammaX(iMatrix<vtype,Ns> &ret,const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(i,0) = timesI(rhs(i,3));
+      ret(i,1) = timesI(rhs(i,2));
+      ret(i,2) = timesMinusI(rhs(i,1));
+      ret(i,3) = timesMinusI(rhs(i,0));
+    }
+  };
+  template<class vtype> inline void rmultGammaX(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(i,0) = timesMinusI(rhs(i,3));
+      ret(i,1) = timesMinusI(rhs(i,2));
+      ret(i,2) = timesI(rhs(i,1));
+      ret(i,3) = timesI(rhs(i,0));
+    }
+  };
+  template<class vtype> inline void multGammaX(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret._internal[0] = timesI(rhs._internal[3]);
+    ret._internal[1] = timesI(rhs._internal[2]);
+    ret._internal[2] = timesMinusI(rhs._internal[1]);
+    ret._internal[3] = timesMinusI(rhs._internal[0]);
+  };
+  template<class vtype> inline void multMinusGammaX(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = timesMinusI(rhs(3));
+    ret(1) = timesMinusI(rhs(2));
+    ret(2) = timesI(rhs(1));
+    ret(3) = timesI(rhs(0));
+  };
+  template<class vtype> inline void multGammaX(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = timesI(rhs(3,i));
+      ret(1,i) = timesI(rhs(2,i));
+      ret(2,i) = timesMinusI(rhs(1,i));
+      ret(3,i) = timesMinusI(rhs(0,i));
+    }
+  };
+  template<class vtype> inline void multMinusGammaX(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = timesMinusI(rhs(3,i));
+      ret(1,i) = timesMinusI(rhs(2,i));
+      ret(2,i) = timesI(rhs(1,i));
+      ret(3,i) = timesI(rhs(0,i));
+    }
+  };
+
+  /*Gy
+   *  0 0  0 -1  [0] -+ [3]
+   *  0 0  1  0  [1] +- [2]
+   *  0 1  0  0
+   * -1 0  0  0
+   */
+  template<class vtype> inline void multGammaY(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = -rhs(3);
+    ret(1) =  rhs(2);
+    ret(2) =  rhs(1);
+    ret(3) = -rhs(0);
+  };
+  template<class vtype> inline void multMinusGammaY(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) =  rhs(3);
+    ret(1) = -rhs(2);
+    ret(2) = -rhs(1);
+    ret(3) =  rhs(0);
+  };
+  template<class vtype> inline void multGammaY(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = -rhs(3,i);
+      ret(1,i) =  rhs(2,i);
+      ret(2,i) =  rhs(1,i);
+      ret(3,i) = -rhs(0,i);
+    }
+  };
+  template<class vtype> inline void multMinusGammaY(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) =  rhs(3,i);
+      ret(1,i) = -rhs(2,i);
+      ret(2,i) = -rhs(1,i);
+      ret(3,i) =  rhs(0,i);
+    }
+  };
+
+  /*Gz
+   *  0 0  i  0  [0]+-i[2]
+   *  0 0  0 -i  [1]-+i[3]
+   * -i 0  0  0
+   *  0 i  0  0
+   */
+  template<class vtype> inline void multGammaZ(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = timesI(rhs(2));
+    ret(1) = timesMinusI(rhs(3));
+    ret(2) = timesMinusI(rhs(0));
+    ret(3) = timesI(rhs(1));
+  };
+  template<class vtype> inline void multMinusGammaZ(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = timesMinusI(rhs(2));
+    ret(1) = timesI(rhs(3));
+    ret(2) = timesI(rhs(0));
+    ret(3) = timesMinusI(rhs(1));
+  };
+  template<class vtype> inline void multGammaZ(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = timesI(rhs(2,i));
+      ret(1,i) = timesMinusI(rhs(3,i));
+      ret(2,i) = timesMinusI(rhs(0,i));
+      ret(3,i) = timesI(rhs(1,i));
+    }
+  };
+  template<class vtype> inline void multMinusGammaZ(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = timesMinusI(rhs(2,i));
+      ret(1,i) = timesI(rhs(3,i));
+      ret(2,i) = timesI(rhs(0,i));
+      ret(3,i) = timesMinusI(rhs(1,i));
+    }
+  };
+
+  /*Gt
+   *  0 0  1  0 [0]+-[2]
+   *  0 0  0  1 [1]+-[3]
+   *  1 0  0  0
+   *  0 1  0  0
+   */
+  template<class vtype> inline void multGammaT(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = rhs(2);
+    ret(1) = rhs(3);
+    ret(2) = rhs(0);
+    ret(3) = rhs(1);
+  };
+  template<class vtype> inline void multMinusGammaT(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) =-rhs(2);
+    ret(1) =-rhs(3);
+    ret(2) =-rhs(0);
+    ret(3) =-rhs(1);
+  };
+  template<class vtype> inline void multGammaT(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = rhs(2,i);
+      ret(1,i) = rhs(3,i);
+      ret(2,i) = rhs(0,i);
+      ret(3,i) = rhs(1,i);
+    }
+  };
+  template<class vtype> inline void multMinusGammaT(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) =-rhs(2,i);
+      ret(1,i) =-rhs(3,i);
+      ret(2,i) =-rhs(0,i);
+      ret(3,i) =-rhs(1,i);
+    }
+  };
+
+  /*G5
+   *  1 0  0  0 [0]+-[2]
+   *  0 1  0  0 [1]+-[3]
+   *  0 0 -1  0
+   *  0 0  0 -1
+   */
+  template<class vtype> inline void multGamma5(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) = rhs(0);
+    ret(1) = rhs(1);
+    ret(2) =-rhs(2);
+    ret(3) =-rhs(3);
+  };
+  template<class vtype> inline void multMinusGamma5(iVector<vtype,Ns> &ret, iVector<vtype,Ns> &rhs){
+    ret(0) =-rhs(0);
+    ret(1) =-rhs(1);
+    ret(2) = rhs(2);
+    ret(3) = rhs(3);
+  };
+  template<class vtype> inline void multGamma5(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) = rhs(0,i);
+      ret(1,i) = rhs(1,i);
+      ret(2,i) =-rhs(2,i);
+      ret(3,i) =-rhs(3,i);
+    }
+  };
+  template<class vtype> inline void multMinusGamma5(iMatrix<vtype,Ns> &ret, const iMatrix<vtype,Ns> &rhs){
+    for(int i=0;i<Ns;i++){
+      ret(0,i) =-rhs(0,i);
+      ret(1,i) =-rhs(1,i);
+      ret(2,i) = rhs(2,i);
+      ret(3,i) = rhs(3,i);
+    }
+  };
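+  ////////////////////////////////////////////////////////
+  // Convention sketch: multGamma*(ret,rhs) forms Gamma*rhs and so acts on the
+  // row (left spin) index, while rmultGamma*(ret,rhs) forms rhs*Gamma and acts
+  // on the column index. E.g. from the Gx table above,
+  //   (Gx*rhs)(0,j) = i*rhs(3,j)   whereas   (rhs*Gx)(i,0) = -i*rhs(i,3).
+  ////////////////////////////////////////////////////////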
+
+  template<class vtype> inline auto operator * ( const Gamma &G,const iScalar<vtype> &arg) ->
+    typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinIndex>::notvalue,iScalar<vtype> >::type
+  {
+    iScalar<vtype> ret;
+    ret._internal=G*arg._internal;
+    return ret;
+  }
+  template<class vtype,int N> inline auto operator * ( const Gamma &G,const iVector<vtype,N> &arg) ->
+    typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinIndex>::notvalue,iVector<vtype,N> >::type
+  {
+    iVector<vtype,N> ret;
+    for(int i=0;i<N;i++){  // element-wise recursion: C arrays cannot be assigned wholesale
+      ret._internal[i]=G*arg._internal[i];
+    }
+    return ret;
+  }
+  template<class vtype,int N> inline auto operator * ( const Gamma &G,const iMatrix<vtype,N> &arg) ->
+    typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinIndex>::notvalue,iMatrix<vtype,N> >::type
+  {
+    iMatrix<vtype,N> ret;
+    for(int i=0;i<N;i++){
+    for(int j=0;j<N;j++){
+      ret._internal[i][j]=G*arg._internal[i][j];
+    }}
+    return ret;
+  }
+  ////////////////////////////////////////////////////////
+  // When we hit the spin index this matches and we stop
+  ////////////////////////////////////////////////////////
+  template<class vtype,int N> inline auto operator * ( const Gamma &G,const iMatrix<vtype,N> &arg) ->
+    typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinIndex>::value,iMatrix<vtype,N> >::type
+  {
+    iMatrix<vtype,N> ret;
+    switch (G._g) {
+    case Gamma::Identity:
+      ret = arg;
+      break;
+    case Gamma::MinusIdentity:
+      ret = -arg;
+      break;
+    case Gamma::GammaX:
+      multGammaX(ret,arg);
+      break;
+    case Gamma::MinusGammaX:
+      multMinusGammaX(ret,arg);
+      break;
+    case Gamma::GammaY:
+      multGammaY(ret,arg);
+      break;
+    case Gamma::MinusGammaY:
+      multMinusGammaY(ret,arg);
+      break;
+    case Gamma::GammaZ:
+      multGammaZ(ret,arg);
+      break;
+    case Gamma::MinusGammaZ:
+      multMinusGammaZ(ret,arg);
+      break;
+    case Gamma::GammaT:
+      multGammaT(ret,arg);
+      break;
+    case Gamma::MinusGammaT:
+      multMinusGammaT(ret,arg);
+      break;
+    case Gamma::Gamma5:
+      multGamma5(ret,arg);
+      break;
+    case Gamma::MinusGamma5:
+      multMinusGamma5(ret,arg);
+      break;
+    default:
+      assert(0);
+      break;
+    }
+    return ret;
+  }
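+  // Usage sketch: the commented-out products in the GammaMatrix enum
+  // (GammaXGamma5, SigmaXY, ...) are intended to be formed by composition,
+  // since performance is not critical for them, e.g.
+  //   SpinMatrix m;
+  //   SpinMatrix gx_m   = Gamma(Gamma::GammaX)*m;
+  //   SpinMatrix g5gx_m = Gamma(Gamma::Gamma5)*(Gamma(Gamma::GammaX)*m);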
+
+  /* Output from test
+./Grid_gamma
+Identity((1,0),(0,0),(0,0),(0,0))
+        ((0,0),(1,0),(0,0),(0,0))
+        ((0,0),(0,0),(1,0),(0,0))
+        ((0,0),(0,0),(0,0),(1,0)) OK
+
+GammaX  ((0,0),(0,0),(0,0),(0,1))
+        ((0,0),(0,0),(0,1),(0,0))
+        ((0,0),(0,-1),(0,0),(0,0))
+        ((0,-1),(0,0),(0,0),(0,0)) OK
+   * Gx
+   *  0 0  0  i
+   *  0 0  i  0
+   *  0 -i 0  0
+   * -i 0  0  0
+
+GammaY  ((-0,-0),(-0,-0),(-0,-0),(-1,-0))
+        ((0,0),(0,0),(1,0),(0,0))
+        ((0,0),(1,0),(0,0),(0,0))  OK
+        ((-1,-0),(-0,-0),(-0,-0),(-0,-0))
+   *Gy
+   *  0 0  0 -1  [0] -+ [3]
+   *  0 0  1  0  [1] +- [2]
+   *  0 1  0  0
+   * -1 0  0  0
+
+GammaZ  ((0,0),(0,0),(0,1),(0,0))
+        ((0,0),(0,0),(0,0),(0,-1))
+        ((0,-1),(0,0),(0,0),(0,0))
+        ((0,0),(0,1),(0,0),(0,0)) OK
+   *  0 0  i  0  [0]+-i[2]
+   *  0 0  0 -i  [1]-+i[3]
+   * -i 0  0  0
+   *  0 i  0  0
+
+GammaT  ((0,0),(0,0),(1,0),(0,0))
+        ((0,0),(0,0),(0,0),(1,0)) OK
+        ((1,0),(0,0),(0,0),(0,0))
+        ((0,0),(1,0),(0,0),(0,0))
+   *  0 0  1  0 [0]+-[2]
+   *  0 0  0  1 [1]+-[3]
+   *  1 0  0  0
+   *  0 1  0  0
+
+Gamma5  ((1,0),(0,0),(0,0),(0,0))
+        ((0,0),(1,0),(0,0),(0,0))
+        ((-0,-0),(-0,-0),(-1,-0),(-0,-0))
+        ((-0,-0),(-0,-0),(-0,-0),(-1,-0))
+   *  1 0  0  0 [0]+-[2]
+   *  0 1  0  0 [1]+-[3] OK
+   *  0 0 -1  0
+   *  0 0  0 -1
+   */
+
+} //namespace QCD
+} // Grid
+#endif
diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h
index 313eef4a..402d303c 100644
--- a/lib/simd/Grid_vComplexD.h
+++ b/lib/simd/Grid_vComplexD.h
@@ -101,13 +101,13 @@ namespace Grid {
       IF IMM0[2] = 0
       THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
       IF IMM0[3] = 0
-      THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI;
+      THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // 0x5 r<->i ; 0xC unchanged
     */
 #if defined (AVX1)|| defined (AVX2)
     zvec ymm0,ymm1,ymm2;
     ymm0 = _mm256_shuffle_pd(a.v,a.v,0x0);   // ymm0 <- ar ar, ar,ar  b'00,00
     ymm0 = _mm256_mul_pd(ymm0,b.v);          // ymm0 <- ar bi, ar br
-    ymm1 = _mm256_shuffle_pd(b.v,b.v,0x5);   // ymm1 <- br,bi         b'01,01
+    ymm1 = _mm256_shuffle_pd(b.v,b.v,0x5);   // ymm1 <- br,bi         b'01,01
     ymm2 = _mm256_shuffle_pd(a.v,a.v,0xF);   // ymm2 <- ai,ai         b'11,11
     ymm1 = _mm256_mul_pd(ymm1,ymm2);         // ymm1 <- br ai, ai bi
     ret.v= _mm256_addsub_pd(ymm0,ymm1);
@@ -140,7 +140,7 @@ namespace Grid {
      *   publisher = {ACM},
      *   address = {New York, NY, USA},
      *   keywords = {autovectorization, fourier transform, program generation, simd, super-optimization},
-     * }
+     * }
      */
     zvec vzero,ymm0,ymm1,real,imag;
     vzero = _mm512_setzero();
@@ -252,23 +252,71 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
     vComplexD ret ; vzero(ret);
 #if defined (AVX1)|| defined (AVX2)
     // addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
-    __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
-    ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
+    // NB: addsub negates the even (real) lanes, so a bare addsub against zero
+    // gives -conj(z); the swap-addsub-swap sequence is needed to negate only
+    // the imaginary lanes.
+    __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5)); // (re,im) -> (-im,re)
+    ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);                                   // (-im,re) -> (re,-im)
 #endif
 #ifdef SSE4
     ret.v = _mm_addsub_pd(ret.v,in.v);
 #endif
 #ifdef AVX512
-    // Xeon does not have fmaddsub or addsub
-    // with mask 0xa (1010), v[0] -v[1] v[2] -v[3] ....
-    ret.v = _mm512_mask_sub_pd(in.v, 0xaaaa,ret.v, in.v);
-
+    ret.v = _mm512_mask_sub_pd(in.v, 0xaa,ret.v, in.v); // 8 doubles -> __mmask8
 #endif
 #ifdef QPX
     assert(0);
 #endif
     return ret;
 }
+
+  friend inline vComplexD timesI(const vComplexD &in){
+    vComplexD ret; vzero(ret);
+#if defined (AVX1)|| defined (AVX2)
+    __m256d tmp = _mm256_shuffle_pd(in.v,in.v,0x5);        // (re,im) -> (im,re)
+    ret.v       = _mm256_addsub_pd(ret.v,tmp);             // (im,re) -> (-im,re) = i*(re+i*im)
+#endif
+#ifdef SSE4
+    __m128d tmp = _mm_shuffle_pd(in.v,in.v,0x1);           // (re,im) -> (im,re)
+    ret.v       = _mm_addsub_pd(ret.v,tmp);                // (im,re) -> (-im,re)
+#endif
+#ifdef AVX512
+    ret.v = _mm512_mask_sub_pd(in.v,0xaa,ret.v,in.v);      // (re,im) -> (re,-im)
+    ret.v = _mm512_swizzle_pd(ret.v, _MM_SWIZ_REG_CDAB);   // (re,-im) -> (-im,re)
+#endif
+#ifdef QPX
+    assert(0);
+#endif
+    return ret;
+  }
+  friend inline vComplexD timesMinusI(const vComplexD &in){
+    vComplexD ret; vzero(ret);
+#if defined (AVX1)|| defined (AVX2)
+    __m256d tmp = _mm256_addsub_pd(ret.v,in.v);            // (re,im) -> (-re,im)
+    ret.v       = _mm256_shuffle_pd(tmp,tmp,0x5);          // (-re,im) -> (im,-re) = -i*(re+i*im)
+#endif
+#ifdef SSE4
+    __m128d tmp = _mm_addsub_pd(ret.v,in.v);               // (re,im) -> (-re,im)
+    ret.v       = _mm_shuffle_pd(tmp,tmp,0x1);             // (-re,im) -> (im,-re)
+#endif
+#ifdef AVX512
+    __m512d tmp = _mm512_swizzle_pd(in.v, _MM_SWIZ_REG_CDAB); // (re,im) -> (im,re)
+    ret.v = _mm512_mask_sub_pd(tmp,0xaa,ret.v,tmp);           // (im,re) -> (im,-re)
+#endif
+#ifdef QPX
+    assert(0);
+#endif
+    return ret;
+  }
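+  // Worked lane example (sketch): complex numbers are packed real-first, and
+  // addsub subtracts in the even lanes and adds in the odd lanes, so with
+  // acc = 0: addsub(acc,z) = (-re,+im). Hence
+  //   timesI:      swap first (re,im)->(im,re), then addsub -> (-im,re) = i*z
+  //   timesMinusI: addsub first -> (-re,im), then swap      -> (im,-re) = -i*z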
+
   // REDUCE FIXME must be a cleaner implementation
   friend inline ComplexD Reduce(const vComplexD & in)
   {
diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h
index d931964c..4c31ad04 100644
--- a/lib/simd/Grid_vComplexF.h
+++ b/lib/simd/Grid_vComplexF.h
@@ -329,9 +329,10 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
     friend inline vComplexF conj(const vComplexF &in){
       vComplexF ret ; vzero(ret);
 #if defined (AVX1)|| defined (AVX2)
-      cvec tmp;
-      tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
-      ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
+      // NB: addsub negates the even (real) lanes, so a bare addsub against zero
+      // gives -conj(z); the swap-addsub-swap sequence is needed here too.
+      cvec tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // (re,im) -> (-im,re)
+      ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));                                // (-im,re) -> (re,-im)
 #endif
 #ifdef SSE4
       ret.v = _mm_addsub_ps(ret.v,in.v);
 #endif
@@ -344,6 +345,44 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 #endif
       return ret;
     }
+    friend inline vComplexF timesI(const vComplexF &in){
+      vComplexF ret; vzero(ret);
+#if defined (AVX1)|| defined (AVX2)
+      cvec tmp = _mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1)); // (re,im) -> (im,re)
+      ret.v    = _mm256_addsub_ps(ret.v,tmp);                       // (im,re) -> (-im,re) = i*(re+i*im)
+#endif
+#ifdef SSE4
+      cvec tmp = _mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));    // (re,im) -> (im,re)
+      ret.v    = _mm_addsub_ps(ret.v,tmp);                          // (im,re) -> (-im,re)
+#endif
+#ifdef AVX512
+      ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v);           // (re,im) -> (re,-im)
+      ret.v = _mm512_swizzle_ps(ret.v, _MM_SWIZ_REG_CDAB);          // (re,-im) -> (-im,re)
+#endif
+#ifdef QPX
+      assert(0);
+#endif
+      return ret;
+    }
+    friend inline vComplexF timesMinusI(const vComplexF &in){
+      vComplexF ret; vzero(ret);
+#if defined (AVX1)|| defined (AVX2)
+      cvec tmp = _mm256_addsub_ps(ret.v,in.v);                      // (re,im) -> (-re,im)
+      ret.v    = _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));   // (-re,im) -> (im,-re) = -i*(re+i*im)
+#endif
+#ifdef SSE4
+      cvec tmp = _mm_addsub_ps(ret.v,in.v);                         // (re,im) -> (-re,im)
+      ret.v    = _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));      // (-re,im) -> (im,-re)
+#endif
+#ifdef AVX512
+      cvec tmp = _mm512_swizzle_ps(in.v, _MM_SWIZ_REG_CDAB);        // (re,im) -> (im,re)
+      ret.v = _mm512_mask_sub_ps(tmp,0xaaaa,ret.v,tmp);             // (im,re) -> (im,-re)
+#endif
+#ifdef QPX
+      assert(0);
+#endif
+      return ret;
+    }
     // Unary negation
     friend inline vComplexF operator -(const vComplexF &r) {
diff --git a/tests/Grid_gamma.cc b/tests/Grid_gamma.cc
new file mode 100644
index 00000000..9c213352
--- /dev/null
+++ b/tests/Grid_gamma.cc
@@ -0,0 +1,95 @@
+#include <Grid.h>
+#include
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  std::vector<int> simd_layout({1,1,2,2});
+  std::vector<int> mpi_layout ({1,1,1,1});
+  std::vector<int> latt_size  ({8,8,8,8});
+
+  GridCartesian Grid(latt_size,simd_layout,mpi_layout);
+
+  GridRNG  RNG(&Grid);
+
+  SpinMatrix ident=zero;
+  SpinMatrix ll=zero;
+  SpinMatrix rr=zero;
+  SpinMatrix result;
+
+  for(int a=0;a