Merging in

Merge branch 'master' of https://github.com/paboyle/Grid
2026-07-18 16:13:28 +01:00 · 2015-05-19 21:30:13 +01:00
parent 3f57662cd0 46ab8edf30
commit dc4014668d
20 changed files with 749 additions and 188 deletions
@@ -33,6 +33,8 @@ MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is
 by setting variables in the command line or in the environment.  Here
 is are examples:

+     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
+
     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1

     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
@@ -37,6 +37,8 @@ MPI, OpenMP, and SIMD parallelism are present in the library.
 by setting variables in the command line or in the environment.  Here
 are examples:

+     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
+
     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1

     ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2
@@ -4503,6 +4503,11 @@ _ACEOF


 # Checks for library functions.
+echo
+echo Checking libraries
+echo :::::::::::::::::::::::::::::::::::::::::::
+
+
 for ac_func in gettimeofday
 do :
  ac_fn_cxx_check_func "$LINENO" "gettimeofday" "ac_cv_func_gettimeofday"
@@ -4515,6 +4520,105 @@ fi
 done


+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __gmpf_init in -lgmp" >&5
+$as_echo_n "checking for __gmpf_init in -lgmp... " >&6; }
+if ${ac_cv_lib_gmp___gmpf_init+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lgmp  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char __gmpf_init ();
+int
+main ()
+{
+return __gmpf_init ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  ac_cv_lib_gmp___gmpf_init=yes
+else
+  ac_cv_lib_gmp___gmpf_init=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_gmp___gmpf_init" >&5
+$as_echo "$ac_cv_lib_gmp___gmpf_init" >&6; }
+if test "x$ac_cv_lib_gmp___gmpf_init" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBGMP 1
+_ACEOF
+
+  LIBS="-lgmp $LIBS"
+
+else
+  as_fn_error $? "GNU Multiple Precision GMP library was not found in your system.
+Please install or provide the correct path to your installation
+Info at: http://www.gmplib.org" "$LINENO" 5
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for mpfr_init in -lmpfr" >&5
+$as_echo_n "checking for mpfr_init in -lmpfr... " >&6; }
+if ${ac_cv_lib_mpfr_mpfr_init+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lmpfr  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char mpfr_init ();
+int
+main ()
+{
+return mpfr_init ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  ac_cv_lib_mpfr_mpfr_init=yes
+else
+  ac_cv_lib_mpfr_mpfr_init=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_mpfr_mpfr_init" >&5
+$as_echo "$ac_cv_lib_mpfr_mpfr_init" >&6; }
+if test "x$ac_cv_lib_mpfr_mpfr_init" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBMPFR 1
+_ACEOF
+
+  LIBS="-lmpfr $LIBS"
+
+else
+  as_fn_error $? "GNU Multiple Precision MPFR library was not found in your system.
+Please install or provide the correct path to your installation
+Info at: http://www.mpfr.org/" "$LINENO" 5
+fi
+



@@ -3,7 +3,7 @@
 #
 # Project Grid package  
 # 
-# Time-stamp: <2015-05-18 17:14:20 neo>
+# Time-stamp: <2015-05-19 13:51:08 neo>

 AC_PREREQ([2.69])
 AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
@@ -46,8 +46,22 @@ AC_TYPE_UINT32_T
 AC_TYPE_UINT64_T

 # Checks for library functions.
+echo
+echo Checking libraries 
+echo :::::::::::::::::::::::::::::::::::::::::::
+
+
 AC_CHECK_FUNCS([gettimeofday])

+AC_CHECK_LIB([gmp],[__gmpf_init],,
+        [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
+Please install or provide the correct path to your installation
+Info at: http://www.gmplib.org)])
+
+AC_CHECK_LIB([mpfr],[mpfr_init],,
+        [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
+Please install or provide the correct path to your installation
+Info at: http://www.mpfr.org/)])



@@ -36,6 +36,12 @@
 /* Define to 1 if you have the <inttypes.h> header file. */
 #define HAVE_INTTYPES_H 1

+/* Define to 1 if you have the `gmp' library (-lgmp). */
+#define HAVE_LIBGMP 1
+
+/* Define to 1 if you have the `mpfr' library (-lmpfr). */
+#define HAVE_LIBMPFR 1
+
 /* Define to 1 if you have the <malloc.h> header file. */
 #define HAVE_MALLOC_H 1

@@ -35,6 +35,12 @@
 /* Define to 1 if you have the <inttypes.h> header file. */
 #undef HAVE_INTTYPES_H

+/* Define to 1 if you have the `gmp' library (-lgmp). */
+#undef HAVE_LIBGMP
+
+/* Define to 1 if you have the `mpfr' library (-lmpfr). */
+#undef HAVE_LIBMPFR
+
 /* Define to 1 if you have the <malloc.h> header file. */
 #undef HAVE_MALLOC_H

@@ -14,86 +14,89 @@ endif
 # Libraries
 #
 lib_LIBRARIES = libGrid.a
-libGrid_a_SOURCES =\
-	 Grid_init.cc\
-	stencil/Grid_stencil_common.cc\
-	qcd/Grid_qcd_dirac.cc\
-	qcd/Grid_qcd_wilson_dop.cc\
-	algorithms/approx/Zolotarev.cc\
-	algorithms/approx/Remez.cc\
+libGrid_a_SOURCES =				\
+	Grid_init.cc				\
+	stencil/Grid_stencil_common.cc		\
+	qcd/Grid_qcd_dirac.cc			\
+	qcd/Grid_qcd_wilson_dop.cc		\
+	algorithms/approx/Zolotarev.cc		\
+	algorithms/approx/Remez.cc		\
 	$(extra_sources)

 #
 # Include files
 #
-nobase_include_HEADERS = algorithms/approx/bigfloat.h\
-	algorithms/approx/Chebyshev.h\
-	algorithms/approx/Remez.h\
-	algorithms/approx/Zolotarev.h\
-	algorithms/iterative/ConjugateGradient.h\
-	algorithms/iterative/NormalEquations.h\
-	algorithms/iterative/SchurRedBlack.h\
-	algorithms/LinearOperator.h\
-	algorithms/SparseMatrix.h\
-	cartesian/Grid_cartesian_base.h\
-	cartesian/Grid_cartesian_full.h\
-	cartesian/Grid_cartesian_red_black.h\
-	communicator/Grid_communicator_base.h\
-	cshift/Grid_cshift_common.h\
-	cshift/Grid_cshift_mpi.h\
-	cshift/Grid_cshift_none.h\
-	Grid.h\
-	Grid_algorithms.h\
-	Grid_aligned_allocator.h\
-	Grid_cartesian.h\
-	Grid_communicator.h\
-	Grid_comparison.h\
-	Grid_cshift.h\
-	Grid_extract.h\
-	Grid_lattice.h\
-	Grid_math.h\
-	Grid_simd.h\
-	Grid_stencil.h\
-	Grid_threads.h\
-	lattice/Grid_lattice_arith.h\
-	lattice/Grid_lattice_base.h\
-	lattice/Grid_lattice_comparison.h\
-	lattice/Grid_lattice_conformable.h\
-	lattice/Grid_lattice_coordinate.h\
-	lattice/Grid_lattice_ET.h\
-	lattice/Grid_lattice_local.h\
-	lattice/Grid_lattice_overload.h\
-	lattice/Grid_lattice_peekpoke.h\
-	lattice/Grid_lattice_reality.h\
-	lattice/Grid_lattice_reduction.h\
-	lattice/Grid_lattice_rng.h\
-	lattice/Grid_lattice_trace.h\
-	lattice/Grid_lattice_transfer.h\
-	lattice/Grid_lattice_transpose.h\
-	lattice/Grid_lattice_where.h\
-	math/Grid_math_arith.h\
-	math/Grid_math_arith_add.h\
-	math/Grid_math_arith_mac.h\
-	math/Grid_math_arith_mul.h\
-	math/Grid_math_arith_scalar.h\
-	math/Grid_math_arith_sub.h\
-	math/Grid_math_inner.h\
-	math/Grid_math_outer.h\
-	math/Grid_math_peek.h\
-	math/Grid_math_poke.h\
-	math/Grid_math_reality.h\
-	math/Grid_math_tensors.h\
-	math/Grid_math_trace.h\
-	math/Grid_math_traits.h\
-	math/Grid_math_transpose.h\
-	parallelIO/GridNerscIO.h\
-	qcd/Grid_qcd.h\
-	qcd/Grid_qcd_2spinor.h\
-	qcd/Grid_qcd_dirac.h\
-	qcd/Grid_qcd_wilson_dop.h\
-	simd/Grid_vComplexD.h\
-	simd/Grid_vComplexF.h\
-	simd/Grid_vInteger.h\
-	simd/Grid_vRealD.h\
-	simd/Grid_vRealF.h
+nobase_include_HEADERS = algorithms/approx/bigfloat.h		\
+	algorithms/approx/Chebyshev.h				\
+	algorithms/approx/Remez.h				\
+	algorithms/approx/Zolotarev.h				\
+	algorithms/iterative/ConjugateGradient.h		\
+	algorithms/iterative/NormalEquations.h			\
+	algorithms/iterative/SchurRedBlack.h			\
+	algorithms/LinearOperator.h				\
+	algorithms/SparseMatrix.h				\
+	cartesian/Grid_cartesian_base.h				\
+	cartesian/Grid_cartesian_full.h				\
+	cartesian/Grid_cartesian_red_black.h			\
+	communicator/Grid_communicator_base.h			\
+	cshift/Grid_cshift_common.h				\
+	cshift/Grid_cshift_mpi.h				\
+	cshift/Grid_cshift_none.h				\
+	Grid.h							\
+	Grid_algorithms.h					\
+	Grid_aligned_allocator.h				\
+	Grid_cartesian.h					\
+	Grid_communicator.h					\
+	Grid_comparison.h					\
+	Grid_cshift.h						\
+	Grid_extract.h						\
+	Grid_lattice.h						\
+	Grid_math.h						\
+	Grid_simd.h						\
+	Grid_stencil.h						\
+	Grid_threads.h						\
+	lattice/Grid_lattice_arith.h				\
+	lattice/Grid_lattice_base.h				\
+	lattice/Grid_lattice_comparison.h			\
+	lattice/Grid_lattice_conformable.h			\
+	lattice/Grid_lattice_coordinate.h			\
+	lattice/Grid_lattice_ET.h				\
+	lattice/Grid_lattice_local.h				\
+	lattice/Grid_lattice_overload.h				\
+	lattice/Grid_lattice_peekpoke.h				\
+	lattice/Grid_lattice_reality.h				\
+	lattice/Grid_lattice_reduction.h			\
+	lattice/Grid_lattice_rng.h				\
+	lattice/Grid_lattice_trace.h				\
+	lattice/Grid_lattice_transfer.h				\
+	lattice/Grid_lattice_transpose.h			\
+	lattice/Grid_lattice_where.h				\
+	math/Grid_math_arith.h					\
+	math/Grid_math_arith_add.h				\
+	math/Grid_math_arith_mac.h				\
+	math/Grid_math_arith_mul.h				\
+	math/Grid_math_arith_scalar.h				\
+	math/Grid_math_arith_sub.h				\
+	math/Grid_math_inner.h					\
+	math/Grid_math_outer.h					\
+	math/Grid_math_peek.h					\
+	math/Grid_math_poke.h					\
+	math/Grid_math_reality.h				\
+	math/Grid_math_tensors.h				\
+	math/Grid_math_trace.h					\
+	math/Grid_math_traits.h					\
+	math/Grid_math_transpose.h				\
+	parallelIO/GridNerscIO.h				\
+	qcd/Grid_qcd.h						\
+	qcd/Grid_qcd_2spinor.h					\
+	qcd/Grid_qcd_dirac.h					\
+	qcd/Grid_qcd_wilson_dop.h				\
+	simd/Grid_vComplexD.h					\
+	simd/Grid_vComplexF.h					\
+	simd/Grid_vInteger.h					\
+	simd/Grid_vRealD.h					\
+	simd/Grid_vRealF.h					\
+	simd/Grid_vector_types.h				\
+	simd/Grid_sse4.h					
+

@@ -138,16 +138,8 @@ void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
  return ;
 }

-void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
+void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
 {
-  assert((dag==0) ||(dag==1));
-
-  WilsonCompressor compressor(dag);
-  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
-
-PARALLEL_FOR_LOOP
-  for(int sss=0;sss<grid->oSites();sss++){
-
    vHalfSpinColourVector  tmp;    
    vHalfSpinColourVector  chi;    
    vSpinColourVector result;
@@ -155,16 +147,14 @@ PARALLEL_FOR_LOOP
    int offset,local,perm, ptype;

    //    int ss = Stencil._LebesgueReorder[sss];
-    int ss = sss;
    int ssu= ss;

    // Xp
-    int idx = (Xp+dag*4)%8;
-    offset = Stencil._offsets [idx][ss];
-    local  = Stencil._is_local[idx][ss];
-    perm   = Stencil._permute[idx][ss];
+    offset = Stencil._offsets [Xp][ss];
+    local  = Stencil._is_local[Xp][ss];
+    perm   = Stencil._permute[Xp][ss];

-    ptype  = Stencil._permute_type[idx];
+    ptype  = Stencil._permute_type[Xp];
    if ( local && perm ) {
      spProjXp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
@@ -173,16 +163,14 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
    spReconXp(result,Uchi);

    // Yp
-    idx = (Yp+dag*4)%8;
-    offset = Stencil._offsets [idx][ss];
-    local  = Stencil._is_local[idx][ss];
-    perm   = Stencil._permute[idx][ss];
-    ptype  = Stencil._permute_type[idx];
-
+    offset = Stencil._offsets [Yp][ss];
+    local  = Stencil._is_local[Yp][ss];
+    perm   = Stencil._permute[Yp][ss];
+    ptype  = Stencil._permute_type[Yp];
    if ( local && perm ) {
      spProjYp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
@@ -191,15 +179,14 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
    accumReconYp(result,Uchi);

    // Zp
-    idx = (Zp+dag*4)%8;
-    offset = Stencil._offsets [idx][ss];
-    local  = Stencil._is_local[idx][ss];
-    perm   = Stencil._permute[idx][ss];
-    ptype  = Stencil._permute_type[idx];
+    offset = Stencil._offsets [Zp][ss];
+    local  = Stencil._is_local[Zp][ss];
+    perm   = Stencil._permute[Zp][ss];
+    ptype  = Stencil._permute_type[Zp];
    if ( local && perm ) {
      spProjZp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
@@ -208,15 +195,14 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
    accumReconZp(result,Uchi);

    // Tp
-    idx = (Tp+dag*4)%8;
-    offset = Stencil._offsets [idx][ss];
-    local  = Stencil._is_local[idx][ss];
-    perm   = Stencil._permute[idx][ss];
-    ptype  = Stencil._permute_type[idx];
+    offset = Stencil._offsets [Tp][ss];
+    local  = Stencil._is_local[Tp][ss];
+    perm   = Stencil._permute[Tp][ss];
+    ptype  = Stencil._permute_type[Tp];
    if ( local && perm ) {
      spProjTp(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
@@ -225,15 +211,14 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
    accumReconTp(result,Uchi);

    // Xm
-    idx = (Xm+dag*4)%8;
-    offset = Stencil._offsets [idx][ss];
-    local  = Stencil._is_local[idx][ss];
-    perm   = Stencil._permute[idx][ss];
-    ptype  = Stencil._permute_type[idx];
+    offset = Stencil._offsets [Xm][ss];
+    local  = Stencil._is_local[Xm][ss];
+    perm   = Stencil._permute[Xm][ss];
+    ptype  = Stencil._permute_type[Xm];

    if ( local && perm ) 
    {
@@ -244,16 +229,15 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
    accumReconXm(result,Uchi);


    // Ym
-    idx = (Ym+dag*4)%8;
-    offset = Stencil._offsets [idx][ss];
-    local  = Stencil._is_local[idx][ss];
-    perm   = Stencil._permute[idx][ss];
-    ptype  = Stencil._permute_type[idx];
+    offset = Stencil._offsets [Ym][ss];
+    local  = Stencil._is_local[Ym][ss];
+    perm   = Stencil._permute[Ym][ss];
+    ptype  = Stencil._permute_type[Ym];

    if ( local && perm ) {
      spProjYm(tmp,in._odata[offset]);
@@ -263,15 +247,14 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
    accumReconYm(result,Uchi);

    // Zm
-    idx = (Zm+dag*4)%8;
-    offset = Stencil._offsets [idx][ss];
-    local  = Stencil._is_local[idx][ss];
-    perm   = Stencil._permute[idx][ss];
-    ptype  = Stencil._permute_type[idx];
+    offset = Stencil._offsets [Zm][ss];
+    local  = Stencil._is_local[Zm][ss];
+    perm   = Stencil._permute[Zm][ss];
+    ptype  = Stencil._permute_type[Zm];
    if ( local && perm ) {
      spProjZm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
@@ -280,15 +263,14 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
    accumReconZm(result,Uchi);

    // Tm
-    idx = (Tm+dag*4)%8;
-    offset = Stencil._offsets [idx][ss];
-    local  = Stencil._is_local[idx][ss];
-    perm   = Stencil._permute[idx][ss];
-    ptype  = Stencil._permute_type[idx];
+    offset = Stencil._offsets [Tm][ss];
+    local  = Stencil._is_local[Tm][ss];
+    perm   = Stencil._permute[Tm][ss];
+    ptype  = Stencil._permute_type[Tm];
    if ( local && perm ) {
      spProjTm(tmp,in._odata[offset]);
      permute(chi,tmp,ptype);
@@ -297,12 +279,175 @@ PARALLEL_FOR_LOOP
    } else { 
      chi=comm_buf[offset];
    }
-    mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
+    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
    accumReconTm(result,Uchi);

    vstream(out._odata[ss],result);
-  }
+}
+void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out)
+{
+    vHalfSpinColourVector  tmp;    
+    vHalfSpinColourVector  chi;    
+    vSpinColourVector result;
+    vHalfSpinColourVector Uchi;
+    int offset,local,perm, ptype;

+    int ssu= ss;
+
+    // Xp
+    offset = Stencil._offsets [Xm][ss];
+    local  = Stencil._is_local[Xm][ss];
+    perm   = Stencil._permute[Xm][ss];
+
+    ptype  = Stencil._permute_type[Xm];
+    if ( local && perm ) {
+      spProjXp(tmp,in._odata[offset]);
+      permute(chi,tmp,ptype);
+    } else if ( local ) {
+      spProjXp(chi,in._odata[offset]);
+    } else { 
+      chi=comm_buf[offset];
+    }
+    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
+    spReconXp(result,Uchi);
+
+    // Yp
+    offset = Stencil._offsets [Ym][ss];
+    local  = Stencil._is_local[Ym][ss];
+    perm   = Stencil._permute[Ym][ss];
+    ptype  = Stencil._permute_type[Ym];
+    if ( local && perm ) {
+      spProjYp(tmp,in._odata[offset]);
+      permute(chi,tmp,ptype);
+    } else if ( local ) {
+      spProjYp(chi,in._odata[offset]);
+    } else { 
+      chi=comm_buf[offset];
+    }
+    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
+    accumReconYp(result,Uchi);
+
+    // Zp
+    offset = Stencil._offsets [Zm][ss];
+    local  = Stencil._is_local[Zm][ss];
+    perm   = Stencil._permute[Zm][ss];
+    ptype  = Stencil._permute_type[Zm];
+    if ( local && perm ) {
+      spProjZp(tmp,in._odata[offset]);
+      permute(chi,tmp,ptype);
+    } else if ( local ) {
+      spProjZp(chi,in._odata[offset]);
+    } else { 
+      chi=comm_buf[offset];
+    }
+    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
+    accumReconZp(result,Uchi);
+
+    // Tp
+    offset = Stencil._offsets [Tm][ss];
+    local  = Stencil._is_local[Tm][ss];
+    perm   = Stencil._permute[Tm][ss];
+    ptype  = Stencil._permute_type[Tm];
+    if ( local && perm ) {
+      spProjTp(tmp,in._odata[offset]);
+      permute(chi,tmp,ptype);
+    } else if ( local ) {
+      spProjTp(chi,in._odata[offset]);
+    } else { 
+      chi=comm_buf[offset];
+    }
+    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
+    accumReconTp(result,Uchi);
+
+    // Xm
+    offset = Stencil._offsets [Xp][ss];
+    local  = Stencil._is_local[Xp][ss];
+    perm   = Stencil._permute[Xp][ss];
+    ptype  = Stencil._permute_type[Xp];
+
+    if ( local && perm ) 
+    {
+      spProjXm(tmp,in._odata[offset]);
+      permute(chi,tmp,ptype);
+    } else if ( local ) {
+      spProjXm(chi,in._odata[offset]);
+    } else { 
+      chi=comm_buf[offset];
+    }
+    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
+    accumReconXm(result,Uchi);
+
+
+    // Ym
+    offset = Stencil._offsets [Yp][ss];
+    local  = Stencil._is_local[Yp][ss];
+    perm   = Stencil._permute[Yp][ss];
+    ptype  = Stencil._permute_type[Yp];
+
+    if ( local && perm ) {
+      spProjYm(tmp,in._odata[offset]);
+      permute(chi,tmp,ptype);
+    } else if ( local ) {
+      spProjYm(chi,in._odata[offset]);
+    } else { 
+      chi=comm_buf[offset];
+    }
+    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
+    accumReconYm(result,Uchi);
+
+    // Zm
+    offset = Stencil._offsets [Zp][ss];
+    local  = Stencil._is_local[Zp][ss];
+    perm   = Stencil._permute[Zp][ss];
+    ptype  = Stencil._permute_type[Zp];
+    if ( local && perm ) {
+      spProjZm(tmp,in._odata[offset]);
+      permute(chi,tmp,ptype);
+    } else if ( local ) {
+      spProjZm(chi,in._odata[offset]);
+    } else { 
+      chi=comm_buf[offset];
+    }
+    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
+    accumReconZm(result,Uchi);
+
+    // Tm
+    offset = Stencil._offsets [Tp][ss];
+    local  = Stencil._is_local[Tp][ss];
+    perm   = Stencil._permute[Tp][ss];
+    ptype  = Stencil._permute_type[Tp];
+    if ( local && perm ) {
+      spProjTm(tmp,in._odata[offset]);
+      permute(chi,tmp,ptype);
+    } else if ( local ) {
+      spProjTm(chi,in._odata[offset]);
+    } else { 
+      chi=comm_buf[offset];
+    }
+    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
+    accumReconTm(result,Uchi);
+
+    vstream(out._odata[ss],result);
+}
+
+void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
+{
+  assert((dag==0) ||(dag==1));
+
+  WilsonCompressor compressor(dag);
+  Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
+
+  if ( dag ) {
+PARALLEL_FOR_LOOP
+    for(int sss=0;sss<grid->oSites();sss++){
+      DhopSiteDag(sss,in,out);
+    }
+  } else {
+PARALLEL_FOR_LOOP
+    for(int sss=0;sss<grid->oSites();sss++){
+      DhopSite(sss,in,out);
+    }
+  }
 }


@@ -46,6 +46,8 @@ namespace Grid {

      // non-hermitian hopping term; half cb or both
      void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
+      void DhopSite   (int ss,const LatticeFermion &in, LatticeFermion &out);
+      void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out);

      typedef iScalar<iMatrix<vComplex, Nc> > matrix;

@@ -0,0 +1,194 @@
+//----------------------------------------------------------------------
+/*! @file Grid_sse4.h
+  @brief Optimization libraries
+*/
+// Time-stamp: <2015-05-19 17:06:51 neo>
+//----------------------------------------------------------------------
+
+#include <pmmintrin.h>
+
+namespace Optimization {
+  
+  struct Vsplat{
+    //Complex float
+    inline __m128 operator()(float a, float b){
+      return _mm_set_ps(b,a,b,a);
+    }
+    // Real float
+    inline __m128 operator()(float a){
+      return _mm_set_ps(a,a,a,a);
+    }
+    //Complex double
+    inline __m128d operator()(double a, double b){
+      return _mm_set_pd(b,a);
+    }
+    //Real double
+    inline __m128d operator()(double a){
+      return _mm_set_pd(a,a);
+    }
+    //Integer
+    inline __m128i operator()(Integer a){
+      return _mm_set1_epi32(a);
+    }
+  };
+
+  struct Vstore{
+    //Float 
+    inline void operator()(__m128 a, float* F){
+      _mm_store_ps(F,a);
+    }
+    //Double
+    inline void operator()(__m128d a, double* D){
+      _mm_store_pd(D,a);
+    }
+    //Integer
+    inline void operator()(__m128i a, Integer* I){
+      _mm_store_si128((__m128i *)I,a);
+    }
+
+  };
+
+
+
+  struct Vset{
+    // Complex float 
+    inline __m128 operator()(Grid::ComplexF *a){
+      return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
+    }
+    // Complex double 
+    inline __m128d operator()(Grid::ComplexD *a){
+      return _mm_set_pd(a[0].imag(),a[0].real());
+    }
+    // Real float 
+    inline __m128 operator()(float *a){
+      return _mm_set_ps(a[3],a[2],a[1],a[0]);
+    }
+    // Real double
+    inline __m128d operator()(double *a){
+      return _mm_set_pd(a[1],a[0]);
+    }
+    // Integer
+    inline __m128i operator()(Integer *a){
+      return _mm_set_epi32(a[0],a[1],a[2],a[3]);
+    }
+
+
+  };
+
+  struct Reduce{
+    //Complex float
+    inline Grid::ComplexF operator()(__m128 in){
+      union {
+	__m128 v1;  
+	float f[4]; 
+      } u128;
+      u128.v1 = _mm_add_ps(in, _mm_shuffle_ps(in,in, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
+      return Grid::ComplexF(u128.f[0], u128.f[1]);   
+    }
+    //Complex double
+    inline Grid::ComplexD operator()(__m128d in){
+      printf("Missing complex double implementation -> FIX\n");
+      return Grid::ComplexD(0,0); // FIXME wrong
+    }
+
+
+
+  };
+
+
+  /////////////////////////////////////////////////////
+  // Arithmetic operations
+  /////////////////////////////////////////////////////
+  struct Sum{
+    //Complex/Real float
+    inline __m128 operator()(__m128 a, __m128 b){
+      return _mm_add_ps(a,b);
+    }
+    //Complex/Real double
+    inline __m128d operator()(__m128d a, __m128d b){
+      return _mm_add_pd(a,b);
+    }
+    //Integer
+    inline __m128i operator()(__m128i a, __m128i b){
+      return _mm_add_epi32(a,b);
+    }
+  };
+
+  struct Sub{
+    //Complex/Real float
+    inline __m128 operator()(__m128 a, __m128 b){
+      return _mm_sub_ps(a,b);
+    }
+    //Complex/Real double
+    inline __m128d operator()(__m128d a, __m128d b){
+      return _mm_sub_pd(a,b);
+    }
+    //Integer
+    inline __m128i operator()(__m128i a, __m128i b){
+      return _mm_sub_epi32(a,b);
+    }
+  };
+
+  struct MultComplex{
+    // Complex float
+    inline __m128 operator()(__m128 a, __m128 b){
+      __m128 ymm0,ymm1,ymm2;
+      ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
+      ymm0 = _mm_mul_ps(ymm0,b);        // ymm0 <- ar bi, ar br
+      ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
+      ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
+      ymm1 = _mm_mul_ps(ymm1,ymm2);       // ymm1 <- br ai, ai bi
+      return _mm_addsub_ps(ymm0,ymm1);    
+    }
+    // Complex double
+    inline __m128d operator()(__m128d a, __m128d b){
+      __m128d ymm0,ymm1,ymm2;
+      ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar,
+      ymm0 = _mm_mul_pd(ymm0,b);        // ymm0 <- ar bi, ar br
+      ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi   b01
+      ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai   b11
+      ymm1 = _mm_mul_pd(ymm1,ymm2);       // ymm1 <- br ai, ai bi
+      return _mm_addsub_pd(ymm0,ymm1);  
+    }
+  };
+
+  struct Mult{
+    // Real float
+    inline __m128 operator()(__m128 a, __m128 b){
+      return _mm_mul_ps(a,b);
+    }
+    // Real double
+    inline __m128d operator()(__m128d a, __m128d b){
+      return _mm_mul_pd(a,b);
+    }
+    // Integer
+    inline __m128i operator()(__m128i a, __m128i b){
+      return _mm_mul_epi32(a,b);
+    }
+
+  };
+
+
+
+
+}
+
+// Here assign types 
+namespace Grid {
+  typedef __m128 SIMD_Ftype;  // Single precision type
+  typedef __m128d SIMD_Dtype; // Double precision type
+  typedef __m128i SIMD_Itype; // Integer type
+
+
+  // Function names
+  typedef Optimization::Vsplat VsplatSIMD;
+  typedef Optimization::Vstore VstoreSIMD;
+
+  // Arithmetic operations
+  typedef Optimization::Sum         SumSIMD;
+  typedef Optimization::Sub         SubSIMD;
+  typedef Optimization::Mult        MultSIMD;
+  typedef Optimization::MultComplex MultComplexSIMD;
+  typedef Optimization::Vset        VsetSIMD;
+
+}
@@ -1,6 +1,14 @@
+//---------------------------------------------------------------------------
+/*! @file Grid_vector_types.h
+  @brief Defines templated class Grid_simd to deal with inner vector types
+*/
+// Time-stamp: <2015-05-19 17:20:36 neo>
+//---------------------------------------------------------------------------
 #ifndef GRID_VECTOR_TYPES
 #define GRID_VECTOR_TYPES

+#include "Grid_sse4.h"
+

 namespace Grid {

@@ -13,31 +21,32 @@ namespace Grid {
    struct RealPart< std::complex<T> >{
    typedef T type;
  };
-  ////////////////////////////////////////////////////////

+  ////////////////////////////////////////////////////////
  // Check for complexity with type traits
  template <typename T> 
    struct is_complex : std::false_type {};
  template < typename T > 
    struct is_complex< std::complex<T> >: std::true_type {};
  ////////////////////////////////////////////////////////
-
-
  // Define the operation templates functors
-  template < class SIMD, class Operation > 
-    SIMD binary(SIMD src_1, SIMD src_2, Operation op){
+  // general forms to allow for vsplat syntax
+  // need explicit declaration of types when used since
+  // clang cannot automatically determine the output type sometimes
+  template < class Out, class Input1, class Input2, class Operation > 
+    Out binary(Input1 src_1, Input2 src_2, Operation op){
    return op(src_1, src_2);
-  }
+  } 

-  template < class SIMD, class Operation > 
-    SIMD unary(SIMD src, Operation op){
+  template < class SIMDout, class Input, class Operation > 
+    SIMDout unary(Input src, Operation op){
    return op(src);
-  }
+  } 
+
  ///////////////////////////////////////////////

  /*
    @brief Grid_simd class for the SIMD vector type operations
-
   */
  template < class Scalar_type, class Vector_type > 
    class Grid_simd {
@@ -69,76 +78,91 @@ namespace Grid {
    };


-
       
    ///////////////////////////////////////////////
    // mac, mult, sub, add, adj
-    // Should do an AVX2 version with mac.
    ///////////////////////////////////////////////
    friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
    friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
    friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
    friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
+
+    //not for integer types... FIXME
    friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
        
-    //////////////////////////////////
-    // Initialise to 1,0,i
-    //////////////////////////////////
+    ///////////////////////////////////////////////
+    // Initialise to 1,0,i for the correct types
+    ///////////////////////////////////////////////
    // if not complex overload here 
-    friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0); }
-    friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0); }
-
+    template <  class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 > 
+      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0); }
+    template <  class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 > 
+      friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0); }
+    
    // overload for complex type
    template <  class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
-    friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0,0.0); }
+      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1.0,0.0); }
    template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
-    friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0,0.0); }
-
+      friend inline void vzero(Grid_simd &ret)     { vsplat(ret,0.0,0.0); }// use xor?
+    
    // For integral type
    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
-    friend inline void vone(Grid_simd &ret)      { vsplat(ret,1); }
+      friend inline void vone(Grid_simd &ret)      { vsplat(ret,1); }
    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
-    friend inline void vzero(Grid_simd &ret)      { vsplat(ret,0); }
+      friend inline void vzero(Grid_simd &ret)      { vsplat(ret,0); }
+    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
+      friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
+    template <  class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > 
+      friend inline void vfalse(vInteger &ret){vsplat(ret,0);}


-    
    // do not compile if real or integer, send an error message from the compiler
    template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
    friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
-       
-
-    
   
    ////////////////////////////////////
    // Arithmetic operator overloads +,-,*
    ////////////////////////////////////
    friend inline Grid_simd operator + (Grid_simd a, Grid_simd b)
    {
-      vComplexF ret;
-      // FIXME call the binary op
+      Grid_simd ret;
+      ret.v = binary<Vector_type>(a.v, b.v, SumSIMD());
      return ret;
    };
        
    friend inline Grid_simd operator - (Grid_simd a, Grid_simd b)
    {
-      vComplexF ret;
-      // FIXME call the binary op
+      Grid_simd ret;
+      ret.v = binary<Vector_type>(a.v, b.v, SubSIMD());
      return ret;
    };
        
-    friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
+    // Distinguish between complex types and others
+    template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 >
+      friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
      {
-	vComplexF ret;
-	// FIXME call the binary op
+	Grid_simd ret;
+	ret.v = binary<Vector_type>(a.v,b.v, MultComplexSIMD());
 	return ret;
      };
-      
+
+    // Real/Integer types
+    template <  class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 > 
+    friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
+      {
+	Grid_simd ret;
+	ret.v = binary<Vector_type>(a.v,b.v, MultSIMD());
+	return ret;
+      };
+    
+
+

    ////////////////////////////////////////////////////////////////////////
    // FIXME:  gonna remove these load/store, get, set, prefetch
    ////////////////////////////////////////////////////////////////////////
    friend inline void vset(Grid_simd &ret, Scalar_type *a){
-      // FIXME set 
+      ret.v = unary<Vector_type>(a, VsetSIMD());
    }
        
    ///////////////////////
@@ -147,34 +171,33 @@ namespace Grid {
    // overload if complex
    template < class S = Scalar_type > 
    friend inline void vsplat(Grid_simd &ret, typename std::enable_if< is_complex < S >::value, S>::type c){
-      Real a= real(c);
-      Real b= imag(c);
+      Real a = real(c);
+      Real b = imag(c);
      vsplat(ret,a,b);
    }

    // this only for the complex version
    template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 > 
    friend inline void vsplat(Grid_simd &ret,Real a, Real b){
-      // FIXME add operator
+      ret.v = binary<Vector_type>(a, b, VsplatSIMD());
    }    

-    //if real fill with a, if complex fill with a in the real part
+    //if real fill with a, if complex fill with a in the real part (first function above)
    friend inline void vsplat(Grid_simd &ret,Real a){
-      // FIXME add operator
+      ret.v = unary<Vector_type>(a, VsplatSIMD());
    }    


-
    friend inline void vstore(const Grid_simd &ret, Scalar_type *a){
-      //FIXME
+      binary<void>(ret.v, (Real*)a, VstoreSIMD());
    }
+
    friend inline void vprefetch(const Grid_simd &v)
    {
      _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
    }


-
    friend inline Scalar_type Reduce(const Grid_simd & in)
    {
      // FIXME add operator
@@ -221,6 +244,7 @@ namespace Grid {
    inline Grid_simd &operator *=(const Grid_simd &r) {
      *this = (*this)*r;
      return *this;
+      // return (*this)*r; ?
    }
    inline Grid_simd &operator +=(const Grid_simd &r) {
      *this = *this+r;
@@ -233,6 +257,12 @@ namespace Grid {



+     friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
+      {
+        Gpermute<Grid_simd>(y,b,perm);
+      }
+
+     /*
    friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
    {
      Gpermute<Grid_simd>(y,b,perm);
@@ -253,7 +283,7 @@ namespace Grid {
    {
      Gextract<Grid_simd,Scalar_type>(y,extracted);
    }
-
+     */

  };// end of Grid_simd class definition 

@@ -286,11 +316,11 @@ namespace Grid {


  // Define available types (now change names to avoid clashing)
-  typedef __m128 SIMD_type;// decided at compilation time
-  typedef Grid_simd< float                 , SIMD_type > MyRealF;
-  typedef Grid_simd< double                , SIMD_type > MyRealD;
-  typedef Grid_simd< std::complex< float > , SIMD_type > MyComplexF;
-  typedef Grid_simd< std::complex< double >, SIMD_type > MyComplexD;
+
+  typedef Grid_simd< float                 , SIMD_Ftype > MyRealF;
+  typedef Grid_simd< double                , SIMD_Dtype > MyRealD;
+  typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF;
+  typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD;



@@ -0,0 +1,9 @@
+for omp in 1 2 4
+do
+echo > wilson.t$omp
+for vol in 4.4.4.4 4.4.4.8 4.4.8.8  4.8.8.8  8.8.8.8   8.8.8.16 8.8.16.16  8.16.16.16
+do   
+perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp  | grep mflop | awk '{print $3}'`
+echo $vol $perf >> wilson.t$omp
+done
+done
@@ -0,0 +1,7 @@
+plot 'wilson.t1' u 2 w l t "AVX1-OMP=1"
+replot 'wilson.t2' u 2 w l t "AVX1-OMP=2"
+replot 'wilson.t4' u 2 w l t "AVX1-OMP=4"
+set terminal 'pdf'
+set output 'wilson_clang.pdf'
+replot
+quit
@@ -1,5 +1,9 @@
 #include "Grid.h"

+//DEBUG
+#include "simd/Grid_vector_types.h"
+
+
 using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
@@ -151,6 +155,39 @@ int main (int argc, char ** argv)
    
    scMat = sMat*scMat;  // LatticeSpinColourMatrix = LatticeSpinMatrix       * LatticeSpinColourMatrix

+
+
+#ifdef SSE4
+    ///////// Tests the new class Grid_simd 
+    std::complex<double> ctest(3.0,2.0);
+    std::complex<float> ctestf(3.0,2.0);
+    MyComplexF TestMe1(1.0); // fill real part
+    MyComplexD TestMe2(ctest);
+    MyComplexD TestMe3(ctest);// compiler generate conversion of basic types
+    //MyRealF TestMe5(ctest);// Must generate compiler error
+    MyRealD TestMe4(2.0); 
+ 
+    MyComplexF TestMe6(ctestf);
+    MyComplexF TestMe7(ctestf);  
+    
+    MyComplexD TheSum= TestMe2*TestMe3;
+    MyComplexF TheSumF= TestMe6*TestMe7;
+
+    double dsum[2];
+    _mm_store_pd(dsum, TheSum.v);
+    for (int i =0; i< 2; i++)
+      printf("%f\n", dsum[i]);
+
+    float fsum[4];
+    _mm_store_ps(fsum, TheSumF.v);
+    for (int i =0; i< 4; i++)
+      printf("%f\n", fsum[i]);
+
+    vstore(TheSum, &ctest);
+    std::cout << ctest<< std::endl;
+#endif
+    ///////////////////////
+
    // Non-lattice (const objects) * Lattice
    ColourMatrix cm;
    SpinColourMatrix scm;