diff --git a/README b/README index 41b66b6b..17e92fa0 100644 --- a/README +++ b/README @@ -33,6 +33,8 @@ MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is by setting variables in the command line or in the environment. Here is are examples: + ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4 + ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1 ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2 diff --git a/README.md b/README.md index 80714d76..e18ca474 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ MPI, OpenMP, and SIMD parallelism are present in the library. by setting variables in the command line or in the environment. Here are examples: + ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4 + ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1 ./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2 diff --git a/configure b/configure index b42143e2..6e27ab11 100755 --- a/configure +++ b/configure @@ -4503,6 +4503,11 @@ _ACEOF # Checks for library functions. +echo +echo Checking libraries +echo ::::::::::::::::::::::::::::::::::::::::::: + + for ac_func in gettimeofday do : ac_fn_cxx_check_func "$LINENO" "gettimeofday" "ac_cv_func_gettimeofday" @@ -4515,6 +4520,105 @@ fi done +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __gmpf_init in -lgmp" >&5 +$as_echo_n "checking for __gmpf_init in -lgmp... " >&6; } +if ${ac_cv_lib_gmp___gmpf_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lgmp $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char __gmpf_init (); +int +main () +{ +return __gmpf_init (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_lib_gmp___gmpf_init=yes +else + ac_cv_lib_gmp___gmpf_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_gmp___gmpf_init" >&5 +$as_echo "$ac_cv_lib_gmp___gmpf_init" >&6; } +if test "x$ac_cv_lib_gmp___gmpf_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBGMP 1 +_ACEOF + + LIBS="-lgmp $LIBS" + +else + as_fn_error $? "GNU Multiple Precision GMP library was not found in your system. +Please install or provide the correct path to your installation +Info at: http://www.gmplib.org" "$LINENO" 5 +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for mpfr_init in -lmpfr" >&5 +$as_echo_n "checking for mpfr_init in -lmpfr... " >&6; } +if ${ac_cv_lib_mpfr_mpfr_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lmpfr $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char mpfr_init (); +int +main () +{ +return mpfr_init (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_lib_mpfr_mpfr_init=yes +else + ac_cv_lib_mpfr_mpfr_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_mpfr_mpfr_init" >&5 +$as_echo "$ac_cv_lib_mpfr_mpfr_init" >&6; } +if test "x$ac_cv_lib_mpfr_mpfr_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBMPFR 1 +_ACEOF + + LIBS="-lmpfr $LIBS" + +else + as_fn_error $? "GNU Multiple Precision MPFR library was not found in your system. +Please install or provide the correct path to your installation +Info at: http://www.mpfr.org/" "$LINENO" 5 +fi + diff --git a/configure.ac b/configure.ac index 14170f4e..93e2b574 100644 --- a/configure.ac +++ b/configure.ac @@ -3,7 +3,7 @@ # # Project Grid package # -# Time-stamp: <2015-05-18 17:14:20 neo> +# Time-stamp: <2015-05-19 13:51:08 neo> AC_PREREQ([2.69]) AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk]) @@ -46,8 +46,22 @@ AC_TYPE_UINT32_T AC_TYPE_UINT64_T # Checks for library functions. +echo +echo Checking libraries +echo ::::::::::::::::::::::::::::::::::::::::::: + + AC_CHECK_FUNCS([gettimeofday]) +AC_CHECK_LIB([gmp],[__gmpf_init],, + [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system. +Please install or provide the correct path to your installation +Info at: http://www.gmplib.org)]) + +AC_CHECK_LIB([mpfr],[mpfr_init],, + [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system. +Please install or provide the correct path to your installation +Info at: http://www.mpfr.org/)]) diff --git a/lib/Grid_config.h b/lib/Grid_config.h index e1674850..78582f3e 100644 --- a/lib/Grid_config.h +++ b/lib/Grid_config.h @@ -36,6 +36,12 @@ /* Define to 1 if you have the header file. */ #define HAVE_INTTYPES_H 1 +/* Define to 1 if you have the `gmp' library (-lgmp). */ +#define HAVE_LIBGMP 1 + +/* Define to 1 if you have the `mpfr' library (-lmpfr). */ +#define HAVE_LIBMPFR 1 + /* Define to 1 if you have the header file. */ #define HAVE_MALLOC_H 1 diff --git a/lib/Grid_config.h.in b/lib/Grid_config.h.in index 0ce09cf5..b7f56d5b 100644 --- a/lib/Grid_config.h.in +++ b/lib/Grid_config.h.in @@ -35,6 +35,12 @@ /* Define to 1 if you have the header file. */ #undef HAVE_INTTYPES_H +/* Define to 1 if you have the `gmp' library (-lgmp). */ +#undef HAVE_LIBGMP + +/* Define to 1 if you have the `mpfr' library (-lmpfr). */ +#undef HAVE_LIBMPFR + /* Define to 1 if you have the header file. */ #undef HAVE_MALLOC_H diff --git a/lib/Makefile.am b/lib/Makefile.am index 938f7ca1..82459763 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -14,86 +14,89 @@ endif # Libraries # lib_LIBRARIES = libGrid.a -libGrid_a_SOURCES =\ - Grid_init.cc\ - stencil/Grid_stencil_common.cc\ - qcd/Grid_qcd_dirac.cc\ - qcd/Grid_qcd_wilson_dop.cc\ - algorithms/approx/Zolotarev.cc\ - algorithms/approx/Remez.cc\ +libGrid_a_SOURCES = \ + Grid_init.cc \ + stencil/Grid_stencil_common.cc \ + qcd/Grid_qcd_dirac.cc \ + qcd/Grid_qcd_wilson_dop.cc \ + algorithms/approx/Zolotarev.cc \ + algorithms/approx/Remez.cc \ $(extra_sources) # # Include files # -nobase_include_HEADERS = algorithms/approx/bigfloat.h\ - algorithms/approx/Chebyshev.h\ - algorithms/approx/Remez.h\ - algorithms/approx/Zolotarev.h\ - algorithms/iterative/ConjugateGradient.h\ - algorithms/iterative/NormalEquations.h\ - algorithms/iterative/SchurRedBlack.h\ - algorithms/LinearOperator.h\ - algorithms/SparseMatrix.h\ - cartesian/Grid_cartesian_base.h\ - cartesian/Grid_cartesian_full.h\ - cartesian/Grid_cartesian_red_black.h\ - communicator/Grid_communicator_base.h\ - cshift/Grid_cshift_common.h\ - cshift/Grid_cshift_mpi.h\ - cshift/Grid_cshift_none.h\ - Grid.h\ - Grid_algorithms.h\ - Grid_aligned_allocator.h\ - Grid_cartesian.h\ - Grid_communicator.h\ - Grid_comparison.h\ - Grid_cshift.h\ - Grid_extract.h\ - Grid_lattice.h\ - Grid_math.h\ - Grid_simd.h\ - Grid_stencil.h\ - Grid_threads.h\ - lattice/Grid_lattice_arith.h\ - lattice/Grid_lattice_base.h\ - lattice/Grid_lattice_comparison.h\ - lattice/Grid_lattice_conformable.h\ - lattice/Grid_lattice_coordinate.h\ - lattice/Grid_lattice_ET.h\ - lattice/Grid_lattice_local.h\ - lattice/Grid_lattice_overload.h\ - lattice/Grid_lattice_peekpoke.h\ - lattice/Grid_lattice_reality.h\ - lattice/Grid_lattice_reduction.h\ - lattice/Grid_lattice_rng.h\ - lattice/Grid_lattice_trace.h\ - lattice/Grid_lattice_transfer.h\ - lattice/Grid_lattice_transpose.h\ - lattice/Grid_lattice_where.h\ - math/Grid_math_arith.h\ - math/Grid_math_arith_add.h\ - math/Grid_math_arith_mac.h\ - math/Grid_math_arith_mul.h\ - math/Grid_math_arith_scalar.h\ - math/Grid_math_arith_sub.h\ - math/Grid_math_inner.h\ - math/Grid_math_outer.h\ - math/Grid_math_peek.h\ - math/Grid_math_poke.h\ - math/Grid_math_reality.h\ - math/Grid_math_tensors.h\ - math/Grid_math_trace.h\ - math/Grid_math_traits.h\ - math/Grid_math_transpose.h\ - parallelIO/GridNerscIO.h\ - qcd/Grid_qcd.h\ - qcd/Grid_qcd_2spinor.h\ - qcd/Grid_qcd_dirac.h\ - qcd/Grid_qcd_wilson_dop.h\ - simd/Grid_vComplexD.h\ - simd/Grid_vComplexF.h\ - simd/Grid_vInteger.h\ - simd/Grid_vRealD.h\ - simd/Grid_vRealF.h +nobase_include_HEADERS = algorithms/approx/bigfloat.h \ + algorithms/approx/Chebyshev.h \ + algorithms/approx/Remez.h \ + algorithms/approx/Zolotarev.h \ + algorithms/iterative/ConjugateGradient.h \ + algorithms/iterative/NormalEquations.h \ + algorithms/iterative/SchurRedBlack.h \ + algorithms/LinearOperator.h \ + algorithms/SparseMatrix.h \ + cartesian/Grid_cartesian_base.h \ + cartesian/Grid_cartesian_full.h \ + cartesian/Grid_cartesian_red_black.h \ + communicator/Grid_communicator_base.h \ + cshift/Grid_cshift_common.h \ + cshift/Grid_cshift_mpi.h \ + cshift/Grid_cshift_none.h \ + Grid.h \ + Grid_algorithms.h \ + Grid_aligned_allocator.h \ + Grid_cartesian.h \ + Grid_communicator.h \ + Grid_comparison.h \ + Grid_cshift.h \ + Grid_extract.h \ + Grid_lattice.h \ + Grid_math.h \ + Grid_simd.h \ + Grid_stencil.h \ + Grid_threads.h \ + lattice/Grid_lattice_arith.h \ + lattice/Grid_lattice_base.h \ + lattice/Grid_lattice_comparison.h \ + lattice/Grid_lattice_conformable.h \ + lattice/Grid_lattice_coordinate.h \ + lattice/Grid_lattice_ET.h \ + lattice/Grid_lattice_local.h \ + lattice/Grid_lattice_overload.h \ + lattice/Grid_lattice_peekpoke.h \ + lattice/Grid_lattice_reality.h \ + lattice/Grid_lattice_reduction.h \ + lattice/Grid_lattice_rng.h \ + lattice/Grid_lattice_trace.h \ + lattice/Grid_lattice_transfer.h \ + lattice/Grid_lattice_transpose.h \ + lattice/Grid_lattice_where.h \ + math/Grid_math_arith.h \ + math/Grid_math_arith_add.h \ + math/Grid_math_arith_mac.h \ + math/Grid_math_arith_mul.h \ + math/Grid_math_arith_scalar.h \ + math/Grid_math_arith_sub.h \ + math/Grid_math_inner.h \ + math/Grid_math_outer.h \ + math/Grid_math_peek.h \ + math/Grid_math_poke.h \ + math/Grid_math_reality.h \ + math/Grid_math_tensors.h \ + math/Grid_math_trace.h \ + math/Grid_math_traits.h \ + math/Grid_math_transpose.h \ + parallelIO/GridNerscIO.h \ + qcd/Grid_qcd.h \ + qcd/Grid_qcd_2spinor.h \ + qcd/Grid_qcd_dirac.h \ + qcd/Grid_qcd_wilson_dop.h \ + simd/Grid_vComplexD.h \ + simd/Grid_vComplexF.h \ + simd/Grid_vInteger.h \ + simd/Grid_vRealD.h \ + simd/Grid_vRealF.h \ + simd/Grid_vector_types.h \ + simd/Grid_sse4.h + diff --git a/lib/algorithms/approx/Remez.cc b/lib/algorithms/approx/Remez.cc old mode 100755 new mode 100644 diff --git a/lib/algorithms/approx/Remez.h b/lib/algorithms/approx/Remez.h old mode 100755 new mode 100644 diff --git a/lib/algorithms/approx/Zolotarev.cc b/lib/algorithms/approx/Zolotarev.cc old mode 100755 new mode 100644 diff --git a/lib/algorithms/approx/Zolotarev.h b/lib/algorithms/approx/Zolotarev.h old mode 100755 new mode 100644 diff --git a/lib/algorithms/approx/bigfloat.h b/lib/algorithms/approx/bigfloat.h old mode 100755 new mode 100644 diff --git a/lib/qcd/Grid_qcd_wilson_dop.cc b/lib/qcd/Grid_qcd_wilson_dop.cc index 5143daf3..9ef4af0c 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.cc +++ b/lib/qcd/Grid_qcd_wilson_dop.cc @@ -138,16 +138,8 @@ void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out) return ; } -void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag) +void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out) { - assert((dag==0) ||(dag==1)); - - WilsonCompressor compressor(dag); - Stencil.HaloExchange(in,comm_buf,compressor); - -PARALLEL_FOR_LOOP - for(int sss=0;sssoSites();sss++){ - vHalfSpinColourVector tmp; vHalfSpinColourVector chi; vSpinColourVector result; @@ -155,16 +147,14 @@ PARALLEL_FOR_LOOP int offset,local,perm, ptype; // int ss = Stencil._LebesgueReorder[sss]; - int ss = sss; int ssu= ss; // Xp - int idx = (Xp+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; + offset = Stencil._offsets [Xp][ss]; + local = Stencil._is_local[Xp][ss]; + perm = Stencil._permute[Xp][ss]; - ptype = Stencil._permute_type[idx]; + ptype = Stencil._permute_type[Xp]; if ( local && perm ) { spProjXp(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -173,16 +163,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); spReconXp(result,Uchi); // Yp - idx = (Yp+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; - + offset = Stencil._offsets [Yp][ss]; + local = Stencil._is_local[Yp][ss]; + perm = Stencil._permute[Yp][ss]; + ptype = Stencil._permute_type[Yp]; if ( local && perm ) { spProjYp(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -191,15 +179,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); accumReconYp(result,Uchi); // Zp - idx = (Zp+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Zp][ss]; + local = Stencil._is_local[Zp][ss]; + perm = Stencil._permute[Zp][ss]; + ptype = Stencil._permute_type[Zp]; if ( local && perm ) { spProjZp(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -208,15 +195,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); accumReconZp(result,Uchi); // Tp - idx = (Tp+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Tp][ss]; + local = Stencil._is_local[Tp][ss]; + perm = Stencil._permute[Tp][ss]; + ptype = Stencil._permute_type[Tp]; if ( local && perm ) { spProjTp(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -225,15 +211,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); accumReconTp(result,Uchi); // Xm - idx = (Xm+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Xm][ss]; + local = Stencil._is_local[Xm][ss]; + perm = Stencil._permute[Xm][ss]; + ptype = Stencil._permute_type[Xm]; if ( local && perm ) { @@ -244,16 +229,15 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Xm),&chi()); accumReconXm(result,Uchi); // Ym - idx = (Ym+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Ym][ss]; + local = Stencil._is_local[Ym][ss]; + perm = Stencil._permute[Ym][ss]; + ptype = Stencil._permute_type[Ym]; if ( local && perm ) { spProjYm(tmp,in._odata[offset]); @@ -263,15 +247,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Ym),&chi()); accumReconYm(result,Uchi); // Zm - idx = (Zm+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Zm][ss]; + local = Stencil._is_local[Zm][ss]; + perm = Stencil._permute[Zm][ss]; + ptype = Stencil._permute_type[Zm]; if ( local && perm ) { spProjZm(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -280,15 +263,14 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Zm),&chi()); accumReconZm(result,Uchi); // Tm - idx = (Tm+dag*4)%8; - offset = Stencil._offsets [idx][ss]; - local = Stencil._is_local[idx][ss]; - perm = Stencil._permute[idx][ss]; - ptype = Stencil._permute_type[idx]; + offset = Stencil._offsets [Tm][ss]; + local = Stencil._is_local[Tm][ss]; + perm = Stencil._permute[Tm][ss]; + ptype = Stencil._permute_type[Tm]; if ( local && perm ) { spProjTm(tmp,in._odata[offset]); permute(chi,tmp,ptype); @@ -297,12 +279,175 @@ PARALLEL_FOR_LOOP } else { chi=comm_buf[offset]; } - mult(&Uchi(),&Umu._odata[ssu](idx),&chi()); + mult(&Uchi(),&Umu._odata[ssu](Tm),&chi()); accumReconTm(result,Uchi); vstream(out._odata[ss],result); - } +} +void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out) +{ + vHalfSpinColourVector tmp; + vHalfSpinColourVector chi; + vSpinColourVector result; + vHalfSpinColourVector Uchi; + int offset,local,perm, ptype; + int ssu= ss; + + // Xp + offset = Stencil._offsets [Xm][ss]; + local = Stencil._is_local[Xm][ss]; + perm = Stencil._permute[Xm][ss]; + + ptype = Stencil._permute_type[Xm]; + if ( local && perm ) { + spProjXp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjXp(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Xm),&chi()); + spReconXp(result,Uchi); + + // Yp + offset = Stencil._offsets [Ym][ss]; + local = Stencil._is_local[Ym][ss]; + perm = Stencil._permute[Ym][ss]; + ptype = Stencil._permute_type[Ym]; + if ( local && perm ) { + spProjYp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjYp(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Ym),&chi()); + accumReconYp(result,Uchi); + + // Zp + offset = Stencil._offsets [Zm][ss]; + local = Stencil._is_local[Zm][ss]; + perm = Stencil._permute[Zm][ss]; + ptype = Stencil._permute_type[Zm]; + if ( local && perm ) { + spProjZp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjZp(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Zm),&chi()); + accumReconZp(result,Uchi); + + // Tp + offset = Stencil._offsets [Tm][ss]; + local = Stencil._is_local[Tm][ss]; + perm = Stencil._permute[Tm][ss]; + ptype = Stencil._permute_type[Tm]; + if ( local && perm ) { + spProjTp(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjTp(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Tm),&chi()); + accumReconTp(result,Uchi); + + // Xm + offset = Stencil._offsets [Xp][ss]; + local = Stencil._is_local[Xp][ss]; + perm = Stencil._permute[Xp][ss]; + ptype = Stencil._permute_type[Xp]; + + if ( local && perm ) + { + spProjXm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjXm(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Xp),&chi()); + accumReconXm(result,Uchi); + + + // Ym + offset = Stencil._offsets [Yp][ss]; + local = Stencil._is_local[Yp][ss]; + perm = Stencil._permute[Yp][ss]; + ptype = Stencil._permute_type[Yp]; + + if ( local && perm ) { + spProjYm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjYm(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Yp),&chi()); + accumReconYm(result,Uchi); + + // Zm + offset = Stencil._offsets [Zp][ss]; + local = Stencil._is_local[Zp][ss]; + perm = Stencil._permute[Zp][ss]; + ptype = Stencil._permute_type[Zp]; + if ( local && perm ) { + spProjZm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjZm(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Zp),&chi()); + accumReconZm(result,Uchi); + + // Tm + offset = Stencil._offsets [Tp][ss]; + local = Stencil._is_local[Tp][ss]; + perm = Stencil._permute[Tp][ss]; + ptype = Stencil._permute_type[Tp]; + if ( local && perm ) { + spProjTm(tmp,in._odata[offset]); + permute(chi,tmp,ptype); + } else if ( local ) { + spProjTm(chi,in._odata[offset]); + } else { + chi=comm_buf[offset]; + } + mult(&Uchi(),&Umu._odata[ssu](Tp),&chi()); + accumReconTm(result,Uchi); + + vstream(out._odata[ss],result); +} + +void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag) +{ + assert((dag==0) ||(dag==1)); + + WilsonCompressor compressor(dag); + Stencil.HaloExchange(in,comm_buf,compressor); + + if ( dag ) { +PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + DhopSiteDag(sss,in,out); + } + } else { +PARALLEL_FOR_LOOP + for(int sss=0;sssoSites();sss++){ + DhopSite(sss,in,out); + } + } } diff --git a/lib/qcd/Grid_qcd_wilson_dop.h b/lib/qcd/Grid_qcd_wilson_dop.h index 1db95b33..8d93a926 100644 --- a/lib/qcd/Grid_qcd_wilson_dop.h +++ b/lib/qcd/Grid_qcd_wilson_dop.h @@ -46,6 +46,8 @@ namespace Grid { // non-hermitian hopping term; half cb or both void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag); + void DhopSite (int ss,const LatticeFermion &in, LatticeFermion &out); + void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out); typedef iScalar > matrix; diff --git a/lib/simd/.dirstamp b/lib/simd/.dirstamp new file mode 100644 index 00000000..e69de29b diff --git a/lib/simd/Grid_sse4.h b/lib/simd/Grid_sse4.h new file mode 100644 index 00000000..ed4039b7 --- /dev/null +++ b/lib/simd/Grid_sse4.h @@ -0,0 +1,194 @@ +//---------------------------------------------------------------------- +/*! @file Grid_sse4.h + @brief Optimization libraries +*/ +// Time-stamp: <2015-05-19 17:06:51 neo> +//---------------------------------------------------------------------- + +#include + +namespace Optimization { + + struct Vsplat{ + //Complex float + inline __m128 operator()(float a, float b){ + return _mm_set_ps(b,a,b,a); + } + // Real float + inline __m128 operator()(float a){ + return _mm_set_ps(a,a,a,a); + } + //Complex double + inline __m128d operator()(double a, double b){ + return _mm_set_pd(b,a); + } + //Real double + inline __m128d operator()(double a){ + return _mm_set_pd(a,a); + } + //Integer + inline __m128i operator()(Integer a){ + return _mm_set1_epi32(a); + } + }; + + struct Vstore{ + //Float + inline void operator()(__m128 a, float* F){ + _mm_store_ps(F,a); + } + //Double + inline void operator()(__m128d a, double* D){ + _mm_store_pd(D,a); + } + //Integer + inline void operator()(__m128i a, Integer* I){ + _mm_store_si128((__m128i *)I,a); + } + + }; + + + + struct Vset{ + // Complex float + inline __m128 operator()(Grid::ComplexF *a){ + return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real()); + } + // Complex double + inline __m128d operator()(Grid::ComplexD *a){ + return _mm_set_pd(a[0].imag(),a[0].real()); + } + // Real float + inline __m128 operator()(float *a){ + return _mm_set_ps(a[3],a[2],a[1],a[0]); + } + // Real double + inline __m128d operator()(double *a){ + return _mm_set_pd(a[1],a[0]); + } + // Integer + inline __m128i operator()(Integer *a){ + return _mm_set_epi32(a[0],a[1],a[2],a[3]); + } + + + }; + + struct Reduce{ + //Complex float + inline Grid::ComplexF operator()(__m128 in){ + union { + __m128 v1; + float f[4]; + } u128; + u128.v1 = _mm_add_ps(in, _mm_shuffle_ps(in,in, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros + return Grid::ComplexF(u128.f[0], u128.f[1]); + } + //Complex double + inline Grid::ComplexD operator()(__m128d in){ + printf("Missing complex double implementation -> FIX\n"); + return Grid::ComplexD(0,0); // FIXME wrong + } + + + + }; + + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + struct Sum{ + //Complex/Real float + inline __m128 operator()(__m128 a, __m128 b){ + return _mm_add_ps(a,b); + } + //Complex/Real double + inline __m128d operator()(__m128d a, __m128d b){ + return _mm_add_pd(a,b); + } + //Integer + inline __m128i operator()(__m128i a, __m128i b){ + return _mm_add_epi32(a,b); + } + }; + + struct Sub{ + //Complex/Real float + inline __m128 operator()(__m128 a, __m128 b){ + return _mm_sub_ps(a,b); + } + //Complex/Real double + inline __m128d operator()(__m128d a, __m128d b){ + return _mm_sub_pd(a,b); + } + //Integer + inline __m128i operator()(__m128i a, __m128i b){ + return _mm_sub_epi32(a,b); + } + }; + + struct MultComplex{ + // Complex float + inline __m128 operator()(__m128 a, __m128 b){ + __m128 ymm0,ymm1,ymm2; + ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar, + ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br + ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi + ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai + ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi + return _mm_addsub_ps(ymm0,ymm1); + } + // Complex double + inline __m128d operator()(__m128d a, __m128d b){ + __m128d ymm0,ymm1,ymm2; + ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, + ymm0 = _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br + ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi b01 + ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai b11 + ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi + return _mm_addsub_pd(ymm0,ymm1); + } + }; + + struct Mult{ + // Real float + inline __m128 operator()(__m128 a, __m128 b){ + return _mm_mul_ps(a,b); + } + // Real double + inline __m128d operator()(__m128d a, __m128d b){ + return _mm_mul_pd(a,b); + } + // Integer + inline __m128i operator()(__m128i a, __m128i b){ + return _mm_mul_epi32(a,b); + } + + }; + + + + +} + +// Here assign types +namespace Grid { + typedef __m128 SIMD_Ftype; // Single precision type + typedef __m128d SIMD_Dtype; // Double precision type + typedef __m128i SIMD_Itype; // Integer type + + + // Function names + typedef Optimization::Vsplat VsplatSIMD; + typedef Optimization::Vstore VstoreSIMD; + + // Arithmetic operations + typedef Optimization::Sum SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::Vset VsetSIMD; + +} diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 4576abe3..97958fe4 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -1,6 +1,14 @@ +//--------------------------------------------------------------------------- +/*! @file Grid_vector_types.h + @brief Defines templated class Grid_simd to deal with inner vector types +*/ +// Time-stamp: <2015-05-19 17:20:36 neo> +//--------------------------------------------------------------------------- #ifndef GRID_VECTOR_TYPES #define GRID_VECTOR_TYPES +#include "Grid_sse4.h" + namespace Grid { @@ -13,31 +21,32 @@ namespace Grid { struct RealPart< std::complex >{ typedef T type; }; - //////////////////////////////////////////////////////// + //////////////////////////////////////////////////////// // Check for complexity with type traits template struct is_complex : std::false_type {}; template < typename T > struct is_complex< std::complex >: std::true_type {}; //////////////////////////////////////////////////////// - - // Define the operation templates functors - template < class SIMD, class Operation > - SIMD binary(SIMD src_1, SIMD src_2, Operation op){ + // general forms to allow for vsplat syntax + // need explicit declaration of types when used since + // clang cannot automatically determine the output type sometimes + template < class Out, class Input1, class Input2, class Operation > + Out binary(Input1 src_1, Input2 src_2, Operation op){ return op(src_1, src_2); - } + } - template < class SIMD, class Operation > - SIMD unary(SIMD src, Operation op){ + template < class SIMDout, class Input, class Operation > + SIMDout unary(Input src, Operation op){ return op(src); - } + } + /////////////////////////////////////////////// /* @brief Grid_simd class for the SIMD vector type operations - */ template < class Scalar_type, class Vector_type > class Grid_simd { @@ -69,76 +78,91 @@ namespace Grid { }; - /////////////////////////////////////////////// // mac, mult, sub, add, adj - // Should do an AVX2 version with mac. /////////////////////////////////////////////// friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); }; friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); } friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); } friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); } + + //not for integer types... FIXME friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); } - ////////////////////////////////// - // Initialise to 1,0,i - ////////////////////////////////// + /////////////////////////////////////////////// + // Initialise to 1,0,i for the correct types + /////////////////////////////////////////////// // if not complex overload here - friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); } - friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); } - + template < class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 > + friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); } + template < class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 > + friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); } + // overload for complex type template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > - friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0,0.0); } + friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0,0.0); } template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > - friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0,0.0); } - + friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0,0.0); }// use xor? + // For integral type template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > - friend inline void vone(Grid_simd &ret) { vsplat(ret,1); } + friend inline void vone(Grid_simd &ret) { vsplat(ret,1); } template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > - friend inline void vzero(Grid_simd &ret) { vsplat(ret,0); } + friend inline void vzero(Grid_simd &ret) { vsplat(ret,0); } + template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > + friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);} + template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 > + friend inline void vfalse(vInteger &ret){vsplat(ret,0);} - // do not compile if real or integer, send an error message from the compiler template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 > friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);} - - - //////////////////////////////////// // Arithmetic operator overloads +,-,* //////////////////////////////////// friend inline Grid_simd operator + (Grid_simd a, Grid_simd b) { - vComplexF ret; - // FIXME call the binary op + Grid_simd ret; + ret.v = binary(a.v, b.v, SumSIMD()); return ret; }; friend inline Grid_simd operator - (Grid_simd a, Grid_simd b) { - vComplexF ret; - // FIXME call the binary op + Grid_simd ret; + ret.v = binary(a.v, b.v, SubSIMD()); return ret; }; - friend inline Grid_simd operator * (Grid_simd a, Grid_simd b) + // Distinguish between complex types and others + template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 > + friend inline Grid_simd operator * (Grid_simd a, Grid_simd b) { - vComplexF ret; - // FIXME call the binary op + Grid_simd ret; + ret.v = binary(a.v,b.v, MultComplexSIMD()); return ret; }; - + + // Real/Integer types + template < class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 > + friend inline Grid_simd operator * (Grid_simd a, Grid_simd b) + { + Grid_simd ret; + ret.v = binary(a.v,b.v, MultSIMD()); + return ret; + }; + + + //////////////////////////////////////////////////////////////////////// // FIXME: gonna remove these load/store, get, set, prefetch //////////////////////////////////////////////////////////////////////// friend inline void vset(Grid_simd &ret, Scalar_type *a){ - // FIXME set + ret.v = unary(a, VsetSIMD()); } /////////////////////// @@ -147,34 +171,33 @@ namespace Grid { // overload if complex template < class S = Scalar_type > friend inline void vsplat(Grid_simd &ret, typename std::enable_if< is_complex < S >::value, S>::type c){ - Real a= real(c); - Real b= imag(c); + Real a = real(c); + Real b = imag(c); vsplat(ret,a,b); } // this only for the complex version template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 > friend inline void vsplat(Grid_simd &ret,Real a, Real b){ - // FIXME add operator + ret.v = binary(a, b, VsplatSIMD()); } - //if real fill with a, if complex fill with a in the real part + //if real fill with a, if complex fill with a in the real part (first function above) friend inline void vsplat(Grid_simd &ret,Real a){ - // FIXME add operator + ret.v = unary(a, VsplatSIMD()); } - friend inline void vstore(const Grid_simd &ret, Scalar_type *a){ - //FIXME + binary(ret.v, (Real*)a, VstoreSIMD()); } + friend inline void vprefetch(const Grid_simd &v) { _mm_prefetch((const char*)&v.v,_MM_HINT_T0); } - friend inline Scalar_type Reduce(const Grid_simd & in) { // FIXME add operator @@ -221,6 +244,7 @@ namespace Grid { inline Grid_simd &operator *=(const Grid_simd &r) { *this = (*this)*r; return *this; + // return (*this)*r; ? } inline Grid_simd &operator +=(const Grid_simd &r) { *this = *this+r; @@ -233,6 +257,12 @@ namespace Grid { + friend inline void permute(Grid_simd &y,Grid_simd b,int perm) + { + Gpermute(y,b,perm); + } + + /* friend inline void permute(Grid_simd &y,Grid_simd b,int perm) { Gpermute(y,b,perm); @@ -253,7 +283,7 @@ namespace Grid { { Gextract(y,extracted); } - + */ };// end of Grid_simd class definition @@ -286,11 +316,11 @@ namespace Grid { // Define available types (now change names to avoid clashing) - typedef __m128 SIMD_type;// decided at compilation time - typedef Grid_simd< float , SIMD_type > MyRealF; - typedef Grid_simd< double , SIMD_type > MyRealD; - typedef Grid_simd< std::complex< float > , SIMD_type > MyComplexF; - typedef Grid_simd< std::complex< double >, SIMD_type > MyComplexD; + + typedef Grid_simd< float , SIMD_Ftype > MyRealF; + typedef Grid_simd< double , SIMD_Dtype > MyRealD; + typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF; + typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD; diff --git a/scripts/bench_wilson.sh b/scripts/bench_wilson.sh new file mode 100755 index 00000000..af73d591 --- /dev/null +++ b/scripts/bench_wilson.sh @@ -0,0 +1,9 @@ +for omp in 1 2 4 +do +echo > wilson.t$omp +for vol in 4.4.4.4 4.4.4.8 4.4.8.8 4.8.8.8 8.8.8.8 8.8.8.16 8.8.16.16 8.16.16.16 +do +perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp | grep mflop | awk '{print $3}'` +echo $vol $perf >> wilson.t$omp +done +done \ No newline at end of file diff --git a/scripts/wilson.gnu b/scripts/wilson.gnu new file mode 100644 index 00000000..69bca5b5 --- /dev/null +++ b/scripts/wilson.gnu @@ -0,0 +1,7 @@ +plot 'wilson.t1' u 2 w l t "AVX1-OMP=1" +replot 'wilson.t2' u 2 w l t "AVX1-OMP=2" +replot 'wilson.t4' u 2 w l t "AVX1-OMP=4" +set terminal 'pdf' +set output 'wilson_clang.pdf' +replot +quit diff --git a/tests/Grid_main.cc b/tests/Grid_main.cc index 1aa4fc43..33b1cf4f 100644 --- a/tests/Grid_main.cc +++ b/tests/Grid_main.cc @@ -1,5 +1,9 @@ #include "Grid.h" +//DEBUG +#include "simd/Grid_vector_types.h" + + using namespace std; using namespace Grid; using namespace Grid::QCD; @@ -151,6 +155,39 @@ int main (int argc, char ** argv) scMat = sMat*scMat; // LatticeSpinColourMatrix = LatticeSpinMatrix * LatticeSpinColourMatrix + + +#ifdef SSE4 + ///////// Tests the new class Grid_simd + std::complex ctest(3.0,2.0); + std::complex ctestf(3.0,2.0); + MyComplexF TestMe1(1.0); // fill real part + MyComplexD TestMe2(ctest); + MyComplexD TestMe3(ctest);// compiler generate conversion of basic types + //MyRealF TestMe5(ctest);// Must generate compiler error + MyRealD TestMe4(2.0); + + MyComplexF TestMe6(ctestf); + MyComplexF TestMe7(ctestf); + + MyComplexD TheSum= TestMe2*TestMe3; + MyComplexF TheSumF= TestMe6*TestMe7; + + double dsum[2]; + _mm_store_pd(dsum, TheSum.v); + for (int i =0; i< 2; i++) + printf("%f\n", dsum[i]); + + float fsum[4]; + _mm_store_ps(fsum, TheSumF.v); + for (int i =0; i< 4; i++) + printf("%f\n", fsum[i]); + + vstore(TheSum, &ctest); + std::cout << ctest<< std::endl; +#endif + /////////////////////// + // Non-lattice (const objects) * Lattice ColourMatrix cm; SpinColourMatrix scm;