1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 15:55:37 +00:00

Merging in

Merge branch 'master' of https://github.com/paboyle/Grid
This commit is contained in:
Peter Boyle 2015-05-19 21:30:13 +01:00
commit dc4014668d
20 changed files with 749 additions and 188 deletions

2
README
View File

@ -33,6 +33,8 @@ MPI parallelism is UNIMPLEMENTED and for now only OpenMP and SIMD parallelism is
by setting variables in the command line or in the environment. Here
is are examples:
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2

View File

@ -37,6 +37,8 @@ MPI, OpenMP, and SIMD parallelism are present in the library.
by setting variables in the command line or in the environment. Here
are examples:
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX1
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2

104
configure vendored
View File

@ -4503,6 +4503,11 @@ _ACEOF
# Checks for library functions.
echo
echo Checking libraries
echo :::::::::::::::::::::::::::::::::::::::::::
for ac_func in gettimeofday
do :
ac_fn_cxx_check_func "$LINENO" "gettimeofday" "ac_cv_func_gettimeofday"
@ -4515,6 +4520,105 @@ fi
done
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __gmpf_init in -lgmp" >&5
$as_echo_n "checking for __gmpf_init in -lgmp... " >&6; }
if ${ac_cv_lib_gmp___gmpf_init+:} false; then :
$as_echo_n "(cached) " >&6
else
ac_check_lib_save_LIBS=$LIBS
LIBS="-lgmp $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
#ifdef __cplusplus
extern "C"
#endif
char __gmpf_init ();
int
main ()
{
return __gmpf_init ();
;
return 0;
}
_ACEOF
if ac_fn_cxx_try_link "$LINENO"; then :
ac_cv_lib_gmp___gmpf_init=yes
else
ac_cv_lib_gmp___gmpf_init=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_gmp___gmpf_init" >&5
$as_echo "$ac_cv_lib_gmp___gmpf_init" >&6; }
if test "x$ac_cv_lib_gmp___gmpf_init" = xyes; then :
cat >>confdefs.h <<_ACEOF
#define HAVE_LIBGMP 1
_ACEOF
LIBS="-lgmp $LIBS"
else
as_fn_error $? "GNU Multiple Precision GMP library was not found in your system.
Please install or provide the correct path to your installation
Info at: http://www.gmplib.org" "$LINENO" 5
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for mpfr_init in -lmpfr" >&5
$as_echo_n "checking for mpfr_init in -lmpfr... " >&6; }
if ${ac_cv_lib_mpfr_mpfr_init+:} false; then :
$as_echo_n "(cached) " >&6
else
ac_check_lib_save_LIBS=$LIBS
LIBS="-lmpfr $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
#ifdef __cplusplus
extern "C"
#endif
char mpfr_init ();
int
main ()
{
return mpfr_init ();
;
return 0;
}
_ACEOF
if ac_fn_cxx_try_link "$LINENO"; then :
ac_cv_lib_mpfr_mpfr_init=yes
else
ac_cv_lib_mpfr_mpfr_init=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_mpfr_mpfr_init" >&5
$as_echo "$ac_cv_lib_mpfr_mpfr_init" >&6; }
if test "x$ac_cv_lib_mpfr_mpfr_init" = xyes; then :
cat >>confdefs.h <<_ACEOF
#define HAVE_LIBMPFR 1
_ACEOF
LIBS="-lmpfr $LIBS"
else
as_fn_error $? "GNU Multiple Precision MPFR library was not found in your system.
Please install or provide the correct path to your installation
Info at: http://www.mpfr.org/" "$LINENO" 5
fi

View File

@ -3,7 +3,7 @@
#
# Project Grid package
#
# Time-stamp: <2015-05-18 17:14:20 neo>
# Time-stamp: <2015-05-19 13:51:08 neo>
AC_PREREQ([2.69])
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
@ -46,8 +46,22 @@ AC_TYPE_UINT32_T
AC_TYPE_UINT64_T
# Checks for library functions.
echo
echo Checking libraries
echo :::::::::::::::::::::::::::::::::::::::::::
AC_CHECK_FUNCS([gettimeofday])
AC_CHECK_LIB([gmp],[__gmpf_init],,
[AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
Please install or provide the correct path to your installation
Info at: http://www.gmplib.org)])
AC_CHECK_LIB([mpfr],[mpfr_init],,
[AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
Please install or provide the correct path to your installation
Info at: http://www.mpfr.org/)])

View File

@ -36,6 +36,12 @@
/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1
/* Define to 1 if you have the `gmp' library (-lgmp). */
#define HAVE_LIBGMP 1
/* Define to 1 if you have the `mpfr' library (-lmpfr). */
#define HAVE_LIBMPFR 1
/* Define to 1 if you have the <malloc.h> header file. */
#define HAVE_MALLOC_H 1

View File

@ -35,6 +35,12 @@
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the `gmp' library (-lgmp). */
#undef HAVE_LIBGMP
/* Define to 1 if you have the `mpfr' library (-lmpfr). */
#undef HAVE_LIBMPFR
/* Define to 1 if you have the <malloc.h> header file. */
#undef HAVE_MALLOC_H

View File

@ -14,86 +14,89 @@ endif
# Libraries
#
lib_LIBRARIES = libGrid.a
libGrid_a_SOURCES =\
Grid_init.cc\
stencil/Grid_stencil_common.cc\
qcd/Grid_qcd_dirac.cc\
qcd/Grid_qcd_wilson_dop.cc\
algorithms/approx/Zolotarev.cc\
algorithms/approx/Remez.cc\
libGrid_a_SOURCES = \
Grid_init.cc \
stencil/Grid_stencil_common.cc \
qcd/Grid_qcd_dirac.cc \
qcd/Grid_qcd_wilson_dop.cc \
algorithms/approx/Zolotarev.cc \
algorithms/approx/Remez.cc \
$(extra_sources)
#
# Include files
#
nobase_include_HEADERS = algorithms/approx/bigfloat.h\
algorithms/approx/Chebyshev.h\
algorithms/approx/Remez.h\
algorithms/approx/Zolotarev.h\
algorithms/iterative/ConjugateGradient.h\
algorithms/iterative/NormalEquations.h\
algorithms/iterative/SchurRedBlack.h\
algorithms/LinearOperator.h\
algorithms/SparseMatrix.h\
cartesian/Grid_cartesian_base.h\
cartesian/Grid_cartesian_full.h\
cartesian/Grid_cartesian_red_black.h\
communicator/Grid_communicator_base.h\
cshift/Grid_cshift_common.h\
cshift/Grid_cshift_mpi.h\
cshift/Grid_cshift_none.h\
Grid.h\
Grid_algorithms.h\
Grid_aligned_allocator.h\
Grid_cartesian.h\
Grid_communicator.h\
Grid_comparison.h\
Grid_cshift.h\
Grid_extract.h\
Grid_lattice.h\
Grid_math.h\
Grid_simd.h\
Grid_stencil.h\
Grid_threads.h\
lattice/Grid_lattice_arith.h\
lattice/Grid_lattice_base.h\
lattice/Grid_lattice_comparison.h\
lattice/Grid_lattice_conformable.h\
lattice/Grid_lattice_coordinate.h\
lattice/Grid_lattice_ET.h\
lattice/Grid_lattice_local.h\
lattice/Grid_lattice_overload.h\
lattice/Grid_lattice_peekpoke.h\
lattice/Grid_lattice_reality.h\
lattice/Grid_lattice_reduction.h\
lattice/Grid_lattice_rng.h\
lattice/Grid_lattice_trace.h\
lattice/Grid_lattice_transfer.h\
lattice/Grid_lattice_transpose.h\
lattice/Grid_lattice_where.h\
math/Grid_math_arith.h\
math/Grid_math_arith_add.h\
math/Grid_math_arith_mac.h\
math/Grid_math_arith_mul.h\
math/Grid_math_arith_scalar.h\
math/Grid_math_arith_sub.h\
math/Grid_math_inner.h\
math/Grid_math_outer.h\
math/Grid_math_peek.h\
math/Grid_math_poke.h\
math/Grid_math_reality.h\
math/Grid_math_tensors.h\
math/Grid_math_trace.h\
math/Grid_math_traits.h\
math/Grid_math_transpose.h\
parallelIO/GridNerscIO.h\
qcd/Grid_qcd.h\
qcd/Grid_qcd_2spinor.h\
qcd/Grid_qcd_dirac.h\
qcd/Grid_qcd_wilson_dop.h\
simd/Grid_vComplexD.h\
simd/Grid_vComplexF.h\
simd/Grid_vInteger.h\
simd/Grid_vRealD.h\
simd/Grid_vRealF.h
nobase_include_HEADERS = algorithms/approx/bigfloat.h \
algorithms/approx/Chebyshev.h \
algorithms/approx/Remez.h \
algorithms/approx/Zolotarev.h \
algorithms/iterative/ConjugateGradient.h \
algorithms/iterative/NormalEquations.h \
algorithms/iterative/SchurRedBlack.h \
algorithms/LinearOperator.h \
algorithms/SparseMatrix.h \
cartesian/Grid_cartesian_base.h \
cartesian/Grid_cartesian_full.h \
cartesian/Grid_cartesian_red_black.h \
communicator/Grid_communicator_base.h \
cshift/Grid_cshift_common.h \
cshift/Grid_cshift_mpi.h \
cshift/Grid_cshift_none.h \
Grid.h \
Grid_algorithms.h \
Grid_aligned_allocator.h \
Grid_cartesian.h \
Grid_communicator.h \
Grid_comparison.h \
Grid_cshift.h \
Grid_extract.h \
Grid_lattice.h \
Grid_math.h \
Grid_simd.h \
Grid_stencil.h \
Grid_threads.h \
lattice/Grid_lattice_arith.h \
lattice/Grid_lattice_base.h \
lattice/Grid_lattice_comparison.h \
lattice/Grid_lattice_conformable.h \
lattice/Grid_lattice_coordinate.h \
lattice/Grid_lattice_ET.h \
lattice/Grid_lattice_local.h \
lattice/Grid_lattice_overload.h \
lattice/Grid_lattice_peekpoke.h \
lattice/Grid_lattice_reality.h \
lattice/Grid_lattice_reduction.h \
lattice/Grid_lattice_rng.h \
lattice/Grid_lattice_trace.h \
lattice/Grid_lattice_transfer.h \
lattice/Grid_lattice_transpose.h \
lattice/Grid_lattice_where.h \
math/Grid_math_arith.h \
math/Grid_math_arith_add.h \
math/Grid_math_arith_mac.h \
math/Grid_math_arith_mul.h \
math/Grid_math_arith_scalar.h \
math/Grid_math_arith_sub.h \
math/Grid_math_inner.h \
math/Grid_math_outer.h \
math/Grid_math_peek.h \
math/Grid_math_poke.h \
math/Grid_math_reality.h \
math/Grid_math_tensors.h \
math/Grid_math_trace.h \
math/Grid_math_traits.h \
math/Grid_math_transpose.h \
parallelIO/GridNerscIO.h \
qcd/Grid_qcd.h \
qcd/Grid_qcd_2spinor.h \
qcd/Grid_qcd_dirac.h \
qcd/Grid_qcd_wilson_dop.h \
simd/Grid_vComplexD.h \
simd/Grid_vComplexF.h \
simd/Grid_vInteger.h \
simd/Grid_vRealD.h \
simd/Grid_vRealF.h \
simd/Grid_vector_types.h \
simd/Grid_sse4.h

0
lib/algorithms/approx/Remez.cc Executable file → Normal file
View File

0
lib/algorithms/approx/Remez.h Executable file → Normal file
View File

0
lib/algorithms/approx/Zolotarev.cc Executable file → Normal file
View File

0
lib/algorithms/approx/Zolotarev.h Executable file → Normal file
View File

0
lib/algorithms/approx/bigfloat.h Executable file → Normal file
View File

View File

@ -138,16 +138,8 @@ void WilsonMatrix::MooeeInvDag(const LatticeFermion &in, LatticeFermion &out)
return ;
}
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
void WilsonMatrix::DhopSite(int ss,const LatticeFermion &in, LatticeFermion &out)
{
assert((dag==0) ||(dag==1));
WilsonCompressor compressor(dag);
Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
vHalfSpinColourVector tmp;
vHalfSpinColourVector chi;
vSpinColourVector result;
@ -155,16 +147,14 @@ PARALLEL_FOR_LOOP
int offset,local,perm, ptype;
// int ss = Stencil._LebesgueReorder[sss];
int ss = sss;
int ssu= ss;
// Xp
int idx = (Xp+dag*4)%8;
offset = Stencil._offsets [idx][ss];
local = Stencil._is_local[idx][ss];
perm = Stencil._permute[idx][ss];
offset = Stencil._offsets [Xp][ss];
local = Stencil._is_local[Xp][ss];
perm = Stencil._permute[Xp][ss];
ptype = Stencil._permute_type[idx];
ptype = Stencil._permute_type[Xp];
if ( local && perm ) {
spProjXp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
@ -173,16 +163,14 @@ PARALLEL_FOR_LOOP
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
spReconXp(result,Uchi);
// Yp
idx = (Yp+dag*4)%8;
offset = Stencil._offsets [idx][ss];
local = Stencil._is_local[idx][ss];
perm = Stencil._permute[idx][ss];
ptype = Stencil._permute_type[idx];
offset = Stencil._offsets [Yp][ss];
local = Stencil._is_local[Yp][ss];
perm = Stencil._permute[Yp][ss];
ptype = Stencil._permute_type[Yp];
if ( local && perm ) {
spProjYp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
@ -191,15 +179,14 @@ PARALLEL_FOR_LOOP
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
accumReconYp(result,Uchi);
// Zp
idx = (Zp+dag*4)%8;
offset = Stencil._offsets [idx][ss];
local = Stencil._is_local[idx][ss];
perm = Stencil._permute[idx][ss];
ptype = Stencil._permute_type[idx];
offset = Stencil._offsets [Zp][ss];
local = Stencil._is_local[Zp][ss];
perm = Stencil._permute[Zp][ss];
ptype = Stencil._permute_type[Zp];
if ( local && perm ) {
spProjZp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
@ -208,15 +195,14 @@ PARALLEL_FOR_LOOP
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
accumReconZp(result,Uchi);
// Tp
idx = (Tp+dag*4)%8;
offset = Stencil._offsets [idx][ss];
local = Stencil._is_local[idx][ss];
perm = Stencil._permute[idx][ss];
ptype = Stencil._permute_type[idx];
offset = Stencil._offsets [Tp][ss];
local = Stencil._is_local[Tp][ss];
perm = Stencil._permute[Tp][ss];
ptype = Stencil._permute_type[Tp];
if ( local && perm ) {
spProjTp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
@ -225,15 +211,14 @@ PARALLEL_FOR_LOOP
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
accumReconTp(result,Uchi);
// Xm
idx = (Xm+dag*4)%8;
offset = Stencil._offsets [idx][ss];
local = Stencil._is_local[idx][ss];
perm = Stencil._permute[idx][ss];
ptype = Stencil._permute_type[idx];
offset = Stencil._offsets [Xm][ss];
local = Stencil._is_local[Xm][ss];
perm = Stencil._permute[Xm][ss];
ptype = Stencil._permute_type[Xm];
if ( local && perm )
{
@ -244,16 +229,15 @@ PARALLEL_FOR_LOOP
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
accumReconXm(result,Uchi);
// Ym
idx = (Ym+dag*4)%8;
offset = Stencil._offsets [idx][ss];
local = Stencil._is_local[idx][ss];
perm = Stencil._permute[idx][ss];
ptype = Stencil._permute_type[idx];
offset = Stencil._offsets [Ym][ss];
local = Stencil._is_local[Ym][ss];
perm = Stencil._permute[Ym][ss];
ptype = Stencil._permute_type[Ym];
if ( local && perm ) {
spProjYm(tmp,in._odata[offset]);
@ -263,15 +247,14 @@ PARALLEL_FOR_LOOP
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
accumReconYm(result,Uchi);
// Zm
idx = (Zm+dag*4)%8;
offset = Stencil._offsets [idx][ss];
local = Stencil._is_local[idx][ss];
perm = Stencil._permute[idx][ss];
ptype = Stencil._permute_type[idx];
offset = Stencil._offsets [Zm][ss];
local = Stencil._is_local[Zm][ss];
perm = Stencil._permute[Zm][ss];
ptype = Stencil._permute_type[Zm];
if ( local && perm ) {
spProjZm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
@ -280,15 +263,14 @@ PARALLEL_FOR_LOOP
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
accumReconZm(result,Uchi);
// Tm
idx = (Tm+dag*4)%8;
offset = Stencil._offsets [idx][ss];
local = Stencil._is_local[idx][ss];
perm = Stencil._permute[idx][ss];
ptype = Stencil._permute_type[idx];
offset = Stencil._offsets [Tm][ss];
local = Stencil._is_local[Tm][ss];
perm = Stencil._permute[Tm][ss];
ptype = Stencil._permute_type[Tm];
if ( local && perm ) {
spProjTm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
@ -297,12 +279,175 @@ PARALLEL_FOR_LOOP
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](idx),&chi());
mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
accumReconTm(result,Uchi);
vstream(out._odata[ss],result);
}
}
void WilsonMatrix::DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out)
{
vHalfSpinColourVector tmp;
vHalfSpinColourVector chi;
vSpinColourVector result;
vHalfSpinColourVector Uchi;
int offset,local,perm, ptype;
int ssu= ss;
// Xp
offset = Stencil._offsets [Xm][ss];
local = Stencil._is_local[Xm][ss];
perm = Stencil._permute[Xm][ss];
ptype = Stencil._permute_type[Xm];
if ( local && perm ) {
spProjXp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjXp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
spReconXp(result,Uchi);
// Yp
offset = Stencil._offsets [Ym][ss];
local = Stencil._is_local[Ym][ss];
perm = Stencil._permute[Ym][ss];
ptype = Stencil._permute_type[Ym];
if ( local && perm ) {
spProjYp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjYp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
accumReconYp(result,Uchi);
// Zp
offset = Stencil._offsets [Zm][ss];
local = Stencil._is_local[Zm][ss];
perm = Stencil._permute[Zm][ss];
ptype = Stencil._permute_type[Zm];
if ( local && perm ) {
spProjZp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjZp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
accumReconZp(result,Uchi);
// Tp
offset = Stencil._offsets [Tm][ss];
local = Stencil._is_local[Tm][ss];
perm = Stencil._permute[Tm][ss];
ptype = Stencil._permute_type[Tm];
if ( local && perm ) {
spProjTp(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjTp(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
accumReconTp(result,Uchi);
// Xm
offset = Stencil._offsets [Xp][ss];
local = Stencil._is_local[Xp][ss];
perm = Stencil._permute[Xp][ss];
ptype = Stencil._permute_type[Xp];
if ( local && perm )
{
spProjXm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjXm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
accumReconXm(result,Uchi);
// Ym
offset = Stencil._offsets [Yp][ss];
local = Stencil._is_local[Yp][ss];
perm = Stencil._permute[Yp][ss];
ptype = Stencil._permute_type[Yp];
if ( local && perm ) {
spProjYm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjYm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
accumReconYm(result,Uchi);
// Zm
offset = Stencil._offsets [Zp][ss];
local = Stencil._is_local[Zp][ss];
perm = Stencil._permute[Zp][ss];
ptype = Stencil._permute_type[Zp];
if ( local && perm ) {
spProjZm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjZm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
accumReconZm(result,Uchi);
// Tm
offset = Stencil._offsets [Tp][ss];
local = Stencil._is_local[Tp][ss];
perm = Stencil._permute[Tp][ss];
ptype = Stencil._permute_type[Tp];
if ( local && perm ) {
spProjTm(tmp,in._odata[offset]);
permute(chi,tmp,ptype);
} else if ( local ) {
spProjTm(chi,in._odata[offset]);
} else {
chi=comm_buf[offset];
}
mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
accumReconTm(result,Uchi);
vstream(out._odata[ss],result);
}
void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out,int dag)
{
assert((dag==0) ||(dag==1));
WilsonCompressor compressor(dag);
Stencil.HaloExchange<vSpinColourVector,vHalfSpinColourVector,WilsonCompressor>(in,comm_buf,compressor);
if ( dag ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
DhopSiteDag(sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<grid->oSites();sss++){
DhopSite(sss,in,out);
}
}
}

View File

@ -46,6 +46,8 @@ namespace Grid {
// non-hermitian hopping term; half cb or both
void Dhop(const LatticeFermion &in, LatticeFermion &out,int dag);
void DhopSite (int ss,const LatticeFermion &in, LatticeFermion &out);
void DhopSiteDag(int ss,const LatticeFermion &in, LatticeFermion &out);
typedef iScalar<iMatrix<vComplex, Nc> > matrix;

0
lib/simd/.dirstamp Normal file
View File

194
lib/simd/Grid_sse4.h Normal file
View File

@ -0,0 +1,194 @@
//----------------------------------------------------------------------
/*! @file Grid_sse4.h
@brief Optimization libraries
*/
// Time-stamp: <2015-05-19 17:06:51 neo>
//----------------------------------------------------------------------
#include <pmmintrin.h>
namespace Optimization {
struct Vsplat{
//Complex float
inline __m128 operator()(float a, float b){
return _mm_set_ps(b,a,b,a);
}
// Real float
inline __m128 operator()(float a){
return _mm_set_ps(a,a,a,a);
}
//Complex double
inline __m128d operator()(double a, double b){
return _mm_set_pd(b,a);
}
//Real double
inline __m128d operator()(double a){
return _mm_set_pd(a,a);
}
//Integer
inline __m128i operator()(Integer a){
return _mm_set1_epi32(a);
}
};
struct Vstore{
//Float
inline void operator()(__m128 a, float* F){
_mm_store_ps(F,a);
}
//Double
inline void operator()(__m128d a, double* D){
_mm_store_pd(D,a);
}
//Integer
inline void operator()(__m128i a, Integer* I){
_mm_store_si128((__m128i *)I,a);
}
};
struct Vset{
// Complex float
inline __m128 operator()(Grid::ComplexF *a){
return _mm_set_ps(a[1].imag(), a[1].real(),a[0].imag(),a[0].real());
}
// Complex double
inline __m128d operator()(Grid::ComplexD *a){
return _mm_set_pd(a[0].imag(),a[0].real());
}
// Real float
inline __m128 operator()(float *a){
return _mm_set_ps(a[3],a[2],a[1],a[0]);
}
// Real double
inline __m128d operator()(double *a){
return _mm_set_pd(a[1],a[0]);
}
// Integer
inline __m128i operator()(Integer *a){
return _mm_set_epi32(a[0],a[1],a[2],a[3]);
}
};
struct Reduce{
//Complex float
inline Grid::ComplexF operator()(__m128 in){
union {
__m128 v1;
float f[4];
} u128;
u128.v1 = _mm_add_ps(in, _mm_shuffle_ps(in,in, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
return Grid::ComplexF(u128.f[0], u128.f[1]);
}
//Complex double
inline Grid::ComplexD operator()(__m128d in){
printf("Missing complex double implementation -> FIX\n");
return Grid::ComplexD(0,0); // FIXME wrong
}
};
/////////////////////////////////////////////////////
// Arithmetic operations
/////////////////////////////////////////////////////
struct Sum{
//Complex/Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_add_ps(a,b);
}
//Complex/Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_add_pd(a,b);
}
//Integer
inline __m128i operator()(__m128i a, __m128i b){
return _mm_add_epi32(a,b);
}
};
struct Sub{
//Complex/Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_sub_ps(a,b);
}
//Complex/Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_sub_pd(a,b);
}
//Integer
inline __m128i operator()(__m128i a, __m128i b){
return _mm_sub_epi32(a,b);
}
};
struct MultComplex{
// Complex float
inline __m128 operator()(__m128 a, __m128 b){
__m128 ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(2,2,0,0)); // ymm0 <- ar ar,
ymm0 = _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_ps(b,b,_MM_SHUFFLE(2,3,0,1)); // ymm1 <- br,bi
ymm2 = _mm_shuffle_ps(a,a,_MM_SHUFFLE(3,3,1,1)); // ymm2 <- ai,ai
ymm1 = _mm_mul_ps(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_ps(ymm0,ymm1);
}
// Complex double
inline __m128d operator()(__m128d a, __m128d b){
__m128d ymm0,ymm1,ymm2;
ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar,
ymm0 = _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
ymm1 = _mm_shuffle_pd(b,b,0x1); // ymm1 <- br,bi b01
ymm2 = _mm_shuffle_pd(a,a,0x3); // ymm2 <- ai,ai b11
ymm1 = _mm_mul_pd(ymm1,ymm2); // ymm1 <- br ai, ai bi
return _mm_addsub_pd(ymm0,ymm1);
}
};
struct Mult{
// Real float
inline __m128 operator()(__m128 a, __m128 b){
return _mm_mul_ps(a,b);
}
// Real double
inline __m128d operator()(__m128d a, __m128d b){
return _mm_mul_pd(a,b);
}
// Integer
inline __m128i operator()(__m128i a, __m128i b){
return _mm_mul_epi32(a,b);
}
};
}
// Here assign types
namespace Grid {
typedef __m128 SIMD_Ftype; // Single precision type
typedef __m128d SIMD_Dtype; // Double precision type
typedef __m128i SIMD_Itype; // Integer type
// Function names
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::Vset VsetSIMD;
}

View File

@ -1,6 +1,14 @@
//---------------------------------------------------------------------------
/*! @file Grid_vector_types.h
@brief Defines templated class Grid_simd to deal with inner vector types
*/
// Time-stamp: <2015-05-19 17:20:36 neo>
//---------------------------------------------------------------------------
#ifndef GRID_VECTOR_TYPES
#define GRID_VECTOR_TYPES
#include "Grid_sse4.h"
namespace Grid {
@ -13,31 +21,32 @@ namespace Grid {
struct RealPart< std::complex<T> >{
typedef T type;
};
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// Check for complexity with type traits
template <typename T>
struct is_complex : std::false_type {};
template < typename T >
struct is_complex< std::complex<T> >: std::true_type {};
////////////////////////////////////////////////////////
// Define the operation templates functors
template < class SIMD, class Operation >
SIMD binary(SIMD src_1, SIMD src_2, Operation op){
// general forms to allow for vsplat syntax
// need explicit declaration of types when used since
// clang cannot automatically determine the output type sometimes
template < class Out, class Input1, class Input2, class Operation >
Out binary(Input1 src_1, Input2 src_2, Operation op){
return op(src_1, src_2);
}
template < class SIMD, class Operation >
SIMD unary(SIMD src, Operation op){
template < class SIMDout, class Input, class Operation >
SIMDout unary(Input src, Operation op){
return op(src);
}
///////////////////////////////////////////////
/*
@brief Grid_simd class for the SIMD vector type operations
*/
template < class Scalar_type, class Vector_type >
class Grid_simd {
@ -70,75 +79,90 @@ namespace Grid {
///////////////////////////////////////////////
// mac, mult, sub, add, adj
// Should do an AVX2 version with mac.
///////////////////////////////////////////////
friend inline void mac (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ a,const Grid_simd *__restrict__ x){ *y = (*a)*(*x)+(*y); };
friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
//not for integer types... FIXME
friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
//////////////////////////////////
// Initialise to 1,0,i
//////////////////////////////////
///////////////////////////////////////////////
// Initialise to 1,0,i for the correct types
///////////////////////////////////////////////
// if not complex overload here
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); }
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
template < class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0); }
template < class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0); }
// overload for complex type
template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0,0.0); }
friend inline void vone(Grid_simd &ret) { vsplat(ret,1.0,0.0); }
template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0,0.0); }
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0.0,0.0); }// use xor?
// For integral type
template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 >
friend inline void vone(Grid_simd &ret) { vsplat(ret,1); }
friend inline void vone(Grid_simd &ret) { vsplat(ret,1); }
template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 >
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0); }
friend inline void vzero(Grid_simd &ret) { vsplat(ret,0); }
template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 >
friend inline void vtrue (Grid_simd &ret){vsplat(ret,0xFFFFFFFF);}
template < class S = Scalar_type,typename std::enable_if < std::is_integral < S >::value, int >::type = 0 >
friend inline void vfalse(vInteger &ret){vsplat(ret,0);}
// do not compile if real or integer, send an error message from the compiler
template < class S = Scalar_type,typename std::enable_if < is_complex < S >::value, int >::type = 0 >
friend inline void vcomplex_i(Grid_simd &ret){ vsplat(ret,0.0,1.0);}
////////////////////////////////////
// Arithmetic operator overloads +,-,*
////////////////////////////////////
friend inline Grid_simd operator + (Grid_simd a, Grid_simd b)
{
vComplexF ret;
// FIXME call the binary op
Grid_simd ret;
ret.v = binary<Vector_type>(a.v, b.v, SumSIMD());
return ret;
};
friend inline Grid_simd operator - (Grid_simd a, Grid_simd b)
{
vComplexF ret;
// FIXME call the binary op
Grid_simd ret;
ret.v = binary<Vector_type>(a.v, b.v, SubSIMD());
return ret;
};
friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
// Distinguish between complex types and others
template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 >
friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
{
vComplexF ret;
// FIXME call the binary op
Grid_simd ret;
ret.v = binary<Vector_type>(a.v,b.v, MultComplexSIMD());
return ret;
};
// Real/Integer types
template < class S = Scalar_type,typename std::enable_if < !is_complex < S >::value, int >::type = 0 >
friend inline Grid_simd operator * (Grid_simd a, Grid_simd b)
{
Grid_simd ret;
ret.v = binary<Vector_type>(a.v,b.v, MultSIMD());
return ret;
};
////////////////////////////////////////////////////////////////////////
// FIXME: gonna remove these load/store, get, set, prefetch
////////////////////////////////////////////////////////////////////////
friend inline void vset(Grid_simd &ret, Scalar_type *a){
// FIXME set
ret.v = unary<Vector_type>(a, VsetSIMD());
}
///////////////////////
@ -147,34 +171,33 @@ namespace Grid {
// overload if complex
template < class S = Scalar_type >
friend inline void vsplat(Grid_simd &ret, typename std::enable_if< is_complex < S >::value, S>::type c){
Real a= real(c);
Real b= imag(c);
Real a = real(c);
Real b = imag(c);
vsplat(ret,a,b);
}
// this only for the complex version
template < class S = Scalar_type, typename std::enable_if < is_complex < S >::value, int >::type = 0 >
friend inline void vsplat(Grid_simd &ret,Real a, Real b){
// FIXME add operator
ret.v = binary<Vector_type>(a, b, VsplatSIMD());
}
//if real fill with a, if complex fill with a in the real part
//if real fill with a, if complex fill with a in the real part (first function above)
friend inline void vsplat(Grid_simd &ret,Real a){
// FIXME add operator
ret.v = unary<Vector_type>(a, VsplatSIMD());
}
friend inline void vstore(const Grid_simd &ret, Scalar_type *a){
//FIXME
binary<void>(ret.v, (Real*)a, VstoreSIMD());
}
friend inline void vprefetch(const Grid_simd &v)
{
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
}
friend inline Scalar_type Reduce(const Grid_simd & in)
{
// FIXME add operator
@ -221,6 +244,7 @@ namespace Grid {
inline Grid_simd &operator *=(const Grid_simd &r) {
*this = (*this)*r;
return *this;
// return (*this)*r; ?
}
inline Grid_simd &operator +=(const Grid_simd &r) {
*this = *this+r;
@ -233,6 +257,12 @@ namespace Grid {
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{
Gpermute<Grid_simd>(y,b,perm);
}
/*
friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
{
Gpermute<Grid_simd>(y,b,perm);
@ -253,7 +283,7 @@ namespace Grid {
{
Gextract<Grid_simd,Scalar_type>(y,extracted);
}
*/
};// end of Grid_simd class definition
@ -286,11 +316,11 @@ namespace Grid {
// Define available types (now change names to avoid clashing)
typedef __m128 SIMD_type;// decided at compilation time
typedef Grid_simd< float , SIMD_type > MyRealF;
typedef Grid_simd< double , SIMD_type > MyRealD;
typedef Grid_simd< std::complex< float > , SIMD_type > MyComplexF;
typedef Grid_simd< std::complex< double >, SIMD_type > MyComplexD;
typedef Grid_simd< float , SIMD_Ftype > MyRealF;
typedef Grid_simd< double , SIMD_Dtype > MyRealD;
typedef Grid_simd< std::complex< float > , SIMD_Ftype > MyComplexF;
typedef Grid_simd< std::complex< double >, SIMD_Dtype > MyComplexD;

9
scripts/bench_wilson.sh Executable file
View File

@ -0,0 +1,9 @@
for omp in 1 2 4
do
echo > wilson.t$omp
for vol in 4.4.4.4 4.4.4.8 4.4.8.8 4.8.8.8 8.8.8.8 8.8.8.16 8.8.16.16 8.16.16.16
do
perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp | grep mflop | awk '{print $3}'`
echo $vol $perf >> wilson.t$omp
done
done

7
scripts/wilson.gnu Normal file
View File

@ -0,0 +1,7 @@
plot 'wilson.t1' u 2 w l t "AVX1-OMP=1"
replot 'wilson.t2' u 2 w l t "AVX1-OMP=2"
replot 'wilson.t4' u 2 w l t "AVX1-OMP=4"
set terminal 'pdf'
set output 'wilson_clang.pdf'
replot
quit

View File

@ -1,5 +1,9 @@
#include "Grid.h"
//DEBUG
#include "simd/Grid_vector_types.h"
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
@ -151,6 +155,39 @@ int main (int argc, char ** argv)
scMat = sMat*scMat; // LatticeSpinColourMatrix = LatticeSpinMatrix * LatticeSpinColourMatrix
#ifdef SSE4
///////// Tests the new class Grid_simd
std::complex<double> ctest(3.0,2.0);
std::complex<float> ctestf(3.0,2.0);
MyComplexF TestMe1(1.0); // fill real part
MyComplexD TestMe2(ctest);
MyComplexD TestMe3(ctest);// compiler generate conversion of basic types
//MyRealF TestMe5(ctest);// Must generate compiler error
MyRealD TestMe4(2.0);
MyComplexF TestMe6(ctestf);
MyComplexF TestMe7(ctestf);
MyComplexD TheSum= TestMe2*TestMe3;
MyComplexF TheSumF= TestMe6*TestMe7;
double dsum[2];
_mm_store_pd(dsum, TheSum.v);
for (int i =0; i< 2; i++)
printf("%f\n", dsum[i]);
float fsum[4];
_mm_store_ps(fsum, TheSumF.v);
for (int i =0; i< 4; i++)
printf("%f\n", fsum[i]);
vstore(TheSum, &ctest);
std::cout << ctest<< std::endl;
#endif
///////////////////////
// Non-lattice (const objects) * Lattice
ColourMatrix cm;
SpinColourMatrix scm;