mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Merge branch 'master' of https://github.com/paboyle/Grid
Conflicts: lib/tensors/Tensor_trace.h
This commit is contained in:
commit
1e4eca8321
31
.gitignore
vendored
31
.gitignore
vendored
@ -5,6 +5,7 @@
|
||||
*.obj
|
||||
*~
|
||||
errs
|
||||
*#
|
||||
|
||||
# Precompiled Headers
|
||||
*.gch
|
||||
@ -48,3 +49,33 @@ config.status
|
||||
/stamp-h1
|
||||
/config.sub
|
||||
/config.guess
|
||||
|
||||
|
||||
# Packages #
|
||||
############
|
||||
# it's better to unpack these files and commit the raw source
|
||||
# git has its own built in compression methods
|
||||
*.7z
|
||||
*.dmg
|
||||
*.gz
|
||||
*.iso
|
||||
*.jar
|
||||
*.rar
|
||||
*.tar
|
||||
*.zip
|
||||
|
||||
# Logs and databases #
|
||||
######################
|
||||
*.log
|
||||
*.sql
|
||||
*.sqlite
|
||||
|
||||
# OS generated files #
|
||||
######################
|
||||
.DS_Store
|
||||
.DS_Store?
|
||||
._*
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
2
INSTALL
2
INSTALL
@ -1 +1 @@
|
||||
/opt/local/share/automake-1.15/INSTALL
|
||||
/usr/share/automake-1.14/INSTALL
|
@ -20,7 +20,7 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
|
||||
for most programmers.
|
||||
|
||||
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
|
||||
Presently SSE2 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported.
|
||||
Presently SSE4 (128 bit), AVX and AVX2 (256 bit), and IMCI and AVX512 (512 bit) targets are supported.
|
||||
|
||||
These are presented as
|
||||
|
||||
@ -46,3 +46,5 @@ are examples:
|
||||
./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none
|
||||
|
||||
|
||||
For developers:
|
||||
Use reconfigure_script in the scripts/ directory to create the autotools environment
|
||||
|
3
TODO
3
TODO
@ -66,6 +66,9 @@ Insert/Extract
|
||||
|
||||
* Support for ILDG
|
||||
|
||||
* Support different boundary conditions (finite temp, chem. potential ... )
|
||||
|
||||
* Support different fermion representations?
|
||||
|
||||
Actions -- coherent framework for implementing actions and their forces.
|
||||
|
||||
|
32
configure.ac
32
configure.ac
@ -3,7 +3,7 @@
|
||||
#
|
||||
# Project Grid package
|
||||
#
|
||||
# Time-stamp: <2015-05-26 17:18:54 neo>
|
||||
# Time-stamp: <2015-05-27 18:51:47 neo>
|
||||
|
||||
AC_PREREQ([2.63])
|
||||
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk])
|
||||
@ -27,7 +27,7 @@ AC_PROG_CXX
|
||||
AC_OPENMP
|
||||
AC_PROG_RANLIB
|
||||
AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
|
||||
|
||||
AX_EXT
|
||||
|
||||
# Checks for libraries.
|
||||
#AX_GCC_VAR_ATTRIBUTE(aligned)
|
||||
@ -69,26 +69,44 @@ Info at: http://www.mpfr.org/)])
|
||||
|
||||
|
||||
|
||||
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE|AVX|AVX2|AVX512|MIC],\
|
||||
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVX2|AVX512|MIC],\
|
||||
[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, MIC])],\
|
||||
[ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
|
||||
|
||||
supported=no
|
||||
|
||||
case ${ac_SIMD} in
|
||||
SSE4)
|
||||
echo Configuring for SSE4
|
||||
if test x"$ax_cv_support_ssse3_ext" = x"yes"; then dnl minimal support for SSE4
|
||||
AC_DEFINE([SSE4],[1],[SSE4] )
|
||||
supported=yes
|
||||
else
|
||||
AC_MSG_WARN([Your processor does not support SSE4 instructions])
|
||||
fi
|
||||
;;
|
||||
AVX)
|
||||
echo Configuring for AVX
|
||||
if test x"$ax_cv_support_avx_ext" = x"yes"; then dnl minimal support for AVX
|
||||
AC_DEFINE([AVX1],[1],[AVX] )
|
||||
supported=yes
|
||||
else
|
||||
AC_MSG_WARN([Your processor does not support AVX instructions])
|
||||
fi
|
||||
;;
|
||||
AVX2)
|
||||
echo Configuring for AVX2
|
||||
if test x"$ax_cv_support_avx2_ext" = x"yes"; then dnl minimal support for AVX2
|
||||
AC_DEFINE([AVX2],[1],[AVX2] )
|
||||
supported=yes
|
||||
else
|
||||
AC_MSG_WARN([Your processor does not support AVX2 instructions])
|
||||
fi
|
||||
;;
|
||||
AVX512|MIC)
|
||||
echo Configuring for AVX512 and MIC
|
||||
AC_DEFINE([AVX512],[1],[AVX512] )
|
||||
supported="cross compilation"
|
||||
;;
|
||||
*)
|
||||
AC_MSG_ERROR([${ac_SIMD} unsupported --enable-simd option]);
|
||||
@ -129,7 +147,9 @@ then
|
||||
AC_CONFIG_FILES([docs/doxy.cfg])
|
||||
fi
|
||||
|
||||
|
||||
echo
|
||||
echo Creating configuration files
|
||||
echo :::::::::::::::::::::::::::::::::::::::::::
|
||||
AC_CONFIG_FILES(Makefile)
|
||||
AC_CONFIG_FILES(lib/Makefile)
|
||||
AC_CONFIG_FILES(tests/Makefile)
|
||||
@ -150,9 +170,9 @@ The following features are enabled:
|
||||
- os (target) : $target_os
|
||||
- build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
|
||||
- graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
|
||||
|
||||
- Supported SIMD flags : $SIMD_FLAGS
|
||||
----------------------------------------------------------
|
||||
- enabled simd support : ${ac_SIMD}
|
||||
- enabled simd support : ${ac_SIMD} (supported: $supported )
|
||||
- communications type : ${ac_COMMS}
|
||||
|
||||
|
||||
|
2305
docs/doxy.cfg.test
2305
docs/doxy.cfg.test
File diff suppressed because it is too large
Load Diff
0
lib/.dirstamp
Normal file
0
lib/.dirstamp
Normal file
@ -1,5 +1,5 @@
|
||||
/* lib/Grid_config.h. Generated from Grid_config.h.in by configure. */
|
||||
/* lib/Grid_config.h.in. Generated from configure.ac by autoheader. */
|
||||
/* lib/GridConfig.h. Generated from GridConfig.h.in by configure. */
|
||||
/* lib/GridConfig.h.in. Generated from configure.ac by autoheader. */
|
||||
|
||||
/* AVX */
|
||||
/* #undef AVX1 */
|
||||
@ -16,6 +16,15 @@
|
||||
/* GRID_COMMS_NONE */
|
||||
#define GRID_COMMS_NONE 1
|
||||
|
||||
/* Support Altivec instructions */
|
||||
/* #undef HAVE_ALTIVEC */
|
||||
|
||||
/* Support AVX (Advanced Vector Extensions) instructions */
|
||||
/* #undef HAVE_AVX */
|
||||
|
||||
/* Support AVX2 (Advanced Vector Extensions 2) instructions */
|
||||
/* #undef HAVE_AVX2 */
|
||||
|
||||
/* define if the compiler supports basic C++11 syntax */
|
||||
/* #undef HAVE_CXX11 */
|
||||
|
||||
@ -30,6 +39,9 @@
|
||||
/* Define to 1 if you have the <endian.h> header file. */
|
||||
#define HAVE_ENDIAN_H 1
|
||||
|
||||
/* Support FMA3 (Fused Multiply-Add) instructions */
|
||||
/* #undef HAVE_FMA */
|
||||
|
||||
/* Define to 1 if you have the `gettimeofday' function. */
|
||||
#define HAVE_GETTIMEOFDAY 1
|
||||
|
||||
@ -54,9 +66,30 @@
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#define HAVE_MEMORY_H 1
|
||||
|
||||
/* Support mmx instructions */
|
||||
#define HAVE_MMX /**/
|
||||
|
||||
/* Define to 1 if you have the <mm_malloc.h> header file. */
|
||||
#define HAVE_MM_MALLOC_H 1
|
||||
|
||||
/* Support SSE (Streaming SIMD Extensions) instructions */
|
||||
#define HAVE_SSE /**/
|
||||
|
||||
/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
|
||||
#define HAVE_SSE2 /**/
|
||||
|
||||
/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
|
||||
#define HAVE_SSE3 /**/
|
||||
|
||||
/* Support SSE4.1 (Streaming SIMD Extensions 4.1) instructions */
|
||||
#define HAVE_SSE4_1 /**/
|
||||
|
||||
/* Support SSE4.2 (Streaming SIMD Extensions 4.2) instructions */
|
||||
#define HAVE_SSE4_2 /**/
|
||||
|
||||
/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
|
||||
#define HAVE_SSSE3 /**/
|
||||
|
||||
/* Define to 1 if you have the <stdint.h> header file. */
|
||||
#define HAVE_STDINT_H 1
|
||||
|
||||
|
@ -15,6 +15,15 @@
|
||||
/* GRID_COMMS_NONE */
|
||||
#undef GRID_COMMS_NONE
|
||||
|
||||
/* Support Altivec instructions */
|
||||
#undef HAVE_ALTIVEC
|
||||
|
||||
/* Support AVX (Advanced Vector Extensions) instructions */
|
||||
#undef HAVE_AVX
|
||||
|
||||
/* Support AVX2 (Advanced Vector Extensions 2) instructions */
|
||||
#undef HAVE_AVX2
|
||||
|
||||
/* define if the compiler supports basic C++11 syntax */
|
||||
#undef HAVE_CXX11
|
||||
|
||||
@ -29,6 +38,9 @@
|
||||
/* Define to 1 if you have the <endian.h> header file. */
|
||||
#undef HAVE_ENDIAN_H
|
||||
|
||||
/* Support FMA3 (Fused Multiply-Add) instructions */
|
||||
#undef HAVE_FMA
|
||||
|
||||
/* Define to 1 if you have the `gettimeofday' function. */
|
||||
#undef HAVE_GETTIMEOFDAY
|
||||
|
||||
@ -53,9 +65,30 @@
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#undef HAVE_MEMORY_H
|
||||
|
||||
/* Support mmx instructions */
|
||||
#undef HAVE_MMX
|
||||
|
||||
/* Define to 1 if you have the <mm_malloc.h> header file. */
|
||||
#undef HAVE_MM_MALLOC_H
|
||||
|
||||
/* Support SSE (Streaming SIMD Extensions) instructions */
|
||||
#undef HAVE_SSE
|
||||
|
||||
/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
|
||||
#undef HAVE_SSE2
|
||||
|
||||
/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
|
||||
#undef HAVE_SSE3
|
||||
|
||||
/* Support SSE4.1 (Streaming SIMD Extensions 4.1) instructions */
|
||||
#undef HAVE_SSE4_1
|
||||
|
||||
/* Support SSE4.2 (Streaming SIMD Extensions 4.2) instructions */
|
||||
#undef HAVE_SSE4_2
|
||||
|
||||
/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
|
||||
#undef HAVE_SSSE3
|
||||
|
||||
/* Define to 1 if you have the <stdint.h> header file. */
|
||||
#undef HAVE_STDINT_H
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
|
||||
HFILES=./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Remez.h ./algorithms/approx/Zolotarev.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./Algorithms.h ./AlignedAllocator.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_full.h ./cartesian/Cartesian_red_black.h ./Cartesian.h ./communicator/Communicator_base.h ./Communicator.h ./Comparison.h ./cshift/Cshift_common.h ./cshift/Cshift_mpi.h ./cshift/Cshift_none.h ./Cshift.h ./Grid.h ./GridConfig.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_comparison.h ./lattice/Lattice_conformable.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_ET.h ./lattice/Lattice_local.h ./lattice/Lattice_overload.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_reality.h ./lattice/Lattice_reduction.h ./lattice/Lattice_rng.h ./lattice/Lattice_trace.h ./lattice/Lattice_transfer.h ./lattice/Lattice_transpose.h ./lattice/Lattice_where.h ./Lattice.h ./parallelIO/NerscIO.h ./qcd/action/Actions.h ./qcd/action/DiffAction.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h ./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonKernels.h ./qcd/Dirac.h ./qcd/LinalgUtils.h ./qcd/QCD.h ./qcd/SpaceTimeGrid.h ./qcd/TwoSpinor.h ./simd/Grid_avx.h ./simd/Grid_avx512.h ./simd/Grid_qpx.h 
./simd/Grid_sse4.h ./simd/Grid_vector_types.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vRealF.h ./Simd.h ./stencil/Lebesgue.h ./Stencil.h ./tensors/Tensor_arith.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_class.h ./tensors/Tensor_extract_merge.h ./tensors/Tensor_inner.h ./tensors/Tensor_outer.h ./tensors/Tensor_peek.h ./tensors/Tensor_poke.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_traits.h ./tensors/Tensor_transpose.h ./Tensors.h ./Threads.h
|
||||
HFILES=./Cshift.h ./simd/Grid_avx.h ./simd/Grid_vector_types.h ./simd/Grid_sse4.h ./simd/Grid_avx512.h ./simd/Old/Grid_vRealD.h ./simd/Old/Grid_vComplexD.h ./simd/Old/Grid_vInteger.h ./simd/Old/Grid_vComplexF.h ./simd/Old/Grid_vRealF.h ./simd/Grid_qpx.h ./Tensors.h ./Algorithms.h ./communicator/Communicator_base.h ./lattice/Lattice_rng.h ./lattice/Lattice_reduction.h ./lattice/Lattice_transfer.h ./lattice/Lattice_peekpoke.h ./lattice/Lattice_coordinate.h ./lattice/Lattice_comparison.h ./lattice/Lattice_overload.h ./lattice/Lattice_reality.h ./lattice/Lattice_local.h ./lattice/Lattice_conformable.h ./lattice/Lattice_where.h ./lattice/Lattice_arith.h ./lattice/Lattice_base.h ./lattice/Lattice_ET.h ./lattice/Lattice_transpose.h ./lattice/Lattice_trace.h ./Stencil.h ./tensors/Tensor_arith_sub.h ./tensors/Tensor_poke.h ./tensors/Tensor_arith_mul.h ./tensors/Tensor_class.h ./tensors/Tensor_transpose.h ./tensors/Tensor_arith_mac.h ./tensors/Tensor_arith_scalar.h ./tensors/Tensor_reality.h ./tensors/Tensor_trace.h ./tensors/Tensor_arith_add.h ./tensors/Tensor_outer.h ./tensors/Tensor_inner.h ./tensors/Tensor_traits.h ./tensors/Tensor_Ta.h ./tensors/Tensor_peek.h ./tensors/Tensor_arith.h ./tensors/Tensor_extract_merge.h ./Communicator.h ./Cartesian.h ./parallelIO/NerscIO.h ./qcd/QCD.h ./qcd/SpaceTimeGrid.h ./qcd/LinalgUtils.h ./qcd/TwoSpinor.h ./qcd/action/Actions.h ./qcd/action/fermion/CayleyFermion5D.h ./qcd/action/fermion/ScaledShamirFermion.h ./qcd/action/fermion/MobiusFermion.h ./qcd/action/fermion/OverlapWilsonContfracTanhFermion.h ./qcd/action/fermion/PartialFractionFermion5D.h ./qcd/action/fermion/ShamirZolotarevFermion.h ./qcd/action/fermion/FermionOperator.h ./qcd/action/fermion/WilsonFermion5D.h ./qcd/action/fermion/WilsonCompressor.h ./qcd/action/fermion/WilsonKernels.h ./qcd/action/fermion/DomainWallFermion.h ./qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h ./qcd/action/fermion/MobiusZolotarevFermion.h 
./qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h ./qcd/action/fermion/WilsonFermion.h ./qcd/action/fermion/ContinuedFractionFermion5D.h ./qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h ./qcd/Dirac.h ./cshift/Cshift_common.h ./cshift/Cshift_none.h ./cshift/Cshift_mpi.h ./Simd.h ./GridConfig.h ./cartesian/Cartesian_base.h ./cartesian/Cartesian_red_black.h ./cartesian/Cartesian_full.h ./AlignedAllocator.h ./Lattice.h ./Threads.h ./Comparison.h ./Grid.h ./algorithms/iterative/SchurRedBlack.h ./algorithms/iterative/NormalEquations.h ./algorithms/iterative/ConjugateGradient.h ./algorithms/approx/Chebyshev.h ./algorithms/approx/Zolotarev.h ./algorithms/approx/bigfloat.h ./algorithms/approx/bigfloat_double.h ./algorithms/approx/Remez.h ./algorithms/LinearOperator.h ./algorithms/SparseMatrix.h ./stencil/Lebesgue.h
|
||||
|
||||
CCFILES=./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./GridInit.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/Dirac.cc ./qcd/SpaceTimeGrid.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
|
||||
CCFILES=./qcd/SpaceTimeGrid.cc ./qcd/action/fermion/WilsonKernels.cc ./qcd/action/fermion/PartialFractionFermion5D.cc ./qcd/action/fermion/CayleyFermion5D.cc ./qcd/action/fermion/WilsonKernelsHand.cc ./qcd/action/fermion/WilsonFermion.cc ./qcd/action/fermion/ContinuedFractionFermion5D.cc ./qcd/action/fermion/WilsonFermion5D.cc ./qcd/Dirac.cc ./GridInit.cc ./algorithms/approx/Remez.cc ./algorithms/approx/Zolotarev.cc ./stencil/Lebesgue.cc ./stencil/Stencil_common.cc
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <tensors/Tensor_outer.h>
|
||||
#include <tensors/Tensor_transpose.h>
|
||||
#include <tensors/Tensor_trace.h>
|
||||
#include <tensors/Tensor_Ta.h>
|
||||
#include <tensors/Tensor_peek.h>
|
||||
#include <tensors/Tensor_poke.h>
|
||||
#include <tensors/Tensor_reality.h>
|
||||
|
@ -48,5 +48,16 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
|
||||
|
||||
// Lattice-wide Ta: returns a new lattice, built on z's grid, whose entry at
// each outer site ss is Ta(z._odata[ss]).
// The element type of the result is deduced from Ta applied to a single
// site object, so it follows whatever the tensor-level Ta overload returns.
template<class vobj> inline auto Ta(const Lattice<vobj> &z) -> Lattice<decltype(Ta(z._odata[0]))>
|
||||
{
|
||||
Lattice<decltype(Ta(z._odata[0]))> ret(z._grid);
|
||||
// NOTE(review): PARALLEL_FOR_LOOP is defined elsewhere; presumably it
// parallelises the loop that follows -- confirm. Iterations are independent:
// each one reads z._odata[ss] and writes only ret._odata[ss].
PARALLEL_FOR_LOOP
|
||||
for(int ss=0;ss<z._grid->oSites();ss++){
|
||||
ret._odata[ss] = Ta(z._odata[ss]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
|
0
lib/qcd/action/fermion/.dirstamp
Normal file
0
lib/qcd/action/fermion/.dirstamp
Normal file
@ -4,7 +4,7 @@
|
||||
|
||||
Using intrinsics
|
||||
*/
|
||||
// Time-stamp: <2015-05-27 12:07:15 neo>
|
||||
// Time-stamp: <2015-05-29 14:13:30 neo>
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
#include <immintrin.h>
|
||||
@ -261,13 +261,7 @@ namespace Optimization {
|
||||
}
|
||||
// Complex double
|
||||
inline __m256d operator()(__m256d in){
|
||||
return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));//untested
|
||||
/*
|
||||
// original
|
||||
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
|
||||
__m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),_mm256_shuffle_pd(in,in,0x5));
|
||||
return _mm256_shuffle_pd(tmp,tmp,0x5);
|
||||
*/
|
||||
return _mm256_xor_pd(_mm256_addsub_pd(_mm256_setzero_pd(),in), _mm256_set1_pd(-0.f));
|
||||
}
|
||||
// do not define for integer input
|
||||
};
|
||||
|
@ -2,7 +2,7 @@
|
||||
/*! @file Grid_vector_types.h
|
||||
@brief Defines templated class Grid_simd to deal with inner vector types
|
||||
*/
|
||||
// Time-stamp: <2015-05-27 12:04:06 neo>
|
||||
// Time-stamp: <2015-05-29 14:19:48 neo>
|
||||
//---------------------------------------------------------------------------
|
||||
#ifndef GRID_VECTOR_TYPES
|
||||
#define GRID_VECTOR_TYPES
|
||||
@ -55,7 +55,6 @@ namespace Grid {
|
||||
// general forms to allow for vsplat syntax
|
||||
// need explicit declaration of types when used since
|
||||
// clang cannot automatically determine the output type sometimes
|
||||
// use decltype?
|
||||
template < class Out, class Input1, class Input2, class Operation >
|
||||
Out binary(Input1 src_1, Input2 src_2, Operation op){
|
||||
return op(src_1, src_2);
|
||||
|
@ -1 +1 @@
|
||||
timestamp for lib/Grid_config.h
|
||||
timestamp for lib/GridConfig.h
|
||||
|
43
lib/tensors/Tensor_Ta.h
Normal file
43
lib/tensors/Tensor_Ta.h
Normal file
@ -0,0 +1,43 @@
|
||||
#ifndef GRID_MATH_TA_H
|
||||
#define GRID_MATH_TA_H
|
||||
namespace Grid {
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Ta function for scalar, vector, matrix
|
||||
///////////////////////////////////////////////
|
||||
// Base cases of the Ta recursion: for a plain real or complex number the
// argument is returned unchanged. These overloads terminate the recursive
// Ta calls made by the iScalar / iVector templates on their _internal members.
inline ComplexF Ta( const ComplexF &arg){ return arg;}
|
||||
inline ComplexD Ta( const ComplexD &arg){ return arg;}
|
||||
inline RealF Ta( const RealF &arg){ return arg;}
|
||||
inline RealD Ta( const RealD &arg){ return arg;}
|
||||
|
||||
|
||||
// Ta of a scalar tensor wrapper: the wrapper itself adds no index to act on,
// so Ta is simply forwarded to the wrapped object r._internal and the result
// is re-wrapped in an iScalar of the same type.
template<class vtype> inline iScalar<vtype> Ta(const iScalar<vtype>&r)
|
||||
{
|
||||
iScalar<vtype> ret;
|
||||
ret._internal = Ta(r._internal);
|
||||
return ret;
|
||||
}
|
||||
// Ta of a vector tensor: applied independently to each of the N components.
// The vector index itself is not mixed; component i of the result is
// Ta(r._internal[i]).
template<class vtype,int N> inline iVector<vtype,N> Ta(const iVector<vtype,N>&r)
|
||||
{
|
||||
iVector<vtype,N> ret;
|
||||
for(int i=0;i<N;i++){
|
||||
ret._internal[i] = Ta(r._internal[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
// Ta of an N x N matrix tensor, computed in two steps:
//   1. ret(c1,c2) = 0.5 * ( arg(c1,c2) - adj(arg(c2,c1)) )
//   2. ret       -= trace(ret) * (1/N)
// NOTE(review): adj, trace and the matrix "-=" are defined elsewhere.
// Presumably adj is the Hermitian conjugate and subtracting the scaled trace
// acts on the diagonal only, which would make the result the traceless
// anti-Hermitian part of arg -- confirm against those definitions.
template<class vtype,int N> inline iMatrix<vtype,N> Ta(const iMatrix<vtype,N> &arg)
|
||||
{
|
||||
// ret starts as a copy of arg and is updated in place below.
iMatrix<vtype,N> ret(arg);
|
||||
// 1/N, used to scale the trace that is subtracted at the end.
double factor = (1/(double)N);
|
||||
for(int c1=0;c1<N;c1++){
|
||||
for(int c2=0;c2<N;c2++){
|
||||
// Safe in place: each entry reads only its own not-yet-modified value from
// ret, and the transposed entry is read from the untouched input arg.
ret._internal[c1][c2]= (ret._internal[c1][c2] - adj(arg._internal[c2][c1]));
|
||||
ret._internal[c1][c2] *= 0.5;
|
||||
}}
|
||||
//ret = (ret - adj(arg))*0.5;
|
||||
ret -= trace(ret)*factor;
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
@ -45,7 +45,10 @@ namespace Grid {
|
||||
{
|
||||
for(int c2=0;c2<N;c2++){
|
||||
for(int c1=0;c1<N;c1++){
|
||||
add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
|
||||
if ( c1==c2)
|
||||
add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
|
||||
else
|
||||
ret->_internal[c1][c2]=lhs->_internal[c1][c2];
|
||||
}}
|
||||
return;
|
||||
}
|
||||
|
@ -44,7 +44,7 @@ template<class vtype,class ltype,class rtype, int N> strong_inline void sub(iMat
|
||||
const iMatrix<rtype,N> * __restrict__ rhs){
|
||||
for(int c2=0;c2<N;c2++){
|
||||
for(int c1=0;c1<N;c1++){
|
||||
if ( c1!=c2) {
|
||||
if ( c1==c2) {
|
||||
sub(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
|
||||
} else {
|
||||
// Fails -- need unary minus. Catalogue other unops?
|
||||
@ -60,7 +60,7 @@ template<class vtype,class ltype,class rtype, int N> strong_inline void sub(iMat
|
||||
const iScalar<rtype> * __restrict__ rhs){
|
||||
for(int c2=0;c2<N;c2++){
|
||||
for(int c1=0;c1<N;c1++){
|
||||
if ( c1!=c2)
|
||||
if ( c1==c2)
|
||||
sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
|
||||
else
|
||||
ret->_internal[c1][c2]=lhs->_internal[c1][c2];
|
||||
|
@ -2,10 +2,6 @@
|
||||
#define GRID_MATH_REALITY_H
|
||||
namespace Grid {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////// CONJ ///////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// multiply by I; make recursive.
|
||||
///////////////////////////////////////////////
|
||||
@ -151,6 +147,9 @@ template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Can only take the real/imag part of scalar objects, since
|
||||
// lattice objects of different complex nature are non-conformable.
|
||||
|
@ -75,7 +75,7 @@ auto traceIndex(const iMatrix<vtype,N> &arg) -> iMatrix<decltype(traceIndex<Lev
|
||||
// Allow to recurse if vector, but never terminate on a vector
|
||||
// trace of a different index can distribute across the vector index in a replicated way
|
||||
// but we do not trace a vector index.
|
||||
template<int Level,class vtype,int N,typename std::enable_if< iScalar<vtype>::TensorLevel != Level >::type * =nullptr> inline
|
||||
template<int Level,class vtype,int N,typename std::enable_if< iVector<vtype, N>::TensorLevel != Level >::type * =nullptr> inline
|
||||
auto traceIndex(const iVector<vtype,N> &arg) -> iVector<decltype(traceIndex<Level>(arg._internal[0])),N>
|
||||
{
|
||||
iVector<decltype(traceIndex<Level>(arg._internal[0])),N> ret;
|
||||
|
72
m4/ax_check_compile_flag.m4
Normal file
72
m4/ax_check_compile_flag.m4
Normal file
@ -0,0 +1,72 @@
|
||||
# ===========================================================================
|
||||
# http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html
|
||||
# ===========================================================================
|
||||
#
|
||||
# SYNOPSIS
|
||||
#
|
||||
# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS])
|
||||
#
|
||||
# DESCRIPTION
|
||||
#
|
||||
# Check whether the given FLAG works with the current language's compiler
|
||||
# or gives an error. (Warnings, however, are ignored)
|
||||
#
|
||||
# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on
|
||||
# success/failure.
|
||||
#
|
||||
# If EXTRA-FLAGS is defined, it is added to the current language's default
|
||||
# flags (e.g. CFLAGS) when the check is done. The check is thus made with
|
||||
# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to
|
||||
# force the compiler to issue an error when a bad flag is given.
|
||||
#
|
||||
# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this
|
||||
# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG.
|
||||
#
|
||||
# LICENSE
|
||||
#
|
||||
# Copyright (c) 2008 Guido U. Draheim <guidod@gmx.de>
|
||||
# Copyright (c) 2011 Maarten Bosmans <mkbosmans@gmail.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it
|
||||
# under the terms of the GNU General Public License as published by the
|
||||
# Free Software Foundation, either version 3 of the License, or (at your
|
||||
# option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
||||
# Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
# As a special exception, the respective Autoconf Macro's copyright owner
|
||||
# gives unlimited permission to copy, distribute and modify the configure
|
||||
# scripts that are the output of Autoconf when processing the Macro. You
|
||||
# need not follow the terms of the GNU General Public License when using
|
||||
# or distributing such scripts, even though portions of the text of the
|
||||
# Macro appear in them. The GNU General Public License (GPL) does govern
|
||||
# all other use of the material that constitutes the Autoconf Macro.
|
||||
#
|
||||
# This special exception to the GPL applies to versions of the Autoconf
|
||||
# Macro released by the Autoconf Archive. When you make and distribute a
|
||||
# modified version of the Autoconf Macro, you may extend this special
|
||||
# exception to the GPL to apply to your modified version as well.
|
||||
|
||||
#serial 2
|
||||
|
||||
AC_DEFUN([AX_CHECK_COMPILE_FLAG],
|
||||
[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX
|
||||
AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
|
||||
AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
|
||||
ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
|
||||
_AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
|
||||
AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],
|
||||
[AS_VAR_SET(CACHEVAR,[yes])],
|
||||
[AS_VAR_SET(CACHEVAR,[no])])
|
||||
_AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
|
||||
AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes],
|
||||
[m4_default([$2], :)],
|
||||
[m4_default([$3], :)])
|
||||
AS_VAR_POPDEF([CACHEVAR])dnl
|
||||
])dnl AX_CHECK_COMPILE_FLAGS
|
288
m4/ax_ext.m4
Normal file
288
m4/ax_ext.m4
Normal file
@ -0,0 +1,288 @@
|
||||
# ===========================================================================
|
||||
# http://www.gnu.org/software/autoconf-archive/ax_ext.html
|
||||
# ===========================================================================
|
||||
#
|
||||
# SYNOPSIS
|
||||
#
|
||||
# AX_EXT
|
||||
#
|
||||
# DESCRIPTION
|
||||
#
|
||||
# Find supported SIMD extensions by requesting cpuid. When an SIMD
|
||||
# extension is found, the -m"simdextensionname" is added to SIMD_FLAGS if
|
||||
# compiler supports it. For example, if "sse2" is available, then "-msse2"
|
||||
# is added to SIMD_FLAGS.
|
||||
#
|
||||
# This macro calls:
|
||||
#
|
||||
# AC_SUBST(SIMD_FLAGS)
|
||||
#
|
||||
# And defines:
|
||||
#
|
||||
# HAVE_MMX / HAVE_SSE / HAVE_SSE2 / HAVE_SSE3 / HAVE_SSSE3 / HAVE_SSE4_1 / HAVE_SSE4_2 / HAVE_AVX / HAVE_AVX2 / HAVE_FMA / HAVE_ALTIVEC
|
||||
#
|
||||
# LICENSE
|
||||
#
|
||||
# Copyright (c) 2007 Christophe Tournayre <turn3r@users.sourceforge.net>
|
||||
# Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com>
|
||||
#
|
||||
# Copying and distribution of this file, with or without modification, are
|
||||
# permitted in any medium without royalty provided the copyright notice
|
||||
# and this notice are preserved. This file is offered as-is, without any
|
||||
# warranty.
|
||||
|
||||
#serial 13
|
||||
|
||||
AC_DEFUN([AX_EXT],
|
||||
[
|
||||
AC_REQUIRE([AC_CANONICAL_HOST])
|
||||
|
||||
case $host_cpu in
|
||||
powerpc*)
|
||||
AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext],
|
||||
[
|
||||
if test `/usr/sbin/sysctl -a 2>/dev/null| grep -c hw.optional.altivec` != 0; then
|
||||
if test `/usr/sbin/sysctl -n hw.optional.altivec` = 1; then
|
||||
ax_cv_have_altivec_ext=yes
|
||||
fi
|
||||
fi
|
||||
])
|
||||
|
||||
if test "$ax_cv_have_altivec_ext" = yes; then
|
||||
AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions])
|
||||
AX_CHECK_COMPILE_FLAG(-faltivec, [SIMD_FLAGS="$SIMD_FLAGS -faltivec"], [])
|
||||
fi
|
||||
;;
|
||||
|
||||
|
||||
i[[3456]]86*|x86_64*|amd64*)
|
||||
|
||||
AC_REQUIRE([AX_GCC_X86_CPUID])
|
||||
AC_REQUIRE([AX_GCC_X86_AVX_XGETBV])
|
||||
|
||||
AX_GCC_X86_CPUID(0x00000001)
|
||||
ecx=0
|
||||
edx=0
|
||||
ebx=0
|
||||
if test "$ax_cv_gcc_x86_cpuid_0x00000001" != "unknown";
|
||||
then
|
||||
ecx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3`
|
||||
edx=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4`
|
||||
fi
|
||||
|
||||
AX_GCC_X86_CPUID(0x00000007)
|
||||
if test "$ax_cv_gcc_x86_cpuid_0x00000007" != "unknown";
|
||||
then
|
||||
ebx=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 2`
|
||||
fi
|
||||
|
||||
AC_CACHE_CHECK([whether mmx is supported], [ax_cv_have_mmx_ext],
|
||||
[
|
||||
ax_cv_have_mmx_ext=no
|
||||
if test "$((0x$edx>>23&0x01))" = 1; then
|
||||
ax_cv_have_mmx_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
AC_CACHE_CHECK([whether sse is supported], [ax_cv_have_sse_ext],
|
||||
[
|
||||
ax_cv_have_sse_ext=no
|
||||
if test "$((0x$edx>>25&0x01))" = 1; then
|
||||
ax_cv_have_sse_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
AC_CACHE_CHECK([whether sse2 is supported], [ax_cv_have_sse2_ext],
|
||||
[
|
||||
ax_cv_have_sse2_ext=no
|
||||
if test "$((0x$edx>>26&0x01))" = 1; then
|
||||
ax_cv_have_sse2_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
AC_CACHE_CHECK([whether sse3 is supported], [ax_cv_have_sse3_ext],
|
||||
[
|
||||
ax_cv_have_sse3_ext=no
|
||||
if test "$((0x$ecx&0x01))" = 1; then
|
||||
ax_cv_have_sse3_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
AC_CACHE_CHECK([whether ssse3 is supported], [ax_cv_have_ssse3_ext],
|
||||
[
|
||||
ax_cv_have_ssse3_ext=no
|
||||
if test "$((0x$ecx>>9&0x01))" = 1; then
|
||||
ax_cv_have_ssse3_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
AC_CACHE_CHECK([whether sse4.1 is supported], [ax_cv_have_sse41_ext],
|
||||
[
|
||||
ax_cv_have_sse41_ext=no
|
||||
if test "$((0x$ecx>>19&0x01))" = 1; then
|
||||
ax_cv_have_sse41_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
AC_CACHE_CHECK([whether sse4.2 is supported], [ax_cv_have_sse42_ext],
|
||||
[
|
||||
ax_cv_have_sse42_ext=no
|
||||
if test "$((0x$ecx>>20&0x01))" = 1; then
|
||||
ax_cv_have_sse42_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
AC_CACHE_CHECK([whether avx is supported by processor], [ax_cv_have_avx_cpu_ext],
|
||||
[
|
||||
ax_cv_have_avx_cpu_ext=no
|
||||
if test "$((0x$ecx>>28&0x01))" = 1; then
|
||||
ax_cv_have_avx_cpu_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
AC_CACHE_CHECK([whether avx2 is supported by processor], [ax_cv_have_avx2_cpu_ext],
|
||||
[
|
||||
ax_cv_have_avx2_cpu_ext=no
|
||||
if test "$((0x$ebx>>5&0x01))" = 1; then
|
||||
ax_cv_have_avx2_cpu_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
|
||||
AC_CACHE_CHECK([whether fma is supported by processor], [ax_cv_have_fma_cpu_ext],
|
||||
[
|
||||
ax_cv_have_fma_cpu_ext=no
|
||||
if test "$((0x$ecx>>12&0x01))" = 1; then
|
||||
ax_cv_have_fma_cpu_ext=yes
|
||||
fi
|
||||
])
|
||||
|
||||
|
||||
if test x"$ax_cv_have_avx_cpu_ext" = x"yes"; then
|
||||
AX_GCC_X86_AVX_XGETBV(0x00000000)
|
||||
|
||||
xgetbv_eax="0"
|
||||
if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then
|
||||
xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1`
|
||||
fi
|
||||
|
||||
AC_CACHE_CHECK([whether avx is supported by operating system], [ax_cv_have_avx_ext],
|
||||
[
|
||||
ax_cv_have_avx_ext=no
|
||||
|
||||
if test "$((0x$ecx>>27&0x01))" = 1; then
|
||||
if test "$((0x$xgetbv_eax&0x6))" = 6; then
|
||||
ax_cv_have_avx_ext=yes
|
||||
fi
|
||||
fi
|
||||
])
|
||||
if test x"$ax_cv_have_avx_ext" = x"no"; then
|
||||
AC_MSG_WARN([Your processor supports AVX, but your operating system doesn't])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_mmx_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-mmmx, ax_cv_support_mmx_ext=yes, [])
|
||||
if test x"$ax_cv_support_mmx_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -mmmx"
|
||||
AC_DEFINE(HAVE_MMX,,[Support mmx instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports mmx instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_sse_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-msse, ax_cv_support_sse_ext=yes, [])
|
||||
if test x"$ax_cv_support_sse_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -msse"
|
||||
AC_DEFINE(HAVE_SSE,,[Support SSE (Streaming SIMD Extensions) instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports sse instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_sse2_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-msse2, ax_cv_support_sse2_ext=yes, [])
|
||||
if test x"$ax_cv_support_sse2_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -msse2"
|
||||
AC_DEFINE(HAVE_SSE2,,[Support SSE2 (Streaming SIMD Extensions 2) instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports sse2 instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_sse3_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-msse3, ax_cv_support_sse3_ext=yes, [])
|
||||
if test x"$ax_cv_support_sse3_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -msse3"
|
||||
AC_DEFINE(HAVE_SSE3,,[Support SSE3 (Streaming SIMD Extensions 3) instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports sse3 instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_ssse3_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-mssse3, ax_cv_support_ssse3_ext=yes, [])
|
||||
if test x"$ax_cv_support_ssse3_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -mssse3"
|
||||
AC_DEFINE(HAVE_SSSE3,,[Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports ssse3 instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_sse41_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-msse4.1, ax_cv_support_sse41_ext=yes, [])
|
||||
if test x"$ax_cv_support_sse41_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -msse4.1"
|
||||
AC_DEFINE(HAVE_SSE4_1,,[Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports sse4.1 instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_sse42_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-msse4.2, ax_cv_support_sse42_ext=yes, [])
|
||||
if test x"$ax_cv_support_sse42_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -msse4.2"
|
||||
AC_DEFINE(HAVE_SSE4_2,,[Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports sse4.2 instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_avx_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-mavx, ax_cv_support_avx_ext=yes, [])
|
||||
if test x"$ax_cv_support_avx_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -mavx"
|
||||
AC_DEFINE(HAVE_AVX,,[Support AVX (Advanced Vector Extensions) instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports avx instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_avx2_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-mavx2, ax_cv_support_avx2_ext=yes, [])
|
||||
if test x"$ax_cv_support_avx2_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -mavx2"
|
||||
AC_DEFINE(HAVE_AVX2,,[Support AVX2 (Advanced Vector Extensions 2) instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports avx2 instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
if test "$ax_cv_have_fma_ext" = yes; then
|
||||
AX_CHECK_COMPILE_FLAG(-mfma, ax_cv_support_fma_ext=yes, [])
|
||||
if test x"$ax_cv_support_fma_ext" = x"yes"; then
|
||||
SIMD_FLAGS="$SIMD_FLAGS -mfma"
|
||||
AC_DEFINE(HAVE_FMA,,[Support FMA3 (Fused Multiply-Add) instructions])
|
||||
else
|
||||
AC_MSG_WARN([Your processor supports fma instructions but not your compiler, can you try another compiler?])
|
||||
fi
|
||||
fi
|
||||
|
||||
;;
|
||||
esac
|
||||
|
||||
AC_SUBST(SIMD_FLAGS)
|
||||
])
|
79
m4/ax_gcc_x86_avx_xgetbv.m4
Normal file
79
m4/ax_gcc_x86_avx_xgetbv.m4
Normal file
@ -0,0 +1,79 @@
|
||||
# ===========================================================================
|
||||
# http://www.gnu.org/software/autoconf-archive/ax_gcc_x86_avx_xgetbv.html
|
||||
# ===========================================================================
|
||||
#
|
||||
# SYNOPSIS
|
||||
#
|
||||
# AX_GCC_X86_AVX_XGETBV(OP)
|
||||
#
|
||||
# DESCRIPTION
|
||||
#
|
||||
# On later x86 processors with AVX SIMD support, with gcc or a compiler
|
||||
# that has a compatible syntax for inline assembly instructions, run a
|
||||
# small program that executes the xgetbv instruction with input OP. This
|
||||
# can be used to detect if the OS supports AVX instruction usage.
|
||||
#
|
||||
# On output, the values of the eax and edx registers are stored as
|
||||
# hexadecimal strings as "eax:edx" in the cache variable
|
||||
# ax_cv_gcc_x86_avx_xgetbv_OP.
|
||||
#
|
||||
# If the xgetbv instruction fails (because you are running a
|
||||
# cross-compiler, or because you are not using gcc, or because you are on
|
||||
# a processor that doesn't have this instruction),
|
||||
# ax_cv_gcc_x86_avx_xgetbv_OP is set to the string "unknown".
|
||||
#
|
||||
# This macro mainly exists to be used in AX_EXT.
|
||||
#
|
||||
# LICENSE
|
||||
#
|
||||
# Copyright (c) 2013 Michael Petch <mpetch@capp-sysware.com>
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify it
|
||||
# under the terms of the GNU General Public License as published by the
|
||||
# Free Software Foundation, either version 3 of the License, or (at your
|
||||
# option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
||||
# Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
# As a special exception, the respective Autoconf Macro's copyright owner
|
||||
# gives unlimited permission to copy, distribute and modify the configure
|
||||
# scripts that are the output of Autoconf when processing the Macro. You
|
||||
# need not follow the terms of the GNU General Public License when using
|
||||
# or distributing such scripts, even though portions of the text of the
|
||||
# Macro appear in them. The GNU General Public License (GPL) does govern
|
||||
# all other use of the material that constitutes the Autoconf Macro.
|
||||
#
|
||||
# This special exception to the GPL applies to versions of the Autoconf
|
||||
# Macro released by the Autoconf Archive. When you make and distribute a
|
||||
# modified version of the Autoconf Macro, you may extend this special
|
||||
# exception to the GPL to apply to your modified version as well.
|
||||
|
||||
#serial 1
|
||||
|
||||
# AX_GCC_X86_AVX_XGETBV(OP)
# -------------------------
# Compile and run a small C program that executes xgetbv with ecx = OP and
# cache the two result registers, printed in hexadecimal as "eax:edx", in
# ax_cv_gcc_x86_avx_xgetbv_OP.  The cache variable is set to "unknown" when
# the program cannot be built or run, and when cross-compiling.
AC_DEFUN([AX_GCC_X86_AVX_XGETBV],
[AC_REQUIRE([AC_PROG_CC])
AC_LANG_PUSH([C])
AC_CACHE_CHECK(for x86-AVX xgetbv $1 output, ax_cv_gcc_x86_avx_xgetbv_$1,
 [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
     int op = $1, eax, edx;
     FILE *f;
      /* Opcodes for xgetbv, emitted as raw bytes so old assemblers accept it */
      __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0"
        : "=a" (eax), "=d" (edx)
        : "c" (op));
     f = fopen("conftest_xgetbv", "w"); if (!f) return 1;
     fprintf(f, "%x:%x\n", eax, edx);
     fclose(f);
     return 0;
])],
     [ax_cv_gcc_x86_avx_xgetbv_$1=`cat conftest_xgetbv`; rm -f conftest_xgetbv],
     [ax_cv_gcc_x86_avx_xgetbv_$1=unknown; rm -f conftest_xgetbv],
     [ax_cv_gcc_x86_avx_xgetbv_$1=unknown])])
AC_LANG_POP([C])
])
|
45
m4/ax_gcc_x86_cpuid.m4
Normal file
45
m4/ax_gcc_x86_cpuid.m4
Normal file
@ -0,0 +1,45 @@
|
||||
dnl @synopsis AX_GCC_X86_CPUID(OP)
|
||||
dnl
|
||||
dnl @summary run x86 cpuid instruction OP using gcc inline assembler
|
||||
dnl
|
||||
dnl On Pentium and later x86 processors, with gcc or a compiler that
|
||||
dnl has a compatible syntax for inline assembly instructions, run a
|
||||
dnl small program that executes the cpuid instruction with input OP.
|
||||
dnl This can be used to detect the CPU type.
|
||||
dnl
|
||||
dnl On output, the values of the eax, ebx, ecx, and edx registers are
|
||||
dnl stored as hexadecimal strings as "eax:ebx:ecx:edx" in the cache
|
||||
dnl variable ax_cv_gcc_x86_cpuid_OP.
|
||||
dnl
|
||||
dnl If the cpuid instruction fails (because you are running a
|
||||
dnl cross-compiler, or because you are not using gcc, or because you
|
||||
dnl are on a processor that doesn't have this instruction),
|
||||
dnl ax_cv_gcc_x86_cpuid_OP is set to the string "unknown".
|
||||
dnl
|
||||
dnl This macro mainly exists to be used in AX_GCC_ARCHFLAG.
|
||||
dnl
|
||||
dnl @category Misc
|
||||
dnl @author Steven G. Johnson <stevenj@alum.mit.edu> and Matteo Frigo.
|
||||
dnl @version 2005-05-30
|
||||
dnl @license GPLWithACException
|
||||
|
||||
dnl AX_GCC_X86_CPUID(OP)
dnl Compile and run a small C program that executes cpuid with eax = OP and
dnl ecx = 0, and cache the four result registers, printed in hexadecimal as
dnl "eax:ebx:ecx:edx", in ax_cv_gcc_x86_cpuid_OP.  The cache variable is set
dnl to "unknown" when the program cannot be built or run, and when
dnl cross-compiling.
AC_DEFUN([AX_GCC_X86_CPUID],
[AC_REQUIRE([AC_PROG_CC])
AC_LANG_PUSH([C])
AC_CACHE_CHECK(for x86 cpuid $1 output, ax_cv_gcc_x86_cpuid_$1,
 [AC_RUN_IFELSE([AC_LANG_PROGRAM([#include <stdio.h>], [
     int op = $1, eax, ebx, ecx, edx;
     FILE *f;
      /* Some leaves read a sub-leaf index from ecx; pin it to 0 so the
         result does not depend on whatever the register held before. */
      __asm__ __volatile__ ("cpuid"
        : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
        : "a" (op), "c" (0));
     f = fopen("conftest_cpuid", "w"); if (!f) return 1;
     fprintf(f, "%x:%x:%x:%x\n", eax, ebx, ecx, edx);
     fclose(f);
     return 0;
])],
     [ax_cv_gcc_x86_cpuid_$1=`cat conftest_cpuid`; rm -f conftest_cpuid],
     [ax_cv_gcc_x86_cpuid_$1=unknown; rm -f conftest_cpuid],
     [ax_cv_gcc_x86_cpuid_$1=unknown])])
AC_LANG_POP([C])
])
|
@ -54,3 +54,5 @@ echo ${BNAME}_SOURCES=$f >> Make.inc
|
||||
echo ${BNAME}_LDADD=-lGrid>> Make.inc
|
||||
echo >> Make.inc
|
||||
done
|
||||
|
||||
cd ..
|
||||
|
@ -56,6 +56,7 @@ int main (int argc, char ** argv)
|
||||
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
|
||||
GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
|
||||
GridParallelRNG FineRNG(&Fine);
|
||||
GridSerialRNG SerialRNG;
|
||||
FineRNG.SeedRandomDevice();
|
||||
|
||||
LatticeColourMatrix Foo(&Fine);
|
||||
@ -83,6 +84,9 @@ int main (int argc, char ** argv)
|
||||
LatticeSpinMatrix sMat(&Fine);
|
||||
LatticeSpinColourMatrix scMat(&Fine);
|
||||
|
||||
LatticeLorentzColourMatrix lcMat(&Fine);
|
||||
|
||||
|
||||
LatticeComplex scalar(&Fine);
|
||||
LatticeReal rscalar(&Fine);
|
||||
LatticeReal iscalar(&Fine);
|
||||
@ -99,12 +103,15 @@ int main (int argc, char ** argv)
|
||||
random(FineRNG,cMat);
|
||||
random(FineRNG,sMat);
|
||||
random(FineRNG,scMat);
|
||||
random(FineRNG,lcMat);
|
||||
random(FineRNG,cVec);
|
||||
random(FineRNG,sVec);
|
||||
random(FineRNG,scVec);
|
||||
|
||||
|
||||
fflush(stdout);
|
||||
|
||||
TComplex tr = trace(cmat);
|
||||
|
||||
|
||||
cVec = cMat * cVec; // LatticeColourVector = LatticeColourMatrix * LatticeColourVector
|
||||
@ -116,7 +123,9 @@ int main (int argc, char ** argv)
|
||||
cMat = outerProduct(cVec,cVec);
|
||||
scalar = localInnerProduct(cVec,cVec);
|
||||
|
||||
|
||||
cMat = Ta(cMat); //traceless antihermitian
|
||||
|
||||
|
||||
scalar += scalar;
|
||||
scalar -= scalar;
|
||||
scalar *= scalar;
|
||||
@ -206,7 +215,13 @@ int main (int argc, char ** argv)
|
||||
scm=transpose(scm);
|
||||
scm=transposeIndex<1>(scm);
|
||||
|
||||
|
||||
//random(SerialRNG, cm);
|
||||
//std::cout << cm << std::endl;
|
||||
|
||||
cm = Ta(cm);
|
||||
//TComplex tracecm= trace(cm);
|
||||
//std::cout << cm << " "<< tracecm << std::endl;
|
||||
|
||||
|
||||
// Foo = Foo+scalar; // LatticeColourMatrix+Scalar
|
||||
@ -219,6 +234,10 @@ int main (int argc, char ** argv)
|
||||
LatticeComplex trscMat(&Fine);
|
||||
trscMat = trace(scMat); // Trace
|
||||
|
||||
// LatticeComplex trlcMat(&Fine);
|
||||
// trlcMat = trace(lcMat); // Trace involving iVector - now generates error
|
||||
|
||||
|
||||
{ // Peek-ology and Poke-ology, with a little app-ology
|
||||
TComplex c;
|
||||
ColourMatrix c_m;
|
||||
|
Loading…
Reference in New Issue
Block a user