mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-11 22:50:45 +01:00
Merge branch 'develop' of github.com:paboyle/Grid into feature/bgq
This commit is contained in:
commit
6dd75ad9e5
@ -4,7 +4,7 @@ EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
|
|||||||
FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
|
FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
|
||||||
|
|
||||||
echo "-- deploying Eigen source..."
|
echo "-- deploying Eigen source..."
|
||||||
wget ${EIGEN_URL}
|
wget ${EIGEN_URL} --no-check-certificate
|
||||||
./scripts/update_eigen.sh `basename ${EIGEN_URL}`
|
./scripts/update_eigen.sh `basename ${EIGEN_URL}`
|
||||||
rm `basename ${EIGEN_URL}`
|
rm `basename ${EIGEN_URL}`
|
||||||
|
|
||||||
|
47
configure.ac
47
configure.ac
@ -10,8 +10,19 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
|
|||||||
AC_LANG(C++)
|
AC_LANG(C++)
|
||||||
CXXFLAGS="-O3 $CXXFLAGS"
|
CXXFLAGS="-O3 $CXXFLAGS"
|
||||||
AC_PROG_CXX
|
AC_PROG_CXX
|
||||||
|
|
||||||
|
############ openmp ###############
|
||||||
AC_OPENMP
|
AC_OPENMP
|
||||||
|
|
||||||
|
ac_openmp=no
|
||||||
|
|
||||||
|
if test "${OPENMP_CXXFLAGS}X" != "X"; then
|
||||||
|
ac_openmp=yes
|
||||||
AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
|
AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
|
||||||
|
AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
|
||||||
|
fi
|
||||||
|
|
||||||
|
############ libtool ###############
|
||||||
LT_INIT
|
LT_INIT
|
||||||
|
|
||||||
############### Checks for header files
|
############### Checks for header files
|
||||||
@ -29,7 +40,7 @@ AC_TYPE_SIZE_T
|
|||||||
AC_TYPE_UINT32_T
|
AC_TYPE_UINT32_T
|
||||||
AC_TYPE_UINT64_T
|
AC_TYPE_UINT64_T
|
||||||
|
|
||||||
############### Options
|
############### GMP and MPFR #################
|
||||||
AC_ARG_WITH([gmp],
|
AC_ARG_WITH([gmp],
|
||||||
[AS_HELP_STRING([--with-gmp=prefix],
|
[AS_HELP_STRING([--with-gmp=prefix],
|
||||||
[try this for a non-standard install prefix of the GMP library])],
|
[try this for a non-standard install prefix of the GMP library])],
|
||||||
@ -40,6 +51,8 @@ AC_ARG_WITH([mpfr],
|
|||||||
[try this for a non-standard install prefix of the MPFR library])],
|
[try this for a non-standard install prefix of the MPFR library])],
|
||||||
[AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
|
[AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
|
||||||
[AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
|
[AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
|
||||||
|
|
||||||
|
################## lapack ####################
|
||||||
AC_ARG_ENABLE([lapack],
|
AC_ARG_ENABLE([lapack],
|
||||||
[AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
|
[AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
|
||||||
[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
|
[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
|
||||||
@ -55,14 +68,27 @@ case ${ac_LAPACK} in
|
|||||||
AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
|
AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
|
||||||
esac
|
esac
|
||||||
|
|
||||||
AC_CHECK_LIB([fftw3],[fft_init_threads],
|
################## FFTW3 ####################
|
||||||
|
AC_ARG_WITH([fftw],
|
||||||
|
[AS_HELP_STRING([--with-fftw=prefix],
|
||||||
|
[try this for a non-standard install prefix of the FFTW3 library])],
|
||||||
|
[AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
|
||||||
|
[AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
|
||||||
|
|
||||||
|
#
|
||||||
|
# What about the MKL library replacement for fftw3 ? How do we know if fftw_execute
|
||||||
|
# can be found in MKL?
|
||||||
|
#
|
||||||
|
AC_CHECK_LIB([fftw3],[fftw_execute],
|
||||||
[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes],
|
[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes],
|
||||||
[ac_fftw=no])
|
[ac_fftw=no])
|
||||||
|
|
||||||
case ${ac_fftw} in
|
case ${ac_fftw} in
|
||||||
no)
|
no)
|
||||||
|
echo WARNING libfftw3 not found FFT routines will not work
|
||||||
;;
|
;;
|
||||||
yes)
|
yes)
|
||||||
AM_LDFLAGS="-lfftw3 $AM_LDFLAGS"
|
AM_LDFLAGS="$AM_LDFLAGS -lfftw3 -lfftw3f"
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
|
||||||
@ -307,19 +333,18 @@ Summary of configuration for $PACKAGE v$VERSION
|
|||||||
- compiler version : ${ax_cv_gxx_version}
|
- compiler version : ${ax_cv_gxx_version}
|
||||||
----- BUILD OPTIONS -----------------------------------
|
----- BUILD OPTIONS -----------------------------------
|
||||||
- SIMD : ${ac_SIMD}
|
- SIMD : ${ac_SIMD}
|
||||||
- communications type : ${ac_COMMS}
|
- Threading : ${ac_openmp}
|
||||||
- default precision : ${ac_PRECISION}
|
- Communications type : ${ac_COMMS}
|
||||||
|
- Default precision : ${ac_PRECISION}
|
||||||
- RNG choice : ${ac_RNG}
|
- RNG choice : ${ac_RNG}
|
||||||
- GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
|
- GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
|
||||||
- LAPACK : ${ac_LAPACK}
|
- LAPACK : ${ac_LAPACK}
|
||||||
|
- FFTW : ${ac_fftw}
|
||||||
- build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
|
- build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
|
||||||
- graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
|
- graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
|
||||||
----- BUILD FLAGS -------------------------------------
|
----- BUILD FLAGS -------------------------------------
|
||||||
- CXXFLAGS:
|
- CXXFLAGS: "${AM_CXXFLAGS} ${CXXFLAGS}"
|
||||||
`echo ${AM_CXXFLAGS} ${CXXFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
|
- LDFLAGS: "${AM_LDFLAGS} ${LDFLAGS}"
|
||||||
- LDFLAGS:
|
- LIBS: "${LIBS} "
|
||||||
`echo ${AM_LDFLAGS} ${LDFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
|
|
||||||
- LIBS:
|
|
||||||
`echo ${LIBS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'`
|
|
||||||
-------------------------------------------------------
|
-------------------------------------------------------
|
||||||
"
|
"
|
||||||
|
203
lib/FFT.h
203
lib/FFT.h
@ -1,3 +1,4 @@
|
|||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -28,11 +29,75 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef _GRID_FFT_H_
|
#ifndef _GRID_FFT_H_
|
||||||
#define _GRID_FFT_H_
|
#define _GRID_FFT_H_
|
||||||
|
|
||||||
#include <Grid/fftw/fftw3.h>
|
#ifdef HAVE_FFTW
|
||||||
|
#include <fftw3.h>
|
||||||
|
#endif
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
|
template<class scalar> struct FFTW { };
|
||||||
|
|
||||||
|
#ifdef HAVE_FFTW
|
||||||
|
template<> struct FFTW<ComplexD> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
typedef fftw_complex FFTW_scalar;
|
||||||
|
typedef fftw_plan FFTW_plan;
|
||||||
|
|
||||||
|
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
|
||||||
|
FFTW_scalar *in, const int *inembed,
|
||||||
|
int istride, int idist,
|
||||||
|
FFTW_scalar *out, const int *onembed,
|
||||||
|
int ostride, int odist,
|
||||||
|
int sign, unsigned flags) {
|
||||||
|
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
|
||||||
|
::fftw_flops(p,add,mul,fmas);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
|
||||||
|
::fftw_execute_dft(p,in,out);
|
||||||
|
}
|
||||||
|
inline static void fftw_destroy_plan(const FFTW_plan p) {
|
||||||
|
::fftw_destroy_plan(p);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<> struct FFTW<ComplexF> {
|
||||||
|
public:
|
||||||
|
|
||||||
|
typedef fftwf_complex FFTW_scalar;
|
||||||
|
typedef fftwf_plan FFTW_plan;
|
||||||
|
|
||||||
|
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
|
||||||
|
FFTW_scalar *in, const int *inembed,
|
||||||
|
int istride, int idist,
|
||||||
|
FFTW_scalar *out, const int *onembed,
|
||||||
|
int ostride, int odist,
|
||||||
|
int sign, unsigned flags) {
|
||||||
|
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
|
||||||
|
::fftwf_flops(p,add,mul,fmas);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
|
||||||
|
::fftwf_execute_dft(p,in,out);
|
||||||
|
}
|
||||||
|
inline static void fftw_destroy_plan(const FFTW_plan p) {
|
||||||
|
::fftwf_destroy_plan(p);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef FFTW_FORWARD
|
||||||
|
#define FFTW_FORWARD (-1)
|
||||||
|
#define FFTW_BACKWARD (+1)
|
||||||
|
#endif
|
||||||
|
|
||||||
class FFT {
|
class FFT {
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@ -40,6 +105,10 @@ namespace Grid {
|
|||||||
GridCartesian *sgrid;
|
GridCartesian *sgrid;
|
||||||
|
|
||||||
int Nd;
|
int Nd;
|
||||||
|
double flops;
|
||||||
|
double flops_call;
|
||||||
|
uint64_t usec;
|
||||||
|
|
||||||
std::vector<int> dimensions;
|
std::vector<int> dimensions;
|
||||||
std::vector<int> processors;
|
std::vector<int> processors;
|
||||||
std::vector<int> processor_coor;
|
std::vector<int> processor_coor;
|
||||||
@ -49,6 +118,9 @@ namespace Grid {
|
|||||||
static const int forward=FFTW_FORWARD;
|
static const int forward=FFTW_FORWARD;
|
||||||
static const int backward=FFTW_BACKWARD;
|
static const int backward=FFTW_BACKWARD;
|
||||||
|
|
||||||
|
double Flops(void) {return flops;}
|
||||||
|
double MFlops(void) {return flops/usec;}
|
||||||
|
|
||||||
FFT ( GridCartesian * grid ) :
|
FFT ( GridCartesian * grid ) :
|
||||||
vgrid(grid),
|
vgrid(grid),
|
||||||
Nd(grid->_ndimension),
|
Nd(grid->_ndimension),
|
||||||
@ -56,6 +128,8 @@ namespace Grid {
|
|||||||
processors(grid->_processors),
|
processors(grid->_processors),
|
||||||
processor_coor(grid->_processor_coor)
|
processor_coor(grid->_processor_coor)
|
||||||
{
|
{
|
||||||
|
flops=0;
|
||||||
|
usec =0;
|
||||||
std::vector<int> layout(Nd,1);
|
std::vector<int> layout(Nd,1);
|
||||||
sgrid = new GridCartesian(dimensions,layout,processors);
|
sgrid = new GridCartesian(dimensions,layout,processors);
|
||||||
};
|
};
|
||||||
@ -75,55 +149,62 @@ namespace Grid {
|
|||||||
|
|
||||||
std::vector<int> layout(Nd,1);
|
std::vector<int> layout(Nd,1);
|
||||||
std::vector<int> pencil_gd(vgrid->_fdimensions);
|
std::vector<int> pencil_gd(vgrid->_fdimensions);
|
||||||
std::vector<int> pencil_ld(processors);
|
|
||||||
|
|
||||||
pencil_gd[dim] = G*processors[dim];
|
pencil_gd[dim] = G*processors[dim];
|
||||||
pencil_ld[dim] = G*processors[dim];
|
|
||||||
|
|
||||||
// Pencil global vol LxLxGxLxL per node
|
// Pencil global vol LxLxGxLxL per node
|
||||||
GridCartesian pencil_g(pencil_gd,layout,processors);
|
GridCartesian pencil_g(pencil_gd,layout,processors);
|
||||||
GridCartesian pencil_l(pencil_ld,layout,processors);
|
|
||||||
|
|
||||||
// Construct pencils
|
// Construct pencils
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
typedef typename sobj::scalar_type scalar;
|
||||||
|
|
||||||
Lattice<vobj> ssource(vgrid); ssource =source;
|
Lattice<vobj> ssource(vgrid); ssource =source;
|
||||||
Lattice<sobj> pgsource(&pencil_g);
|
Lattice<sobj> pgsource(&pencil_g);
|
||||||
Lattice<sobj> pgresult(&pencil_g);
|
Lattice<sobj> pgresult(&pencil_g); pgresult=zero;
|
||||||
Lattice<sobj> plsource(&pencil_l);
|
|
||||||
Lattice<sobj> plresult(&pencil_l);
|
#ifndef HAVE_FFTW
|
||||||
|
assert(0);
|
||||||
|
#else
|
||||||
|
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||||
|
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||||
|
|
||||||
{
|
{
|
||||||
assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD));
|
int Ncomp = sizeof(sobj)/sizeof(scalar);
|
||||||
assert(sizeof(fftw_complex)==sizeof(ComplexD));
|
int Nlow = 1;
|
||||||
assert(sizeof(fftw_complex)==sizeof(ComplexD));
|
for(int d=0;d<dim;d++){
|
||||||
|
Nlow*=vgrid->_ldimensions[d];
|
||||||
|
}
|
||||||
|
|
||||||
int Ncomp = sizeof(sobj)/sizeof(fftw_complex);
|
int rank = 1; /* 1d transforms */
|
||||||
|
|
||||||
int rank = 1; /* not 2: we are computing 1d transforms */
|
|
||||||
int n[] = {G}; /* 1d transforms of length G */
|
int n[] = {G}; /* 1d transforms of length G */
|
||||||
int howmany = Ncomp;
|
int howmany = Ncomp;
|
||||||
int odist,idist,istride,ostride;
|
int odist,idist,istride,ostride;
|
||||||
idist = odist = 1;
|
idist = odist = 1; /* Distance between consecutive FT's */
|
||||||
istride = ostride = Ncomp; /* distance between two elements in the same column */
|
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
|
||||||
int *inembed = n, *onembed = n;
|
int *inembed = n, *onembed = n;
|
||||||
|
|
||||||
fftw_complex *in = (fftw_complex *)&plsource._odata[0];
|
|
||||||
fftw_complex *out= (fftw_complex *)&plresult._odata[0];
|
|
||||||
|
|
||||||
int sign = FFTW_FORWARD;
|
int sign = FFTW_FORWARD;
|
||||||
if (inverse) sign = FFTW_BACKWARD;
|
if (inverse) sign = FFTW_BACKWARD;
|
||||||
|
|
||||||
#ifdef HAVE_FFTW
|
FFTW_plan p;
|
||||||
fftw_plan p = fftw_plan_many_dft(rank,n,howmany,
|
{
|
||||||
in,inembed,
|
FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
|
||||||
istride,idist,
|
FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
|
||||||
out,onembed,
|
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
|
||||||
ostride, odist,
|
in,inembed,
|
||||||
sign,FFTW_ESTIMATE);
|
istride,idist,
|
||||||
#else
|
out,onembed,
|
||||||
fftw_plan p ;
|
ostride, odist,
|
||||||
assert(0);
|
sign,FFTW_ESTIMATE);
|
||||||
#endif
|
}
|
||||||
|
|
||||||
|
double add,mul,fma;
|
||||||
|
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
|
||||||
|
flops_call = add+mul+2.0*fma;
|
||||||
|
|
||||||
|
GridStopWatch timer;
|
||||||
|
|
||||||
// Barrel shift and collect global pencil
|
// Barrel shift and collect global pencil
|
||||||
for(int p=0;p<processors[dim];p++) {
|
for(int p=0;p<processors[dim];p++) {
|
||||||
@ -146,43 +227,45 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Loop over orthog coords
|
// Loop over orthog coords
|
||||||
for(int idx=0;idx<sgrid->lSites();idx++) {
|
int NN=pencil_g.lSites();
|
||||||
|
|
||||||
|
GridStopWatch Timer;
|
||||||
|
Timer.Start();
|
||||||
|
|
||||||
|
PARALLEL_FOR_LOOP
|
||||||
|
for(int idx=0;idx<NN;idx++) {
|
||||||
|
|
||||||
std::vector<int> pcoor(Nd,0);
|
|
||||||
std::vector<int> lcoor(Nd);
|
std::vector<int> lcoor(Nd);
|
||||||
sgrid->LocalIndexToLocalCoor(idx,lcoor);
|
pencil_g.LocalIndexToLocalCoor(idx,lcoor);
|
||||||
|
|
||||||
if ( lcoor[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
|
if ( lcoor[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
|
||||||
|
FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
|
||||||
// Project to local pencil array
|
FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
|
||||||
for(int l=0;l<G;l++){
|
FFTW<scalar>::fftw_execute_dft(p,in,out);
|
||||||
sobj s;
|
|
||||||
pcoor[dim]=l;
|
|
||||||
lcoor[dim]=l;
|
|
||||||
peekLocalSite(s,pgsource,lcoor);
|
|
||||||
pokeLocalSite(s,plsource,pcoor);
|
|
||||||
}
|
|
||||||
|
|
||||||
// FFT the pencil
|
|
||||||
#ifdef HAVE_FFTW
|
|
||||||
fftw_execute(p);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Extract the result
|
|
||||||
for(int l=0;l<L;l++){
|
|
||||||
sobj s;
|
|
||||||
int p = processor_coor[dim];
|
|
||||||
lcoor[dim] = l;
|
|
||||||
pcoor[dim] = l+L*p;
|
|
||||||
peekLocalSite(s,plresult,pcoor);
|
|
||||||
pokeLocalSite(s,result,lcoor);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fftw_destroy_plan(p);
|
Timer.Stop();
|
||||||
|
usec += Timer.useconds();
|
||||||
|
flops+= flops_call*NN;
|
||||||
|
|
||||||
|
int pc = processor_coor[dim];
|
||||||
|
for(int idx=0;idx<sgrid->lSites();idx++) {
|
||||||
|
std::vector<int> lcoor(Nd);
|
||||||
|
sgrid->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
|
std::vector<int> gcoor = lcoor;
|
||||||
|
// extract the result
|
||||||
|
sobj s;
|
||||||
|
gcoor[dim] = lcoor[dim]+L*pc;
|
||||||
|
peekLocalSite(s,pgresult,gcoor);
|
||||||
|
pokeLocalSite(s,result,lcoor);
|
||||||
|
}
|
||||||
|
|
||||||
|
FFTW<scalar>::fftw_destroy_plan(p);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc,
|
|||||||
assert(ompthreads.size()==1);
|
assert(ompthreads.size()==1);
|
||||||
GridThread::SetThreads(ompthreads[0]);
|
GridThread::SetThreads(ompthreads[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
|
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
|
||||||
std::vector<int> cores(0);
|
std::vector<int> cores(0);
|
||||||
arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
|
arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
|
||||||
@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
GridLogConfigure(logstreams);
|
GridLogConfigure(logstreams);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
|
||||||
Grid_debug_handler_init();
|
Grid_debug_handler_init();
|
||||||
}
|
}
|
||||||
|
412
lib/fftw/fftw3.h
412
lib/fftw/fftw3.h
@ -1,412 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
|
||||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
|
||||||
*
|
|
||||||
* The following statement of license applies *only* to this header file,
|
|
||||||
* and *not* to the other files distributed with FFTW or derived therefrom:
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions
|
|
||||||
* are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in the
|
|
||||||
* documentation and/or other materials provided with the distribution.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
|
|
||||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
||||||
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
|
||||||
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
||||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
|
||||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
||||||
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
||||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/***************************** NOTE TO USERS *********************************
|
|
||||||
*
|
|
||||||
* THIS IS A HEADER FILE, NOT A MANUAL
|
|
||||||
*
|
|
||||||
* If you want to know how to use FFTW, please read the manual,
|
|
||||||
* online at http://www.fftw.org/doc/ and also included with FFTW.
|
|
||||||
* For a quick start, see the manual's tutorial section.
|
|
||||||
*
|
|
||||||
* (Reading header files to learn how to use a library is a habit
|
|
||||||
* stemming from code lacking a proper manual. Arguably, it's a
|
|
||||||
* *bad* habit in most cases, because header files can contain
|
|
||||||
* interfaces that are not part of the public, stable API.)
|
|
||||||
*
|
|
||||||
****************************************************************************/
|
|
||||||
|
|
||||||
#ifndef FFTW3_H
|
|
||||||
#define FFTW3_H
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C"
|
|
||||||
{
|
|
||||||
#endif /* __cplusplus */
|
|
||||||
|
|
||||||
/* If <complex.h> is included, use the C99 complex type. Otherwise
|
|
||||||
define a type bit-compatible with C99 complex */
|
|
||||||
#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
|
|
||||||
# define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
|
|
||||||
#else
|
|
||||||
# define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define FFTW_CONCAT(prefix, name) prefix ## name
|
|
||||||
#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
|
|
||||||
#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
|
|
||||||
#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
|
|
||||||
#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
|
|
||||||
|
|
||||||
/* IMPORTANT: for Windows compilers, you should add a line
|
|
||||||
#define FFTW_DLL
|
|
||||||
here and in kernel/ifftw.h if you are compiling/using FFTW as a
|
|
||||||
DLL, in order to do the proper importing/exporting, or
|
|
||||||
alternatively compile with -DFFTW_DLL or the equivalent
|
|
||||||
command-line flag. This is not necessary under MinGW/Cygwin, where
|
|
||||||
libtool does the imports/exports automatically. */
|
|
||||||
#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
|
|
||||||
/* annoying Windows syntax for shared-library declarations */
|
|
||||||
# if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
|
|
||||||
# define FFTW_EXTERN extern __declspec(dllexport)
|
|
||||||
# else /* user is calling FFTW; import symbol */
|
|
||||||
# define FFTW_EXTERN extern __declspec(dllimport)
|
|
||||||
# endif
|
|
||||||
#else
|
|
||||||
# define FFTW_EXTERN extern
|
|
||||||
#endif
|
|
||||||
|
|
||||||
enum fftw_r2r_kind_do_not_use_me {
|
|
||||||
FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
|
|
||||||
FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
|
|
||||||
FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
|
|
||||||
};
|
|
||||||
|
|
||||||
struct fftw_iodim_do_not_use_me {
|
|
||||||
int n; /* dimension size */
|
|
||||||
int is; /* input stride */
|
|
||||||
int os; /* output stride */
|
|
||||||
};
|
|
||||||
|
|
||||||
#include <stddef.h> /* for ptrdiff_t */
|
|
||||||
struct fftw_iodim64_do_not_use_me {
|
|
||||||
ptrdiff_t n; /* dimension size */
|
|
||||||
ptrdiff_t is; /* input stride */
|
|
||||||
ptrdiff_t os; /* output stride */
|
|
||||||
};
|
|
||||||
|
|
||||||
typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
|
|
||||||
typedef int (*fftw_read_char_func_do_not_use_me)(void *);
|
|
||||||
|
|
||||||
/*
|
|
||||||
huge second-order macro that defines prototypes for all API
|
|
||||||
functions. We expand this macro for each supported precision
|
|
||||||
|
|
||||||
X: name-mangling macro
|
|
||||||
R: real data type
|
|
||||||
C: complex data type
|
|
||||||
*/
|
|
||||||
|
|
||||||
#define FFTW_DEFINE_API(X, R, C) \
|
|
||||||
\
|
|
||||||
FFTW_DEFINE_COMPLEX(R, C); \
|
|
||||||
\
|
|
||||||
typedef struct X(plan_s) *X(plan); \
|
|
||||||
\
|
|
||||||
typedef struct fftw_iodim_do_not_use_me X(iodim); \
|
|
||||||
typedef struct fftw_iodim64_do_not_use_me X(iodim64); \
|
|
||||||
\
|
|
||||||
typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind); \
|
|
||||||
\
|
|
||||||
typedef fftw_write_char_func_do_not_use_me X(write_char_func); \
|
|
||||||
typedef fftw_read_char_func_do_not_use_me X(read_char_func); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(execute)(const X(plan) p); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n, \
|
|
||||||
C *in, C *out, int sign, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign, \
|
|
||||||
unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1, \
|
|
||||||
C *in, C *out, int sign, unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2, \
|
|
||||||
C *in, C *out, int sign, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n, \
|
|
||||||
int howmany, \
|
|
||||||
C *in, const int *inembed, \
|
|
||||||
int istride, int idist, \
|
|
||||||
C *out, const int *onembed, \
|
|
||||||
int ostride, int odist, \
|
|
||||||
int sign, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim) *howmany_dims, \
|
|
||||||
C *in, C *out, \
|
|
||||||
int sign, unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim) *howmany_dims, \
|
|
||||||
R *ri, R *ii, R *ro, R *io, \
|
|
||||||
unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank, \
|
|
||||||
const X(iodim64) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim64) *howmany_dims, \
|
|
||||||
C *in, C *out, \
|
|
||||||
int sign, unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank, \
|
|
||||||
const X(iodim64) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim64) *howmany_dims, \
|
|
||||||
R *ri, R *ii, R *ro, R *io, \
|
|
||||||
unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out); \
|
|
||||||
FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii, \
|
|
||||||
R *ro, R *io); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n, \
|
|
||||||
int howmany, \
|
|
||||||
R *in, const int *inembed, \
|
|
||||||
int istride, int idist, \
|
|
||||||
C *out, const int *onembed, \
|
|
||||||
int ostride, int odist, \
|
|
||||||
unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n, \
|
|
||||||
R *in, C *out, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1, \
|
|
||||||
R *in, C *out, unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1, \
|
|
||||||
int n2, \
|
|
||||||
R *in, C *out, unsigned flags); \
|
|
||||||
\
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n, \
|
|
||||||
int howmany, \
|
|
||||||
C *in, const int *inembed, \
|
|
||||||
int istride, int idist, \
|
|
||||||
R *out, const int *onembed, \
|
|
||||||
int ostride, int odist, \
|
|
||||||
unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n, \
|
|
||||||
C *in, R *out, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1, \
|
|
||||||
C *in, R *out, unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1, \
|
|
||||||
int n2, \
|
|
||||||
C *in, R *out, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim) *howmany_dims, \
|
|
||||||
R *in, C *out, \
|
|
||||||
unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim) *howmany_dims, \
|
|
||||||
C *in, R *out, \
|
|
||||||
unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)( \
|
|
||||||
int rank, const X(iodim) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim) *howmany_dims, \
|
|
||||||
R *in, R *ro, R *io, \
|
|
||||||
unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)( \
|
|
||||||
int rank, const X(iodim) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim) *howmany_dims, \
|
|
||||||
R *ri, R *ii, R *out, \
|
|
||||||
unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank, \
|
|
||||||
const X(iodim64) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim64) *howmany_dims, \
|
|
||||||
R *in, C *out, \
|
|
||||||
unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank, \
|
|
||||||
const X(iodim64) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim64) *howmany_dims, \
|
|
||||||
C *in, R *out, \
|
|
||||||
unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)( \
|
|
||||||
int rank, const X(iodim64) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim64) *howmany_dims, \
|
|
||||||
R *in, R *ro, R *io, \
|
|
||||||
unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)( \
|
|
||||||
int rank, const X(iodim64) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim64) *howmany_dims, \
|
|
||||||
R *ri, R *ii, R *out, \
|
|
||||||
unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out); \
|
|
||||||
FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p, \
|
|
||||||
R *in, R *ro, R *io); \
|
|
||||||
FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p, \
|
|
||||||
R *ri, R *ii, R *out); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n, \
|
|
||||||
int howmany, \
|
|
||||||
R *in, const int *inembed, \
|
|
||||||
int istride, int idist, \
|
|
||||||
R *out, const int *onembed, \
|
|
||||||
int ostride, int odist, \
|
|
||||||
const X(r2r_kind) *kind, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out, \
|
|
||||||
const X(r2r_kind) *kind, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out, \
|
|
||||||
X(r2r_kind) kind, unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out, \
|
|
||||||
X(r2r_kind) kind0, X(r2r_kind) kind1, \
|
|
||||||
unsigned flags); \
|
|
||||||
FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2, \
|
|
||||||
R *in, R *out, X(r2r_kind) kind0, \
|
|
||||||
X(r2r_kind) kind1, X(r2r_kind) kind2, \
|
|
||||||
unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim) *howmany_dims, \
|
|
||||||
R *in, R *out, \
|
|
||||||
const X(r2r_kind) *kind, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims, \
|
|
||||||
int howmany_rank, \
|
|
||||||
const X(iodim64) *howmany_dims, \
|
|
||||||
R *in, R *out, \
|
|
||||||
const X(r2r_kind) *kind, unsigned flags); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(destroy_plan)(X(plan) p); \
|
|
||||||
FFTW_EXTERN void X(forget_wisdom)(void); \
|
|
||||||
FFTW_EXTERN void X(cleanup)(void); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(set_timelimit)(double t); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(plan_with_nthreads)(int nthreads); \
|
|
||||||
FFTW_EXTERN int X(init_threads)(void); \
|
|
||||||
FFTW_EXTERN void X(cleanup_threads)(void); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename); \
|
|
||||||
FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file); \
|
|
||||||
FFTW_EXTERN char *X(export_wisdom_to_string)(void); \
|
|
||||||
FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char, \
|
|
||||||
void *data); \
|
|
||||||
FFTW_EXTERN int X(import_system_wisdom)(void); \
|
|
||||||
FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename); \
|
|
||||||
FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file); \
|
|
||||||
FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string); \
|
|
||||||
FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file); \
|
|
||||||
FFTW_EXTERN void X(print_plan)(const X(plan) p); \
|
|
||||||
FFTW_EXTERN char *X(sprint_plan)(const X(plan) p); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void *X(malloc)(size_t n); \
|
|
||||||
FFTW_EXTERN R *X(alloc_real)(size_t n); \
|
|
||||||
FFTW_EXTERN C *X(alloc_complex)(size_t n); \
|
|
||||||
FFTW_EXTERN void X(free)(void *p); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN void X(flops)(const X(plan) p, \
|
|
||||||
double *add, double *mul, double *fmas); \
|
|
||||||
FFTW_EXTERN double X(estimate_cost)(const X(plan) p); \
|
|
||||||
FFTW_EXTERN double X(cost)(const X(plan) p); \
|
|
||||||
\
|
|
||||||
FFTW_EXTERN int X(alignment_of)(R *p); \
|
|
||||||
FFTW_EXTERN const char X(version)[]; \
|
|
||||||
FFTW_EXTERN const char X(cc)[]; \
|
|
||||||
FFTW_EXTERN const char X(codelet_optim)[];
|
|
||||||
|
|
||||||
|
|
||||||
/* end of FFTW_DEFINE_API macro */
|
|
||||||
|
|
||||||
FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
|
|
||||||
FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
|
|
||||||
FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
|
|
||||||
|
|
||||||
/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
|
|
||||||
for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
|
|
||||||
#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
|
|
||||||
&& !(defined(__ICC) || defined(__INTEL_COMPILER)) \
|
|
||||||
&& (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
|
|
||||||
# if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
|
|
||||||
/* note: __float128 is a typedef, which is not supported with the _Complex
|
|
||||||
keyword in gcc, so instead we use this ugly __attribute__ version.
|
|
||||||
However, we can't simply pass the __attribute__ version to
|
|
||||||
FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
|
|
||||||
types. Hence redefining FFTW_DEFINE_COMPLEX. Ugh. */
|
|
||||||
# undef FFTW_DEFINE_COMPLEX
|
|
||||||
# define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
|
|
||||||
# endif
|
|
||||||
FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define FFTW_FORWARD (-1)
|
|
||||||
#define FFTW_BACKWARD (+1)
|
|
||||||
|
|
||||||
#define FFTW_NO_TIMELIMIT (-1.0)
|
|
||||||
|
|
||||||
/* documented flags */
|
|
||||||
#define FFTW_MEASURE (0U)
|
|
||||||
#define FFTW_DESTROY_INPUT (1U << 0)
|
|
||||||
#define FFTW_UNALIGNED (1U << 1)
|
|
||||||
#define FFTW_CONSERVE_MEMORY (1U << 2)
|
|
||||||
#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
|
|
||||||
#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
|
|
||||||
#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
|
|
||||||
#define FFTW_ESTIMATE (1U << 6)
|
|
||||||
#define FFTW_WISDOM_ONLY (1U << 21)
|
|
||||||
|
|
||||||
/* undocumented beyond-guru flags */
|
|
||||||
#define FFTW_ESTIMATE_PATIENT (1U << 7)
|
|
||||||
#define FFTW_BELIEVE_PCOST (1U << 8)
|
|
||||||
#define FFTW_NO_DFT_R2HC (1U << 9)
|
|
||||||
#define FFTW_NO_NONTHREADED (1U << 10)
|
|
||||||
#define FFTW_NO_BUFFERING (1U << 11)
|
|
||||||
#define FFTW_NO_INDIRECT_OP (1U << 12)
|
|
||||||
#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
|
|
||||||
#define FFTW_NO_RANK_SPLITS (1U << 14)
|
|
||||||
#define FFTW_NO_VRANK_SPLITS (1U << 15)
|
|
||||||
#define FFTW_NO_VRECURSE (1U << 16)
|
|
||||||
#define FFTW_NO_SIMD (1U << 17)
|
|
||||||
#define FFTW_NO_SLOW (1U << 18)
|
|
||||||
#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
|
|
||||||
#define FFTW_ALLOW_PRUNING (1U << 20)
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
} /* extern "C" */
|
|
||||||
#endif /* __cplusplus */
|
|
||||||
|
|
||||||
#endif /* FFTW3_H */
|
|
@ -194,22 +194,22 @@ class BinaryIO {
|
|||||||
|
|
||||||
std::vector<int> site({x,y,z,t});
|
std::vector<int> site({x,y,z,t});
|
||||||
|
|
||||||
if ( grid->IsBoss() ) {
|
if (grid->IsBoss()) {
|
||||||
fin.read((char *)&file_object,sizeof(file_object));
|
fin.read((char *)&file_object, sizeof(file_object));
|
||||||
bytes += sizeof(file_object);
|
bytes += sizeof(file_object);
|
||||||
if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
|
if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object));
|
||||||
if(ieee32) le32toh_v((void *)&file_object,sizeof(file_object));
|
if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object));
|
||||||
if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
|
if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object));
|
||||||
if(ieee64) le64toh_v((void *)&file_object,sizeof(file_object));
|
if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object));
|
||||||
|
|
||||||
munge(file_object,munged,csum);
|
munge(file_object, munged, csum);
|
||||||
}
|
}
|
||||||
// The boss who read the file has their value poked
|
// The boss who read the file has their value poked
|
||||||
pokeSite(munged,Umu,site);
|
pokeSite(munged,Umu,site);
|
||||||
}}}}
|
}}}}
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
||||||
<< (double)bytes/ (double)timer.useconds() <<" MB/s " <<std::endl;
|
<< (double)bytes/ (double)timer.useconds() <<" MB/s " <<std::endl;
|
||||||
|
|
||||||
return csum;
|
return csum;
|
||||||
}
|
}
|
||||||
@ -254,20 +254,20 @@ class BinaryIO {
|
|||||||
|
|
||||||
|
|
||||||
if ( grid->IsBoss() ) {
|
if ( grid->IsBoss() ) {
|
||||||
|
|
||||||
if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
|
if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object));
|
||||||
if(ieee32) htole32_v((void *)&file_object,sizeof(file_object));
|
if(ieee32) htole32_v((void *)&file_object,sizeof(file_object));
|
||||||
if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
|
if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
|
||||||
if(ieee64) htole64_v((void *)&file_object,sizeof(file_object));
|
if(ieee64) htole64_v((void *)&file_object,sizeof(file_object));
|
||||||
|
|
||||||
// NB could gather an xstrip as an optimisation.
|
// NB could gather an xstrip as an optimisation.
|
||||||
fout.write((char *)&file_object,sizeof(file_object));
|
fout.write((char *)&file_object,sizeof(file_object));
|
||||||
bytes+=sizeof(file_object);
|
bytes+=sizeof(file_object);
|
||||||
}
|
}
|
||||||
}}}}
|
}}}}
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
||||||
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
|
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
|
||||||
|
|
||||||
return csum;
|
return csum;
|
||||||
}
|
}
|
||||||
@ -305,15 +305,15 @@ class BinaryIO {
|
|||||||
int l_idx=parallel.generator_idx(o_idx,i_idx);
|
int l_idx=parallel.generator_idx(o_idx,i_idx);
|
||||||
|
|
||||||
if( rank == grid->ThisRank() ){
|
if( rank == grid->ThisRank() ){
|
||||||
// std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
|
// std::cout << "rank" << rank<<" Getting state for index "<<l_idx<<std::endl;
|
||||||
parallel.GetState(saved,l_idx);
|
parallel.GetState(saved,l_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
grid->Broadcast(rank,(void *)&saved[0],bytes);
|
grid->Broadcast(rank,(void *)&saved[0],bytes);
|
||||||
|
|
||||||
if ( grid->IsBoss() ) {
|
if ( grid->IsBoss() ) {
|
||||||
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
|
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
|
||||||
fout.write((char *)&saved[0],bytes);
|
fout.write((char *)&saved[0],bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -355,14 +355,14 @@ class BinaryIO {
|
|||||||
int l_idx=parallel.generator_idx(o_idx,i_idx);
|
int l_idx=parallel.generator_idx(o_idx,i_idx);
|
||||||
|
|
||||||
if ( grid->IsBoss() ) {
|
if ( grid->IsBoss() ) {
|
||||||
fin.read((char *)&saved[0],bytes);
|
fin.read((char *)&saved[0],bytes);
|
||||||
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
|
Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
|
||||||
}
|
}
|
||||||
|
|
||||||
grid->Broadcast(0,(void *)&saved[0],bytes);
|
grid->Broadcast(0,(void *)&saved[0],bytes);
|
||||||
|
|
||||||
if( rank == grid->ThisRank() ){
|
if( rank == grid->ThisRank() ){
|
||||||
parallel.SetState(saved,l_idx);
|
parallel.SetState(saved,l_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -415,15 +415,15 @@ class BinaryIO {
|
|||||||
|
|
||||||
if ( d == 0 ) parallel[d] = 0;
|
if ( d == 0 ) parallel[d] = 0;
|
||||||
if (parallel[d]) {
|
if (parallel[d]) {
|
||||||
range[d] = grid->_ldimensions[d];
|
range[d] = grid->_ldimensions[d];
|
||||||
start[d] = grid->_processor_coor[d]*range[d];
|
start[d] = grid->_processor_coor[d]*range[d];
|
||||||
ioproc[d]= grid->_processor_coor[d];
|
ioproc[d]= grid->_processor_coor[d];
|
||||||
} else {
|
} else {
|
||||||
range[d] = grid->_gdimensions[d];
|
range[d] = grid->_gdimensions[d];
|
||||||
start[d] = 0;
|
start[d] = 0;
|
||||||
ioproc[d]= 0;
|
ioproc[d]= 0;
|
||||||
|
|
||||||
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
|
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
|
||||||
}
|
}
|
||||||
slice_vol = slice_vol * range[d];
|
slice_vol = slice_vol * range[d];
|
||||||
}
|
}
|
||||||
@ -434,9 +434,9 @@ class BinaryIO {
|
|||||||
std::cout<< std::dec ;
|
std::cout<< std::dec ;
|
||||||
std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
|
std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <<tmp<< " IOnodes for subslice ";
|
||||||
for(int d=0;d<grid->_ndimension;d++){
|
for(int d=0;d<grid->_ndimension;d++){
|
||||||
std::cout<< range[d];
|
std::cout<< range[d];
|
||||||
if( d< grid->_ndimension-1 )
|
if( d< grid->_ndimension-1 )
|
||||||
std::cout<< " x ";
|
std::cout<< " x ";
|
||||||
}
|
}
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
@ -463,7 +463,7 @@ class BinaryIO {
|
|||||||
|
|
||||||
// need to implement these loops in Nd independent way with a lexico conversion
|
// need to implement these loops in Nd independent way with a lexico conversion
|
||||||
for(int tlex=0;tlex<slice_vol;tlex++){
|
for(int tlex=0;tlex<slice_vol;tlex++){
|
||||||
|
|
||||||
std::vector<int> tsite(nd); // temporary mixed up site
|
std::vector<int> tsite(nd); // temporary mixed up site
|
||||||
std::vector<int> gsite(nd);
|
std::vector<int> gsite(nd);
|
||||||
std::vector<int> lsite(nd);
|
std::vector<int> lsite(nd);
|
||||||
@ -472,8 +472,8 @@ class BinaryIO {
|
|||||||
Lexicographic::CoorFromIndex(tsite,tlex,range);
|
Lexicographic::CoorFromIndex(tsite,tlex,range);
|
||||||
|
|
||||||
for(int d=0;d<nd;d++){
|
for(int d=0;d<nd;d++){
|
||||||
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
|
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
|
||||||
gsite[d] = tsite[d]+start[d]; // global site
|
gsite[d] = tsite[d]+start[d]; // global site
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////
|
/////////////////////////
|
||||||
@ -487,29 +487,29 @@ class BinaryIO {
|
|||||||
// iorank reads from the seek
|
// iorank reads from the seek
|
||||||
////////////////////////////////
|
////////////////////////////////
|
||||||
if (myrank == iorank) {
|
if (myrank == iorank) {
|
||||||
|
|
||||||
fin.seekg(offset+g_idx*sizeof(fileObj));
|
fin.seekg(offset+g_idx*sizeof(fileObj));
|
||||||
fin.read((char *)&fileObj,sizeof(fileObj));
|
fin.read((char *)&fileObj,sizeof(fileObj));
|
||||||
bytes+=sizeof(fileObj);
|
bytes+=sizeof(fileObj);
|
||||||
|
|
||||||
if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
|
if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
|
||||||
if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj));
|
if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj));
|
||||||
if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
|
if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj));
|
||||||
if(ieee64) le64toh_v((void *)&fileObj,sizeof(fileObj));
|
if(ieee64) le64toh_v((void *)&fileObj,sizeof(fileObj));
|
||||||
|
|
||||||
munge(fileObj,siteObj,csum);
|
munge(fileObj,siteObj,csum);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Possibly do transport through pt2pt
|
// Possibly do transport through pt2pt
|
||||||
if ( rank != iorank ) {
|
if ( rank != iorank ) {
|
||||||
if ( (myrank == rank) || (myrank==iorank) ) {
|
if ( (myrank == rank) || (myrank==iorank) ) {
|
||||||
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
|
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Poke at destination
|
// Poke at destination
|
||||||
if ( myrank == rank ) {
|
if ( myrank == rank ) {
|
||||||
pokeLocalSite(siteObj,Umu,lsite);
|
pokeLocalSite(siteObj,Umu,lsite);
|
||||||
}
|
}
|
||||||
grid->Barrier(); // necessary?
|
grid->Barrier(); // necessary?
|
||||||
}
|
}
|
||||||
@ -520,7 +520,7 @@ class BinaryIO {
|
|||||||
|
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
||||||
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
|
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
|
||||||
|
|
||||||
return csum;
|
return csum;
|
||||||
}
|
}
|
||||||
@ -558,15 +558,15 @@ class BinaryIO {
|
|||||||
if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
|
if ( d!= grid->_ndimension-1 ) parallel[d] = 0;
|
||||||
|
|
||||||
if (parallel[d]) {
|
if (parallel[d]) {
|
||||||
range[d] = grid->_ldimensions[d];
|
range[d] = grid->_ldimensions[d];
|
||||||
start[d] = grid->_processor_coor[d]*range[d];
|
start[d] = grid->_processor_coor[d]*range[d];
|
||||||
ioproc[d]= grid->_processor_coor[d];
|
ioproc[d]= grid->_processor_coor[d];
|
||||||
} else {
|
} else {
|
||||||
range[d] = grid->_gdimensions[d];
|
range[d] = grid->_gdimensions[d];
|
||||||
start[d] = 0;
|
start[d] = 0;
|
||||||
ioproc[d]= 0;
|
ioproc[d]= 0;
|
||||||
|
|
||||||
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
|
if ( grid->_processor_coor[d] != 0 ) IOnode = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
slice_vol = slice_vol * range[d];
|
slice_vol = slice_vol * range[d];
|
||||||
@ -577,9 +577,9 @@ class BinaryIO {
|
|||||||
grid->GlobalSum(tmp);
|
grid->GlobalSum(tmp);
|
||||||
std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
|
std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <<tmp<< " IOnodes for subslice ";
|
||||||
for(int d=0;d<grid->_ndimension;d++){
|
for(int d=0;d<grid->_ndimension;d++){
|
||||||
std::cout<< range[d];
|
std::cout<< range[d];
|
||||||
if( d< grid->_ndimension-1 )
|
if( d< grid->_ndimension-1 )
|
||||||
std::cout<< " x ";
|
std::cout<< " x ";
|
||||||
}
|
}
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
@ -610,7 +610,7 @@ class BinaryIO {
|
|||||||
// should aggregate a whole chunk and then write.
|
// should aggregate a whole chunk and then write.
|
||||||
// need to implement these loops in Nd independent way with a lexico conversion
|
// need to implement these loops in Nd independent way with a lexico conversion
|
||||||
for(int tlex=0;tlex<slice_vol;tlex++){
|
for(int tlex=0;tlex<slice_vol;tlex++){
|
||||||
|
|
||||||
std::vector<int> tsite(nd); // temporary mixed up site
|
std::vector<int> tsite(nd); // temporary mixed up site
|
||||||
std::vector<int> gsite(nd);
|
std::vector<int> gsite(nd);
|
||||||
std::vector<int> lsite(nd);
|
std::vector<int> lsite(nd);
|
||||||
@ -619,8 +619,8 @@ class BinaryIO {
|
|||||||
Lexicographic::CoorFromIndex(tsite,tlex,range);
|
Lexicographic::CoorFromIndex(tsite,tlex,range);
|
||||||
|
|
||||||
for(int d=0;d<nd;d++){
|
for(int d=0;d<nd;d++){
|
||||||
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
|
lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
|
||||||
gsite[d] = tsite[d]+start[d]; // global site
|
gsite[d] = tsite[d]+start[d]; // global site
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -640,26 +640,26 @@ class BinaryIO {
|
|||||||
|
|
||||||
// Pair of nodes may need to do pt2pt send
|
// Pair of nodes may need to do pt2pt send
|
||||||
if ( rank != iorank ) { // comms is necessary
|
if ( rank != iorank ) { // comms is necessary
|
||||||
if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
|
if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
|
||||||
// Send to IOrank
|
// Send to IOrank
|
||||||
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
|
grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
grid->Barrier(); // necessary?
|
grid->Barrier(); // necessary?
|
||||||
|
|
||||||
if (myrank == iorank) {
|
if (myrank == iorank) {
|
||||||
|
|
||||||
munge(siteObj,fileObj,csum);
|
munge(siteObj,fileObj,csum);
|
||||||
|
|
||||||
if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
|
if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj));
|
||||||
if(ieee32) htole32_v((void *)&fileObj,sizeof(fileObj));
|
if(ieee32) htole32_v((void *)&fileObj,sizeof(fileObj));
|
||||||
if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
|
if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj));
|
||||||
if(ieee64) htole64_v((void *)&fileObj,sizeof(fileObj));
|
if(ieee64) htole64_v((void *)&fileObj,sizeof(fileObj));
|
||||||
|
|
||||||
fout.seekp(offset+g_idx*sizeof(fileObj));
|
fout.seekp(offset+g_idx*sizeof(fileObj));
|
||||||
fout.write((char *)&fileObj,sizeof(fileObj));
|
fout.write((char *)&fileObj,sizeof(fileObj));
|
||||||
bytes+=sizeof(fileObj);
|
bytes+=sizeof(fileObj);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -668,7 +668,7 @@ class BinaryIO {
|
|||||||
|
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
||||||
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
|
<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;
|
||||||
|
|
||||||
return csum;
|
return csum;
|
||||||
}
|
}
|
||||||
|
@ -55,11 +55,14 @@ namespace QCD {
|
|||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
// QCD iMatrix types
|
// QCD iMatrix types
|
||||||
// Index conventions: Lorentz x Spin x Colour
|
// Index conventions: Lorentz x Spin x Colour
|
||||||
|
// note: static const int or constexpr will work for type deductions
|
||||||
|
// with the intel compiler (up to version 17)
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
static const int ColourIndex = 2;
|
#define ColourIndex 2
|
||||||
static const int SpinIndex = 1;
|
#define SpinIndex 1
|
||||||
static const int LorentzIndex= 0;
|
#define LorentzIndex 0
|
||||||
|
|
||||||
|
|
||||||
// Also should make these a named enum type
|
// Also should make these a named enum type
|
||||||
static const int DaggerNo=0;
|
static const int DaggerNo=0;
|
||||||
static const int DaggerYes=1;
|
static const int DaggerYes=1;
|
||||||
|
@ -96,7 +96,25 @@ namespace Grid {
|
|||||||
WilsonKernels(const ImplParams &p= ImplParams());
|
WilsonKernels(const ImplParams &p= ImplParams());
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// Default to no assembler implementation
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
@ -26,68 +26,56 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#include <Grid.h>
|
#include <Grid.h>
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////
|
|
||||||
// Default to no assembler implementation
|
|
||||||
///////////////////////////////////////////////////////////
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(AVX512)
|
#if defined(AVX512)
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
// If we are AVX512 specialise the single precision routine
|
// If we are AVX512 specialise the single precision routine
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
|
|
||||||
#include <simd/Intel512wilson.h>
|
#include <simd/Intel512wilson.h>
|
||||||
#include <simd/Intel512single.h>
|
#include <simd/Intel512single.h>
|
||||||
|
|
||||||
static Vector<vComplexF> signs;
|
static Vector<vComplexF> signs;
|
||||||
|
|
||||||
int setupSigns(void ){
|
int setupSigns(void ){
|
||||||
Vector<vComplexF> bother(2);
|
Vector<vComplexF> bother(2);
|
||||||
signs = bother;
|
signs = bother;
|
||||||
vrsign(signs[0]);
|
vrsign(signs[0]);
|
||||||
visign(signs[1]);
|
visign(signs[1]);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
static int signInit = setupSigns();
|
static int signInit = setupSigns();
|
||||||
|
|
||||||
#define label(A) ilabel(A)
|
#define label(A) ilabel(A)
|
||||||
#define ilabel(A) ".globl\n" #A ":\n"
|
#define ilabel(A) ".globl\n" #A ":\n"
|
||||||
|
|
||||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||||
#define FX(A) WILSONASM_ ##A
|
#define FX(A) WILSONASM_ ##A
|
||||||
|
|
||||||
#undef KERNEL_DAG
|
#undef KERNEL_DAG
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#define KERNEL_DAG
|
#define KERNEL_DAG
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#undef VMOVIDUP
|
#undef VMOVIDUP
|
||||||
#undef VMOVRDUP
|
#undef VMOVRDUP
|
||||||
#undef MAYBEPERM
|
#undef MAYBEPERM
|
||||||
@ -98,43 +86,22 @@ void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebesgue
|
|||||||
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
||||||
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||||
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||||
|
|
||||||
#undef KERNEL_DAG
|
#undef KERNEL_DAG
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#define KERNEL_DAG
|
#define KERNEL_DAG
|
||||||
template<>
|
template<>
|
||||||
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
|
||||||
|
|
||||||
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
|
||||||
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
|
||||||
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
|
||||||
template void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
|
||||||
template void WilsonKernels<DomainWallVec5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
|
||||||
}}
|
|
||||||
|
|
||||||
|
@ -35,6 +35,9 @@ int main (int argc, char ** argv)
|
|||||||
{
|
{
|
||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
std::vector<int> latt_size = GridDefaultLatt();
|
std::vector<int> latt_size = GridDefaultLatt();
|
||||||
std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
|
std::vector<int> simd_layout( { vComplexD::Nsimd(),1,1,1});
|
||||||
std::vector<int> mpi_layout = GridDefaultMpi();
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
@ -75,10 +78,10 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
FFT theFFT(&Fine);
|
FFT theFFT(&Fine);
|
||||||
|
|
||||||
theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde;
|
theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde;
|
theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde;
|
theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Ctilde,C,3,FFT::forward);
|
theFFT.FFT_dim(Ctilde,C,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
|
||||||
// C=zero;
|
// C=zero;
|
||||||
// Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
|
// Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
|
||||||
@ -90,10 +93,10 @@ int main (int argc, char ** argv)
|
|||||||
C=C-Ctilde;
|
C=C-Ctilde;
|
||||||
std::cout << "diff scalar "<<norm2(C) << std::endl;
|
std::cout << "diff scalar "<<norm2(C) << std::endl;
|
||||||
|
|
||||||
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde;
|
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;
|
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;
|
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
|
||||||
theFFT.FFT_dim(Stilde,S,3,FFT::forward);
|
theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
|
||||||
SpinMatrixD Sp;
|
SpinMatrixD Sp;
|
||||||
Sp = zero; Sp = Sp+cVol;
|
Sp = zero; Sp = Sp+cVol;
|
||||||
|
111
tests/core/Test_fftf.cc
Normal file
111
tests/core/Test_fftf.cc
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./tests/Test_cshift.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
|
std::vector<int> latt_size = GridDefaultLatt();
|
||||||
|
std::vector<int> simd_layout( { vComplexF::Nsimd(),1,1,1});
|
||||||
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
|
||||||
|
int vol = 1;
|
||||||
|
for(int d=0;d<latt_size.size();d++){
|
||||||
|
vol = vol * latt_size[d];
|
||||||
|
}
|
||||||
|
GridCartesian Fine(latt_size,simd_layout,mpi_layout);
|
||||||
|
|
||||||
|
LatticeComplexF one(&Fine);
|
||||||
|
LatticeComplexF zz(&Fine);
|
||||||
|
LatticeComplexF C(&Fine);
|
||||||
|
LatticeComplexF Ctilde(&Fine);
|
||||||
|
LatticeComplexF coor(&Fine);
|
||||||
|
|
||||||
|
LatticeSpinMatrixF S(&Fine);
|
||||||
|
LatticeSpinMatrixF Stilde(&Fine);
|
||||||
|
|
||||||
|
std::vector<int> p({1,2,3,2});
|
||||||
|
|
||||||
|
one = ComplexF(1.0,0.0);
|
||||||
|
zz = ComplexF(0.0,0.0);
|
||||||
|
|
||||||
|
ComplexF ci(0.0,1.0);
|
||||||
|
|
||||||
|
C=zero;
|
||||||
|
for(int mu=0;mu<4;mu++){
|
||||||
|
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
||||||
|
LatticeCoordinate(coor,mu);
|
||||||
|
C = C - (TwoPiL * p[mu]) * coor;
|
||||||
|
}
|
||||||
|
|
||||||
|
C = exp(C*ci);
|
||||||
|
|
||||||
|
S=zero;
|
||||||
|
S = S+C;
|
||||||
|
|
||||||
|
FFT theFFT(&Fine);
|
||||||
|
|
||||||
|
theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
theFFT.FFT_dim(Ctilde,C,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
|
||||||
|
// C=zero;
|
||||||
|
// Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
|
||||||
|
TComplexF cVol;
|
||||||
|
cVol()()() = vol;
|
||||||
|
|
||||||
|
C=zero;
|
||||||
|
pokeSite(cVol,C,p);
|
||||||
|
C=C-Ctilde;
|
||||||
|
std::cout << "diff scalar "<<norm2(C) << std::endl;
|
||||||
|
|
||||||
|
theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde; std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<std::endl;
|
||||||
|
|
||||||
|
SpinMatrixF Sp;
|
||||||
|
Sp = zero; Sp = Sp+cVol;
|
||||||
|
|
||||||
|
S=zero;
|
||||||
|
pokeSite(Sp,S,p);
|
||||||
|
|
||||||
|
S= S-Stilde;
|
||||||
|
std::cout << "diff FT[SpinMat] "<<norm2(S) << std::endl;
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user