From ff6da364e807b965ba49df55fd5dba58723d0a51 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 24 Aug 2016 15:05:00 +0100 Subject: [PATCH] FFT double and single precision gives good performance now in multithreaded code. --- configure.ac | 47 +++-- lib/FFT.h | 199 +++++++++++++------ lib/Init.cc | 2 +- lib/fftw/fftw3.h | 412 ---------------------------------------- tests/core/Test_fft.cc | 19 +- tests/core/Test_fftf.cc | 111 +++++++++++ 6 files changed, 298 insertions(+), 492 deletions(-) delete mode 100644 lib/fftw/fftw3.h create mode 100644 tests/core/Test_fftf.cc diff --git a/configure.ac b/configure.ac index b14f9748..98489162 100644 --- a/configure.ac +++ b/configure.ac @@ -10,8 +10,19 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AC_LANG(C++) CXXFLAGS="-O3 $CXXFLAGS" AC_PROG_CXX + +############ openmp ############### AC_OPENMP + +ac_openmp=no + +if test "${OPENMP_CXXFLAGS}X" != "X"; then +ac_openmp=yes AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS" +AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS" +fi + +############ libtool ############### LT_INIT ############### Checks for header files @@ -29,7 +40,7 @@ AC_TYPE_SIZE_T AC_TYPE_UINT32_T AC_TYPE_UINT64_T -############### Options +############### GMP and MPFR ################# AC_ARG_WITH([gmp], [AS_HELP_STRING([--with-gmp=prefix], [try this for a non-standard install prefix of the GMP library])], @@ -40,6 +51,8 @@ AC_ARG_WITH([mpfr], [try this for a non-standard install prefix of the MPFR library])], [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"] [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"]) + +################## lapack #################### AC_ARG_ENABLE([lapack], [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no]) @@ -55,14 +68,27 @@ case ${ac_LAPACK} in AC_DEFINE([USE_LAPACK],[1],[use LAPACK]) esac -AC_CHECK_LIB([fftw3],[fft_init_threads], +################## FFTW3 #################### +AC_ARG_WITH([fftw], + [AS_HELP_STRING([--with-fftw=prefix], + [try this for a non-standard install prefix of the FFTW3 library])], + [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"] + [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"]) + +# +# What about the MKL library replacement for fftw3 ? How do we know if fftw_execute +# can be found in MKL? +# +AC_CHECK_LIB([fftw3],[fftw_execute], [AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes], [ac_fftw=no]) + case ${ac_fftw} in no) + echo WARNING libfftw3 not found FFT routines will not work ;; yes) - AM_LDFLAGS="-lfftw3 $AM_LDFLAGS" + AM_LDFLAGS="$AM_LDFLAGS -lfftw3 -lfftw3f" esac @@ -304,19 +330,18 @@ Summary of configuration for $PACKAGE v$VERSION - compiler version : ${ax_cv_gxx_version} ----- BUILD OPTIONS ----------------------------------- - SIMD : ${ac_SIMD} -- communications type : ${ac_COMMS} -- default precision : ${ac_PRECISION} +- Threading : ${ac_openmp} +- Communications type : ${ac_COMMS} +- Default precision : ${ac_PRECISION} - RNG choice : ${ac_RNG} - GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi` - LAPACK : ${ac_LAPACK} +- FFTW : ${ac_fftw} - build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi` - graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi` ----- BUILD FLAGS ------------------------------------- -- CXXFLAGS: -`echo ${AM_CXXFLAGS} ${CXXFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'` -- LDFLAGS: -`echo ${AM_LDFLAGS} ${LDFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'` -- LIBS: -`echo ${LIBS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'` +- CXXFLAGS: "${AM_CXXFLAGS} ${CXXFLAGS}" +- LDFLAGS: "${AM_LDFLAGS} ${LDFLAGS}" +- LIBS: "${LIBS} " ------------------------------------------------------- " diff --git a/lib/FFT.h b/lib/FFT.h index cf012e79..17c8a309 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -1,3 +1,4 @@ + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -28,11 +29,70 @@ Author: Peter Boyle #ifndef _GRID_FFT_H_ #define _GRID_FFT_H_ -#include - +#ifdef HAVE_FFTW +#include +#endif namespace Grid { - + template struct FFTW { }; + +#ifdef HAVE_FFTW + template<> struct FFTW { + public: + + typedef fftw_complex FFTW_scalar; + typedef fftw_plan FFTW_plan; + + static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, + FFTW_scalar *in, const int *inembed, + int istride, int idist, + FFTW_scalar *out, const int *onembed, + int ostride, int odist, + int sign, unsigned flags) { + return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); + } + + static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ + ::fftw_flops(p,add,mul,fmas); + } + + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { + ::fftw_execute_dft(p,in,out); + } + inline static void fftw_destroy_plan(const FFTW_plan p) { + ::fftw_destroy_plan(p); + } + }; + + template<> struct FFTW { + public: + + typedef fftwf_complex FFTW_scalar; + typedef fftwf_plan FFTW_plan; + + static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, + FFTW_scalar *in, const int *inembed, + int istride, int idist, + FFTW_scalar *out, const int *onembed, + int ostride, int odist, + int sign, unsigned flags) { + return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); + } + + static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ + ::fftwf_flops(p,add,mul,fmas); + } + + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { + ::fftwf_execute_dft(p,in,out); + } + inline static void fftw_destroy_plan(const FFTW_plan p) { + ::fftwf_destroy_plan(p); + } + }; + +#endif + class FFT { private: @@ -40,6 +100,10 @@ namespace Grid { GridCartesian *sgrid; int Nd; + double flops; + double flops_call; + uint64_t usec; + std::vector dimensions; std::vector processors; std::vector processor_coor; @@ -49,6 +113,9 @@ namespace Grid { static const int forward=FFTW_FORWARD; static const int backward=FFTW_BACKWARD; + double Flops(void) {return flops;} + double MFlops(void) {return flops/usec;} + FFT ( GridCartesian * grid ) : vgrid(grid), Nd(grid->_ndimension), @@ -56,6 +123,8 @@ namespace Grid { processors(grid->_processors), processor_coor(grid->_processor_coor) { + flops=0; + usec =0; std::vector layout(Nd,1); sgrid = new GridCartesian(dimensions,layout,processors); }; @@ -75,55 +144,62 @@ namespace Grid { std::vector layout(Nd,1); std::vector pencil_gd(vgrid->_fdimensions); - std::vector pencil_ld(processors); pencil_gd[dim] = G*processors[dim]; - pencil_ld[dim] = G*processors[dim]; // Pencil global vol LxLxGxLxL per node GridCartesian pencil_g(pencil_gd,layout,processors); - GridCartesian pencil_l(pencil_ld,layout,processors); // Construct pencils typedef typename vobj::scalar_object sobj; + typedef typename sobj::scalar_type scalar; + Lattice ssource(vgrid); ssource =source; Lattice pgsource(&pencil_g); - Lattice pgresult(&pencil_g); - Lattice plsource(&pencil_l); - Lattice plresult(&pencil_l); + Lattice pgresult(&pencil_g); pgresult=zero; + +#ifndef HAVE_FFTW + assert(0); +#else + typedef typename FFTW::FFTW_scalar FFTW_scalar; + typedef typename FFTW::FFTW_plan FFTW_plan; { - assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD)); - assert(sizeof(fftw_complex)==sizeof(ComplexD)); - assert(sizeof(fftw_complex)==sizeof(ComplexD)); + int Ncomp = sizeof(sobj)/sizeof(scalar); + int Nlow = 1; + for(int d=0;d_ldimensions[d]; + } - int Ncomp = sizeof(sobj)/sizeof(fftw_complex); - - int rank = 1; /* not 2: we are computing 1d transforms */ + int rank = 1; /* 1d transforms */ int n[] = {G}; /* 1d transforms of length G */ int howmany = Ncomp; int odist,idist,istride,ostride; - idist = odist = 1; - istride = ostride = Ncomp; /* distance between two elements in the same column */ + idist = odist = 1; /* Distance between consecutive FT's */ + istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ int *inembed = n, *onembed = n; - fftw_complex *in = (fftw_complex *)&plsource._odata[0]; - fftw_complex *out= (fftw_complex *)&plresult._odata[0]; int sign = FFTW_FORWARD; if (inverse) sign = FFTW_BACKWARD; -#ifdef HAVE_FFTW - fftw_plan p = fftw_plan_many_dft(rank,n,howmany, - in,inembed, - istride,idist, - out,onembed, - ostride, odist, - sign,FFTW_ESTIMATE); -#else - fftw_plan p ; - assert(0); -#endif + FFTW_plan p; + { + FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0]; + FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0]; + p = FFTW::fftw_plan_many_dft(rank,n,howmany, + in,inembed, + istride,idist, + out,onembed, + ostride, odist, + sign,FFTW_ESTIMATE); + } + + double add,mul,fma; + FFTW::fftw_flops(p,&add,&mul,&fma); + flops_call = add+mul+2.0*fma; + std::cout << "FFT Flops per call "<lSites();idx++) { + int NN=pencil_g.lSites(); + + GridStopWatch Timer; + Timer.Start(); + +PARALLEL_FOR_LOOP + for(int idx=0;idx pcoor(Nd,0); std::vector lcoor(Nd); - sgrid->LocalIndexToLocalCoor(idx,lcoor); + pencil_g.LocalIndexToLocalCoor(idx,lcoor); if ( lcoor[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0 - - // Project to local pencil array - for(int l=0;l::fftw_execute_dft(p,in,out); } } - - fftw_destroy_plan(p); + + Timer.Stop(); + usec += Timer.useconds(); + flops+= flops_call*NN; + + + int pc = processor_coor[dim]; + for(int idx=0;idxlSites();idx++) { + std::vector lcoor(Nd); + sgrid->LocalIndexToLocalCoor(idx,lcoor); + std::vector gcoor = lcoor; + // extract the result + sobj s; + gcoor[dim] = lcoor[dim]+L*pc; + peekLocalSite(s,pgresult,gcoor); + pokeLocalSite(s,result,lcoor); + } + + FFTW::fftw_destroy_plan(p); } +#endif + + } }; diff --git a/lib/Init.cc b/lib/Init.cc index 6543435c..34e503a4 100644 --- a/lib/Init.cc +++ b/lib/Init.cc @@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc, assert(ompthreads.size()==1); GridThread::SetThreads(ompthreads[0]); } + if( GridCmdOptionExists(argv,argv+argc,"--cores") ){ std::vector cores(0); arg= GridCmdOptionPayload(argv,argv+argc,"--cores"); @@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv) GridLogConfigure(logstreams); } - if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ Grid_debug_handler_init(); } diff --git a/lib/fftw/fftw3.h b/lib/fftw/fftw3.h deleted file mode 100644 index 1bf34396..00000000 --- a/lib/fftw/fftw3.h +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * The following statement of license applies *only* to this header file, - * and *not* to the other files distributed with FFTW or derived therefrom: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS - * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/***************************** NOTE TO USERS ********************************* - * - * THIS IS A HEADER FILE, NOT A MANUAL - * - * If you want to know how to use FFTW, please read the manual, - * online at http://www.fftw.org/doc/ and also included with FFTW. - * For a quick start, see the manual's tutorial section. - * - * (Reading header files to learn how to use a library is a habit - * stemming from code lacking a proper manual. Arguably, it's a - * *bad* habit in most cases, because header files can contain - * interfaces that are not part of the public, stable API.) - * - ****************************************************************************/ - -#ifndef FFTW3_H -#define FFTW3_H - -#include - -#ifdef __cplusplus -extern "C" -{ -#endif /* __cplusplus */ - -/* If is included, use the C99 complex type. Otherwise - define a type bit-compatible with C99 complex */ -#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I) -# define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C -#else -# define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2] -#endif - -#define FFTW_CONCAT(prefix, name) prefix ## name -#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name) -#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name) -#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name) -#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name) - -/* IMPORTANT: for Windows compilers, you should add a line - #define FFTW_DLL - here and in kernel/ifftw.h if you are compiling/using FFTW as a - DLL, in order to do the proper importing/exporting, or - alternatively compile with -DFFTW_DLL or the equivalent - command-line flag. This is not necessary under MinGW/Cygwin, where - libtool does the imports/exports automatically. */ -#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__)) - /* annoying Windows syntax for shared-library declarations */ -# if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */ -# define FFTW_EXTERN extern __declspec(dllexport) -# else /* user is calling FFTW; import symbol */ -# define FFTW_EXTERN extern __declspec(dllimport) -# endif -#else -# define FFTW_EXTERN extern -#endif - -enum fftw_r2r_kind_do_not_use_me { - FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2, - FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6, - FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10 -}; - -struct fftw_iodim_do_not_use_me { - int n; /* dimension size */ - int is; /* input stride */ - int os; /* output stride */ -}; - -#include /* for ptrdiff_t */ -struct fftw_iodim64_do_not_use_me { - ptrdiff_t n; /* dimension size */ - ptrdiff_t is; /* input stride */ - ptrdiff_t os; /* output stride */ -}; - -typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *); -typedef int (*fftw_read_char_func_do_not_use_me)(void *); - -/* - huge second-order macro that defines prototypes for all API - functions. We expand this macro for each supported precision - - X: name-mangling macro - R: real data type - C: complex data type -*/ - -#define FFTW_DEFINE_API(X, R, C) \ - \ -FFTW_DEFINE_COMPLEX(R, C); \ - \ -typedef struct X(plan_s) *X(plan); \ - \ -typedef struct fftw_iodim_do_not_use_me X(iodim); \ -typedef struct fftw_iodim64_do_not_use_me X(iodim64); \ - \ -typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind); \ - \ -typedef fftw_write_char_func_do_not_use_me X(write_char_func); \ -typedef fftw_read_char_func_do_not_use_me X(read_char_func); \ - \ -FFTW_EXTERN void X(execute)(const X(plan) p); \ - \ -FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n, \ - C *in, C *out, int sign, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1, \ - C *in, C *out, int sign, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2, \ - C *in, C *out, int sign, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n, \ - int howmany, \ - C *in, const int *inembed, \ - int istride, int idist, \ - C *out, const int *onembed, \ - int ostride, int odist, \ - int sign, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - C *in, C *out, \ - int sign, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *ri, R *ii, R *ro, R *io, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank, \ - const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - C *in, C *out, \ - int sign, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank, \ - const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *ri, R *ii, R *ro, R *io, \ - unsigned flags); \ - \ -FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out); \ -FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii, \ - R *ro, R *io); \ - \ -FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n, \ - int howmany, \ - R *in, const int *inembed, \ - int istride, int idist, \ - C *out, const int *onembed, \ - int ostride, int odist, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n, \ - R *in, C *out, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1, \ - R *in, C *out, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1, \ - int n2, \ - R *in, C *out, unsigned flags); \ - \ - \ -FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n, \ - int howmany, \ - C *in, const int *inembed, \ - int istride, int idist, \ - R *out, const int *onembed, \ - int ostride, int odist, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n, \ - C *in, R *out, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1, \ - C *in, R *out, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1, \ - int n2, \ - C *in, R *out, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *in, C *out, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - C *in, R *out, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)( \ - int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *in, R *ro, R *io, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)( \ - int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *ri, R *ii, R *out, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank, \ - const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *in, C *out, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank, \ - const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - C *in, R *out, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)( \ - int rank, const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *in, R *ro, R *io, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)( \ - int rank, const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *ri, R *ii, R *out, \ - unsigned flags); \ - \ -FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out); \ -FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out); \ - \ -FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p, \ - R *in, R *ro, R *io); \ -FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p, \ - R *ri, R *ii, R *out); \ - \ -FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n, \ - int howmany, \ - R *in, const int *inembed, \ - int istride, int idist, \ - R *out, const int *onembed, \ - int ostride, int odist, \ - const X(r2r_kind) *kind, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out, \ - const X(r2r_kind) *kind, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out, \ - X(r2r_kind) kind, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out, \ - X(r2r_kind) kind0, X(r2r_kind) kind1, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2, \ - R *in, R *out, X(r2r_kind) kind0, \ - X(r2r_kind) kind1, X(r2r_kind) kind2, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *in, R *out, \ - const X(r2r_kind) *kind, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *in, R *out, \ - const X(r2r_kind) *kind, unsigned flags); \ - \ -FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out); \ - \ -FFTW_EXTERN void X(destroy_plan)(X(plan) p); \ -FFTW_EXTERN void X(forget_wisdom)(void); \ -FFTW_EXTERN void X(cleanup)(void); \ - \ -FFTW_EXTERN void X(set_timelimit)(double t); \ - \ -FFTW_EXTERN void X(plan_with_nthreads)(int nthreads); \ -FFTW_EXTERN int X(init_threads)(void); \ -FFTW_EXTERN void X(cleanup_threads)(void); \ - \ -FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename); \ -FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file); \ -FFTW_EXTERN char *X(export_wisdom_to_string)(void); \ -FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char, \ - void *data); \ -FFTW_EXTERN int X(import_system_wisdom)(void); \ -FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename); \ -FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file); \ -FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string); \ -FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \ - \ -FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file); \ -FFTW_EXTERN void X(print_plan)(const X(plan) p); \ -FFTW_EXTERN char *X(sprint_plan)(const X(plan) p); \ - \ -FFTW_EXTERN void *X(malloc)(size_t n); \ -FFTW_EXTERN R *X(alloc_real)(size_t n); \ -FFTW_EXTERN C *X(alloc_complex)(size_t n); \ -FFTW_EXTERN void X(free)(void *p); \ - \ -FFTW_EXTERN void X(flops)(const X(plan) p, \ - double *add, double *mul, double *fmas); \ -FFTW_EXTERN double X(estimate_cost)(const X(plan) p); \ -FFTW_EXTERN double X(cost)(const X(plan) p); \ - \ -FFTW_EXTERN int X(alignment_of)(R *p); \ -FFTW_EXTERN const char X(version)[]; \ -FFTW_EXTERN const char X(cc)[]; \ -FFTW_EXTERN const char X(codelet_optim)[]; - - -/* end of FFTW_DEFINE_API macro */ - -FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex) -FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex) -FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex) - -/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64 - for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */ -#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \ - && !(defined(__ICC) || defined(__INTEL_COMPILER)) \ - && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__)) -# if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I) -/* note: __float128 is a typedef, which is not supported with the _Complex - keyword in gcc, so instead we use this ugly __attribute__ version. - However, we can't simply pass the __attribute__ version to - FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer - types. Hence redefining FFTW_DEFINE_COMPLEX. Ugh. */ -# undef FFTW_DEFINE_COMPLEX -# define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C -# endif -FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex) -#endif - -#define FFTW_FORWARD (-1) -#define FFTW_BACKWARD (+1) - -#define FFTW_NO_TIMELIMIT (-1.0) - -/* documented flags */ -#define FFTW_MEASURE (0U) -#define FFTW_DESTROY_INPUT (1U << 0) -#define FFTW_UNALIGNED (1U << 1) -#define FFTW_CONSERVE_MEMORY (1U << 2) -#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */ -#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */ -#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */ -#define FFTW_ESTIMATE (1U << 6) -#define FFTW_WISDOM_ONLY (1U << 21) - -/* undocumented beyond-guru flags */ -#define FFTW_ESTIMATE_PATIENT (1U << 7) -#define FFTW_BELIEVE_PCOST (1U << 8) -#define FFTW_NO_DFT_R2HC (1U << 9) -#define FFTW_NO_NONTHREADED (1U << 10) -#define FFTW_NO_BUFFERING (1U << 11) -#define FFTW_NO_INDIRECT_OP (1U << 12) -#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */ -#define FFTW_NO_RANK_SPLITS (1U << 14) -#define FFTW_NO_VRANK_SPLITS (1U << 15) -#define FFTW_NO_VRECURSE (1U << 16) -#define FFTW_NO_SIMD (1U << 17) -#define FFTW_NO_SLOW (1U << 18) -#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19) -#define FFTW_ALLOW_PRUNING (1U << 20) - -#ifdef __cplusplus -} /* extern "C" */ -#endif /* __cplusplus */ - -#endif /* FFTW3_H */ diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index ed57800c..2fdaeed2 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -35,6 +35,9 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); + int threads = GridThread::GetThreads(); + std::cout< latt_size = GridDefaultLatt(); std::vector simd_layout( { vComplexD::Nsimd(),1,1,1}); std::vector mpi_layout = GridDefaultMpi(); @@ -75,10 +78,10 @@ int main (int argc, char ** argv) FFT theFFT(&Fine); - theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; - theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde; - theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde; - theFFT.FFT_dim(Ctilde,C,3,FFT::forward); + theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()< +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt_size = GridDefaultLatt(); + std::vector simd_layout( { vComplexF::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + int vol = 1; + for(int d=0;d p({1,2,3,2}); + + one = ComplexF(1.0,0.0); + zz = ComplexF(0.0,0.0); + + ComplexF ci(0.0,1.0); + + C=zero; + for(int mu=0;mu<4;mu++){ + RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; + LatticeCoordinate(coor,mu); + C = C - (TwoPiL * p[mu]) * coor; + } + + C = exp(C*ci); + + S=zero; + S = S+C; + + FFT theFFT(&Fine); + + theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<