From df6c9f55d1af1c40607087f241c2a98edf021075 Mon Sep 17 00:00:00 2001 From: Richard Rollins Date: Wed, 20 Jul 2016 17:38:56 +0100 Subject: [PATCH 01/29] Use common benchmark output format for dwf_sweep and zmm --- benchmarks/Benchmark_dwf_sweep.cc | 2 ++ benchmarks/Benchmark_zmm.cc | 16 ++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/benchmarks/Benchmark_dwf_sweep.cc b/benchmarks/Benchmark_dwf_sweep.cc index d52112ba..18d2f389 100644 --- a/benchmarks/Benchmark_dwf_sweep.cc +++ b/benchmarks/Benchmark_dwf_sweep.cc @@ -61,6 +61,8 @@ int main (int argc, char ** argv) QCD::WilsonKernelsStatic::AsmOpt=0; } + std::cout< grid({L,L,m*L,m*L}); + std::cout << GridLogMessage <<"\t"; for(int i=0;i<4;i++) { std::cout << grid[i]<<"x"; } - std::cout << Ls< &latt4,int Ls) RealD M5 =1.8; DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); - std::cout< &latt4,int Ls) double flops=1344*volume/2; mfc = flops*ncall/(t1-t0); - std::cout< &latt4,int Ls) } t1=usecond(); mfa = flops*ncall/(t1-t0); - std::cout< &latt4,int Ls) //resulta = (-0.5) * resulta; diff = resulto-resulta; - std::cout< Date: Thu, 4 Aug 2016 16:27:02 +0100 Subject: [PATCH 02/29] README update --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a41564d4..2a0acad6 100644 --- a/README.md +++ b/README.md @@ -68,10 +68,18 @@ Now you can execute the `configure` script to generate makefiles (here from a bu ``` bash mkdir build; cd build -../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi --prefix= +../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix= ``` -where `--enable-precision=` set the default precision (`single` or `double`), `--enable-simd=` set the SIMD type (see possible values below), `--enable-comms=` set the protocol used for communications (`none`, `mpi` or `shmem`), and `` should be replaced by the prefix path where you want to install Grid. Other options are available, use `configure --help` to display them. Like with any other program using GNU autotool, the `CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to customise the build. +where `--enable-precision=` set the default precision (`single` or `double`), +`--enable-simd=` set the SIMD type (see possible values below), `--enable- +comms=` set the protocol used for communications (`none`, `mpi`, `mpi-auto` or +`shmem`), and `` should be replaced by the prefix path where you want to +install Grid. The `mpi-auto` communication option set `configure` to determine +automatically how to link to MPI. Other options are available, use `configure +--help` to display them. Like with any other program using GNU autotool, the +`CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to +customise the build. Finally, you can build and install Grid: From 32bc7a6ab893e79ba033fb29ca4618e59278b2c5 Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 5 Aug 2016 10:36:00 +0100 Subject: [PATCH 03/29] MPI back out of change that hangs AVX2 for clang, gcc needs the -mfma flag. --- benchmarks/Benchmark_comms.cc | 4 +-- configure.ac | 2 +- lib/communicator/Communicator_base.h | 9 ------ lib/communicator/Communicator_mpi.cc | 39 ++++++++------------------ lib/communicator/Communicator_none.cc | 13 --------- lib/communicator/Communicator_shmem.cc | 13 --------- 6 files changed, 14 insertions(+), 66 deletions(-) diff --git a/benchmarks/Benchmark_comms.cc b/benchmarks/Benchmark_comms.cc index 7162a0f9..d0cd96c7 100644 --- a/benchmarks/Benchmark_comms.cc +++ b/benchmarks/Benchmark_comms.cc @@ -194,7 +194,7 @@ int main (int argc, char ** argv) } } - +#if 0 std::cout< &list, - void *xmit, - int xmit_to_rank, - void *recv, - int recv_from_rank, - int bytes); - void SendToRecvFromBegin(std::vector &list, void *xmit, int xmit_to_rank, void *recv, int recv_from_rank, int bytes); - - void SendToRecvFromBegin(std::vector &list); void SendToRecvFromComplete(std::vector &waitall); //////////////////////////////////////////////////////////// diff --git a/lib/communicator/Communicator_mpi.cc b/lib/communicator/Communicator_mpi.cc index 6830c14a..dff9811a 100644 --- a/lib/communicator/Communicator_mpi.cc +++ b/lib/communicator/Communicator_mpi.cc @@ -144,28 +144,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, } // Basic Halo comms primitive -// Basic Halo comms primitive -void CartesianCommunicator::SendToRecvFromInit(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) -{ - MPI_Request xrq; - MPI_Request rrq; - int rank = _processor; - int ierr; - ierr =MPI_Send_init(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); - ierr|=MPI_Recv_init(recv, bytes, MPI_CHAR,dest,_processor,communicator,&rrq); - assert(ierr==0); - list.push_back(xrq); - list.push_back(rrq); -} -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list) -{ - MPI_Startall(list.size(),&list[0]); -} void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, void *xmit, int dest, @@ -173,12 +151,17 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis int from, int bytes) { - std::vector reqs(0); - SendToRecvFromInit(reqs,xmit,dest,recv,from,bytes); - SendToRecvFromBegin(reqs); - for(int i=0;i &list) { diff --git a/lib/communicator/Communicator_none.cc b/lib/communicator/Communicator_none.cc index 2873462a..8601255a 100644 --- a/lib/communicator/Communicator_none.cc +++ b/lib/communicator/Communicator_none.cc @@ -84,19 +84,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis { assert(0); } -void CartesianCommunicator::SendToRecvFromInit(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) -{ - assert(0); -} -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list) -{ - assert(0); -} void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) { assert(0); diff --git a/lib/communicator/Communicator_shmem.cc b/lib/communicator/Communicator_shmem.cc index c415b13d..091e266e 100644 --- a/lib/communicator/Communicator_shmem.cc +++ b/lib/communicator/Communicator_shmem.cc @@ -268,10 +268,6 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, } // Basic Halo comms primitive -void CartesianCommunicator::SendToRecvFromBegin(std::vector &list) -{ - assert(0); //unimplemented -} void CartesianCommunicator::SendToRecvFromBegin(std::vector &list, void *xmit, int dest, @@ -284,15 +280,6 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector &lis // shmem_putmem_nb(recv,xmit,bytes,dest,NULL); shmem_putmem(recv,xmit,bytes,dest); } -void CartesianCommunicator::SendToRecvFromInit(std::vector &list, - void *xmit, - int dest, - void *recv, - int from, - int bytes) -{ - assert(0); // Unimplemented -} void CartesianCommunicator::SendToRecvFromComplete(std::vector &list) { // shmem_quiet(); // I'm done From 5a68715be34f70594d0c08643afe0a2d0c4316be Mon Sep 17 00:00:00 2001 From: paboyle Date: Fri, 5 Aug 2016 10:51:57 +0100 Subject: [PATCH 04/29] Richards sweep test --- benchmarks/Benchmark_wilson_sweep.cc | 117 +++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 benchmarks/Benchmark_wilson_sweep.cc diff --git a/benchmarks/Benchmark_wilson_sweep.cc b/benchmarks/Benchmark_wilson_sweep.cc new file mode 100644 index 00000000..d7c712d3 --- /dev/null +++ b/benchmarks/Benchmark_wilson_sweep.cc @@ -0,0 +1,117 @@ +/************************************************************************************* + Grid physics library, www.github.com/paboyle/Grid + Source file: ./benchmarks/Benchmark_wilson.cc + Copyright (C) 2015 +Author: Peter Boyle +Author: paboyle +Author: Richard Rollins + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + See the full license in the file "LICENSE" in the top level distribution directory +*************************************************************************************/ +/* END LEGAL */ +#include + +using namespace std; +using namespace Grid; +using namespace Grid::QCD; + +template +struct scal { + d internal; +}; + +Gamma::GammaMatrix Gmu [] = { + Gamma::GammaX, + Gamma::GammaY, + Gamma::GammaZ, + Gamma::GammaT +}; + +bool overlapComms = false; + +void bench_wilson ( + LatticeFermion & src, + LatticeFermion & result, + WilsonFermionR & Dw, + double const volume, + int const dag ); + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; } + typename WilsonFermionR::ImplParams params; + params.overlapCommsCompute = overlapComms; + + std::vector simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); + std::vector mpi_layout = GridDefaultMpi(); + std::vector seeds({1,2,3,4}); + RealD mass = 0.1; + + std::cout< latt_size = std::vector(4,L); + for(int d=4; d>dmin; d--) + { + if ( d<=3 ) { latt_size[d] *= 2; } + + std::cout << GridLogMessage; + std::copy( latt_size.begin(), --latt_size.end(), std::ostream_iterator( std::cout, std::string("x").c_str() ) ); + std::cout << latt_size.back() << "\t\t"; + + GridCartesian Grid(latt_size,simd_layout,mpi_layout); + GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout); + + GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds); + LatticeGaugeField Umu(&Grid); random(pRNG,Umu); + LatticeFermion src(&Grid); random(pRNG,src); + LatticeFermion result(&Grid); result=zero; + + double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies()); + + WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params); + + bench_wilson(src,result,Dw,volume,DaggerNo); + bench_wilson(src,result,Dw,volume,DaggerYes); + std::cout << std::endl; + } + } + + std::cout< Date: Fri, 5 Aug 2016 13:41:52 +0100 Subject: [PATCH 05/29] first try at including MPI tests in Travis --- .travis.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.travis.yml b/.travis.yml index 0733d037..c7c10d24 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,6 +23,8 @@ matrix: - libmpfr-dev - libgmp-dev - libmpc-dev + - libopenmpi-dev + - openmpi-bin - binutils-dev env: VERSION=-4.9 - compiler: gcc @@ -35,6 +37,8 @@ matrix: - libmpfr-dev - libgmp-dev - libmpc-dev + - libopenmpi-dev + - openmpi-bin - binutils-dev env: VERSION=-5 - compiler: clang @@ -47,6 +51,8 @@ matrix: - libmpfr-dev - libgmp-dev - libmpc-dev + - libopenmpi-dev + - openmpi-bin - binutils-dev env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz - compiler: clang @@ -59,6 +65,8 @@ matrix: - libmpfr-dev - libgmp-dev - libmpc-dev + - libopenmpi-dev + - openmpi-bin - binutils-dev env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz @@ -69,6 +77,7 @@ before_install: - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi install: @@ -92,3 +101,8 @@ script: - ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 + - echo make clean + - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto + - make -j4 + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1; fi From 8dc2cfcedbf635fced78845d86aeec2a00fe0239 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 6 Aug 2016 00:28:28 +0100 Subject: [PATCH 06/29] Adding fftw header pulling --- bootstrap.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bootstrap.sh b/bootstrap.sh index 5e9d4041..461eb121 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -1,11 +1,18 @@ #!/usr/bin/env bash EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2' +FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz echo "-- deploying Eigen source..." wget ${EIGEN_URL} ./scripts/update_eigen.sh `basename ${EIGEN_URL}` rm `basename ${EIGEN_URL}` + +echo "-- copying fftw prototypes..." +wget ${FFTW_URL} +./scripts/update_fftw.sh `basename ${FFTW_URL}` +rm `basename ${FFTW_URL}` + echo '-- generating Make.inc files...' ./scripts/filelist echo '-- generating configure script...' From fc25d2295cdaaed7ce5e7d629ef2a30696b0d12e Mon Sep 17 00:00:00 2001 From: paboyle Date: Sat, 6 Aug 2016 00:28:52 +0100 Subject: [PATCH 07/29] fftw download --- scripts/update_fftw.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 scripts/update_fftw.sh diff --git a/scripts/update_fftw.sh b/scripts/update_fftw.sh new file mode 100755 index 00000000..20e42080 --- /dev/null +++ b/scripts/update_fftw.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +if (( $# != 1 )); then + echo "usage: `basename $0` " 1>&2 + exit 1 +fi +ARC=$1 + +INITDIR=`pwd` +rm -rf lib/fftw +mkdir lib/fftw + +ARCDIR=`tar -tf ${ARC} | head -n1 | sed -e 's@/.*@@'` +tar -xf ${ARC} +cp ${ARCDIR}/api/fftw3.h lib/fftw/ + +cd ${INITDIR} +rm -rf ${ARCDIR} From ec68e08dd26c1dab129e5650fa8d93c9a3e60394 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Sat, 6 Aug 2016 00:36:05 +0100 Subject: [PATCH 08/29] Travis MPI fix --- .travis.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index c7c10d24..2fe52a68 100644 --- a/.travis.yml +++ b/.travis.yml @@ -102,7 +102,8 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 - echo make clean + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED'; fi - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto - make -j4 - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1; fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi From 15218ec57fdff7e6b4af10a0c730bbddac5c2e06 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Sat, 6 Aug 2016 00:49:14 +0100 Subject: [PATCH 09/29] more Travis MPI fix --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2fe52a68..73430b4d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -102,7 +102,7 @@ script: - make -j4 - ./benchmarks/Benchmark_dwf --threads 1 - echo make clean - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED'; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto - make -j4 - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi From 573b8c602072f8daa7596969f19dda69252b41aa Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Sat, 6 Aug 2016 01:26:24 +0100 Subject: [PATCH 10/29] build system: -O3 is not overriden by env CXXFLAGS --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 89bb3568..cdcd9d58 100644 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) ############### Checks for programs AC_LANG(C++) -: ${CXXFLAGS="-O3"} +CXXFLAGS="-O3 $CXXFLAGS" AC_PROG_CXX AC_OPENMP AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS" From 90e70790f312bff33a397d78a5b133c7f3d7a136 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 15 Aug 2016 22:31:29 +0100 Subject: [PATCH 11/29] Feature for z-Mobius prep --- configure.ac | 2 +- lib/qcd/action/Actions.h | 12 ++- lib/qcd/action/fermion/CayleyFermion5D.cc | 58 ++++++++------ lib/qcd/action/fermion/CayleyFermion5D.h | 49 ++++++------ .../action/fermion/CayleyFermion5Dcache.cc | 14 ++-- lib/qcd/action/fermion/CayleyFermion5Dssp.cc | 12 +-- lib/qcd/action/fermion/CayleyFermion5Dvec.cc | 24 +++--- lib/qcd/action/fermion/FermionOperatorImpl.h | 25 +++++- lib/qcd/action/fermion/WilsonKernels.cc | 23 +++--- lib/qcd/action/fermion/WilsonKernels.h | 4 + lib/qcd/action/fermion/WilsonKernelsAsm.cc | 20 +++++ lib/qcd/action/fermion/WilsonKernelsAsmBody.h | 65 ++++++++++++++- lib/qcd/action/fermion/ZMobiusFermion.h | 79 +++++++++++++++++++ tests/debug/Test_cayley_even_odd_vec.cc | 12 +++ 14 files changed, 312 insertions(+), 87 deletions(-) create mode 100644 lib/qcd/action/fermion/ZMobiusFermion.h diff --git a/configure.ac b/configure.ac index cdcd9d58..2e46a411 100644 --- a/configure.ac +++ b/configure.ac @@ -12,7 +12,7 @@ CXXFLAGS="-O3 $CXXFLAGS" AC_PROG_CXX AC_OPENMP AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS" -LT_INIT([disable-shared]) +LT_INIT ############### Checks for header files AC_CHECK_HEADERS(stdint.h) diff --git a/lib/qcd/action/Actions.h b/lib/qcd/action/Actions.h index be08cdde..1ad3a584 100644 --- a/lib/qcd/action/Actions.h +++ b/lib/qcd/action/Actions.h @@ -111,12 +111,16 @@ typedef SymanzikGaugeAction ConjugateSymanzikGaugeAction #define FermOp4dVecTemplateInstantiate(A) \ template class A; \ template class A; \ + template class A; \ + template class A; \ template class A; \ template class A; #define FermOp5dVecTemplateInstantiate(A) \ template class A; \ - template class A; + template class A; \ + template class A; \ + template class A; #define FermOpTemplateInstantiate(A) \ FermOp4dVecTemplateInstantiate(A) \ @@ -138,6 +142,7 @@ typedef SymanzikGaugeAction ConjugateSymanzikGaugeAction #include #include #include +#include #include #include #include @@ -176,6 +181,11 @@ typedef DomainWallFermion DomainWallFermionD; typedef MobiusFermion MobiusFermionR; typedef MobiusFermion MobiusFermionF; typedef MobiusFermion MobiusFermionD; + +typedef ZMobiusFermion ZMobiusFermionR; +typedef ZMobiusFermion ZMobiusFermionF; +typedef ZMobiusFermion ZMobiusFermionD; + typedef ScaledShamirFermion ScaledShamirFermionR; typedef ScaledShamirFermion ScaledShamirFermionF; typedef ScaledShamirFermion ScaledShamirFermionD; diff --git a/lib/qcd/action/fermion/CayleyFermion5D.cc b/lib/qcd/action/fermion/CayleyFermion5D.cc index 066cede4..c4196043 100644 --- a/lib/qcd/action/fermion/CayleyFermion5D.cc +++ b/lib/qcd/action/fermion/CayleyFermion5D.cc @@ -54,18 +54,18 @@ template void CayleyFermion5D::M5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag (Ls,1.0); - std::vector upper(Ls,-1.0); upper[Ls-1]=mass; - std::vector lower(Ls,-1.0); lower[0] =mass; + std::vector diag (Ls,1.0); + std::vector upper(Ls,-1.0); upper[Ls-1]=mass; + std::vector lower(Ls,-1.0); lower[0] =mass; M5D(psi,chi,chi,lower,diag,upper); } template void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - std::vector diag = bs; - std::vector upper= cs; - std::vector lower= cs; + std::vector diag = bs; + std::vector upper= cs; + std::vector lower= cs; upper[Ls-1]=-mass*upper[Ls-1]; lower[0] =-mass*lower[0]; M5D(psi,psi,Din,lower,diag,upper); @@ -73,9 +73,9 @@ void CayleyFermion5D::Meooe5D (const FermionField &psi, FermionField &D template void CayleyFermion5D::Meo5D (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag = beo; - std::vector upper(Ls); - std::vector lower(Ls); + std::vector diag = beo; + std::vector upper(Ls); + std::vector lower(Ls); for(int i=0;i void CayleyFermion5D::Mooee (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag = bee; - std::vector upper(Ls); - std::vector lower(Ls); + std::vector diag = bee; + std::vector upper(Ls); + std::vector lower(Ls); for(int i=0;i void CayleyFermion5D::MooeeDag (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag = bee; - std::vector upper(Ls); - std::vector lower(Ls); + std::vector diag = bee; + std::vector upper(Ls); + std::vector lower(Ls); for (int s=0;s void CayleyFermion5D::M5Ddag (const FermionField &psi, FermionField &chi) { int Ls=this->Ls; - std::vector diag(Ls,1.0); - std::vector upper(Ls,-1.0); - std::vector lower(Ls,-1.0); + std::vector diag(Ls,1.0); + std::vector upper(Ls,-1.0); + std::vector lower(Ls,-1.0); upper[Ls-1]=-mass*upper[Ls-1]; lower[0] =-mass*lower[0]; M5Ddag(psi,chi,chi,lower,diag,upper); @@ -141,9 +141,9 @@ template void CayleyFermion5D::MeooeDag5D (const FermionField &psi, FermionField &Din) { int Ls=this->Ls; - std::vector diag =bs; - std::vector upper=cs; - std::vector lower=cs; + std::vector diag =bs; + std::vector upper=cs; + std::vector lower=cs; upper[Ls-1]=-mass*upper[Ls-1]; lower[0] =-mass*lower[0]; M5Ddag(psi,psi,Din,lower,diag,upper); @@ -273,11 +273,21 @@ void CayleyFermion5D::MeoDeriv(GaugeField &mat,const FermionField &U,const template void CayleyFermion5D::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) { - SetCoefficientsZolotarev(1.0,zdata,b,c); + std::vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(1.0,gamma,b,c); } //Zolo template void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) +{ + std::vector gamma(this->Ls); + for(int s=0;sLs;s++) gamma[s] = zdata->gamma[s]; + SetCoefficientsInternal(zolo_hi,gamma,b,c); +} +//Zolo +template +void CayleyFermion5D::SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c) { int Ls=this->Ls; @@ -315,7 +325,7 @@ void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot double bmc = b-c; for(int i=0; i < Ls; i++){ as[i] = 1.0; - omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code + omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code bs[i] = 0.5*(bpc/omega[i] + bmc); cs[i] = 0.5*(bpc/omega[i] - bmc); } @@ -377,7 +387,7 @@ void CayleyFermion5D::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolot } { - double delta_d=mass*cee[Ls-1]; + Coeff_t delta_d=mass*cee[Ls-1]; for(int j=0;j &lower, - std::vector &diag, - std::vector &upper); + std::vector &lower, + std::vector &diag, + std::vector &upper); void M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper); + std::vector &lower, + std::vector &diag, + std::vector &upper); void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv); virtual void Instantiatable(void)=0; @@ -91,23 +91,23 @@ namespace Grid { RealD mass; // Cayley form Moebius (tanh and zolotarev) - std::vector omega; - std::vector bs; // S dependent coeffs - std::vector cs; - std::vector as; + std::vector omega; + std::vector bs; // S dependent coeffs + std::vector cs; + std::vector as; // For preconditioning Cayley form - std::vector bee; - std::vector cee; - std::vector aee; - std::vector beo; - std::vector ceo; - std::vector aeo; + std::vector bee; + std::vector cee; + std::vector aee; + std::vector beo; + std::vector ceo; + std::vector aeo; // LDU factorisation of the eeoo matrix - std::vector lee; - std::vector leem; - std::vector uee; - std::vector ueem; - std::vector dee; + std::vector lee; + std::vector leem; + std::vector uee; + std::vector ueem; + std::vector dee; // Constructors CayleyFermion5D(GaugeField &_Umu, @@ -117,20 +117,19 @@ namespace Grid { GridRedBlackCartesian &FourDimRedBlackGrid, RealD _mass,RealD _M5,const ImplParams &p= ImplParams()); - - protected: void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); + void SetCoefficientsInternal(RealD zolo_hi,std::vector & gamma,RealD b,RealD c); }; } } #define INSTANTIATE_DPERP(A)\ template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\ - std::vector &lower,std::vector &diag,std::vector &upper); \ + std::vector &lower,std::vector &diag,std::vector &upper); \ template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\ - std::vector &lower,std::vector &diag,std::vector &upper); \ + std::vector &lower,std::vector &diag,std::vector &upper); \ template void CayleyFermion5D< A >::MooeeInv (const FermionField &psi, FermionField &chi); \ template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi); diff --git a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc index 4791e7a2..62e91dd4 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dcache.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dcache.cc @@ -43,9 +43,9 @@ template void CayleyFermion5D::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { int Ls =this->Ls; GridBase *grid=psi._grid; @@ -82,9 +82,9 @@ template void CayleyFermion5D::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { int Ls =this->Ls; GridBase *grid=psi._grid; @@ -204,6 +204,8 @@ PARALLEL_FOR_LOOP INSTANTIATE_DPERP(WilsonImplD); INSTANTIATE_DPERP(GparityWilsonImplF); INSTANTIATE_DPERP(GparityWilsonImplD); + INSTANTIATE_DPERP(ZWilsonImplF); + INSTANTIATE_DPERP(ZWilsonImplD); #endif }} diff --git a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc index a2c96d87..ad7daddb 100644 --- a/lib/qcd/action/fermion/CayleyFermion5Dssp.cc +++ b/lib/qcd/action/fermion/CayleyFermion5Dssp.cc @@ -43,9 +43,9 @@ template void CayleyFermion5D::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { int Ls=this->Ls; for(int s=0;s void CayleyFermion5D::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { int Ls=this->Ls; for(int s=0;s void CayleyFermion5D::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { GridBase *grid=psi._grid; int Ls = this->Ls; @@ -121,9 +121,9 @@ template void CayleyFermion5D::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi, - std::vector &lower, - std::vector &diag, - std::vector &upper) + std::vector &lower, + std::vector &diag, + std::vector &upper) { GridBase *grid=psi._grid; int Ls = this->Ls; @@ -194,8 +194,8 @@ void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField chi.checkerboard=psi.checkerboard; - Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls); - Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls); + Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls); + Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls); for(int s=0;s::MooeeInternal(const FermionField &psi, FermionField Pplus (0,Ls-1) = mass*cee[0]; Pminus(Ls-1,0) = mass*cee[Ls-1]; - Eigen::MatrixXd PplusMat ; - Eigen::MatrixXd PminusMat; + Eigen::MatrixXcd PplusMat ; + Eigen::MatrixXcd PminusMat; if ( inv ) { PplusMat =Pplus.inverse(); @@ -298,8 +298,12 @@ PARALLEL_FOR_LOOP INSTANTIATE_DPERP(DomainWallVec5dImplD); INSTANTIATE_DPERP(DomainWallVec5dImplF); +INSTANTIATE_DPERP(ZDomainWallVec5dImplD); +INSTANTIATE_DPERP(ZDomainWallVec5dImplF); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); +template void CayleyFermion5D::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv); }} diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index 4126d185..b756936b 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -100,7 +100,8 @@ namespace Grid { typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ typedef typename Impl::Compressor Compressor; \ typedef typename Impl::StencilImpl StencilImpl; \ - typedef typename Impl::ImplParams ImplParams; + typedef typename Impl::ImplParams ImplParams; \ + typedef typename Impl::Coeff_t Coeff_t; #define INHERIT_IMPL_TYPES(Base) \ INHERIT_GIMPL_TYPES(Base)\ @@ -109,12 +110,14 @@ namespace Grid { /////// // Single flavour four spinors with colour index /////// - template + template class WilsonImpl : public PeriodicGaugeImpl< GaugeImplTypes< S, Nrepresentation> > { public: + const bool LsVectorised=false; + typedef _Coeff_t Coeff_t; typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl; INHERIT_GIMPL_TYPES(Gimpl); @@ -192,12 +195,13 @@ PARALLEL_FOR_LOOP /////// // Single flavour four spinors with colour index, 5d redblack /////// - template + template class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { public: const bool LsVectorised=true; + typedef _Coeff_t Coeff_t; typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl; INHERIT_GIMPL_TYPES(Gimpl); @@ -287,12 +291,13 @@ PARALLEL_FOR_LOOP // Flavour doubled spinors; is Gparity the only? what about C*? //////////////////////////////////////////////////////////////////////////////////////// - template + template class GparityWilsonImpl : public ConjugateGaugeImpl< GaugeImplTypes >{ public: const bool LsVectorised=false; + typedef _Coeff_t Coeff_t; typedef ConjugateGaugeImpl< GaugeImplTypes > Gimpl; INHERIT_GIMPL_TYPES(Gimpl); @@ -483,6 +488,18 @@ PARALLEL_FOR_LOOP typedef WilsonImpl WilsonImplF; // Float typedef WilsonImpl WilsonImplD; // Double + typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec + typedef WilsonImpl ZWilsonImplF; // Float + typedef WilsonImpl ZWilsonImplD; // Double + + typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double + + typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float + typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double + typedef DomainWallVec5dImpl DomainWallVec5dImplR; // Real.. whichever prec typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double diff --git a/lib/qcd/action/fermion/WilsonKernels.cc b/lib/qcd/action/fermion/WilsonKernels.cc index d644f6ef..a12a2b16 100644 --- a/lib/qcd/action/fermion/WilsonKernels.cc +++ b/lib/qcd/action/fermion/WilsonKernels.cc @@ -68,16 +68,21 @@ void WilsonKernels::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo, std::vector > &buf, int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out) { - // No asm implementation yet. - // if ( AsmOpt ) WilsonKernels::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out); - // else - for(int site=0;site::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); - else WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); - sF++; +#ifdef AVX512 + if ( AsmOpt ) { + WilsonKernels::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); + } else { +#else + { +#endif + for(int site=0;site::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + else WilsonKernels::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); + sF++; + } + sU++; } - sU++; } } diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index 231fa293..b679d3f9 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -79,6 +79,10 @@ namespace Grid { std::vector > &buf, int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out); + void DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out); + void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, std::vector > &buf, diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index 83871b7b..b443ccf9 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -73,12 +73,21 @@ static int signInit = setupSigns(); #define MAYBEPERM(A,perm) if (perm) { A ; } #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define FX(A) WILSONASM_ ##A + +#undef KERNEL_DAG template<> void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +#define KERNEL_DAG +template<> +void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #undef VMOVIDUP #undef VMOVRDUP #undef MAYBEPERM @@ -89,14 +98,25 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrd #define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) #define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) + +#undef KERNEL_DAG template<> void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include +#define KERNEL_DAG +template<> +void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +#include + #endif + + template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); diff --git a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h index 4f3ef861..d236a774 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsmBody.h +++ b/lib/qcd/action/fermion/WilsonKernelsAsmBody.h @@ -30,7 +30,11 @@ basep = st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); +#ifdef KERNEL_DAG + XP_PROJMEM(base); +#else XM_PROJMEM(base); +#endif MAYBEPERM(PERMUTE_DIR3,perm); } else { LOAD_CHI(base); @@ -41,15 +45,22 @@ MULT_2SPIN_DIR_PFXP(Xp,basep); } LOAD64(%r10,isigns); +#ifdef KERNEL_DAG + XP_RECON; +#else XM_RECON; - +#endif //////////////////////////////// // Yp //////////////////////////////// basep = st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + YP_PROJMEM(base); +#else YM_PROJMEM(base); +#endif MAYBEPERM(PERMUTE_DIR2,perm); } else { LOAD_CHI(base); @@ -60,7 +71,11 @@ MULT_2SPIN_DIR_PFYP(Yp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + YP_RECON_ACCUM; +#else YM_RECON_ACCUM; +#endif //////////////////////////////// // Zp @@ -68,7 +83,11 @@ basep = st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + ZP_PROJMEM(base); +#else ZM_PROJMEM(base); +#endif MAYBEPERM(PERMUTE_DIR1,perm); } else { LOAD_CHI(base); @@ -79,7 +98,11 @@ MULT_2SPIN_DIR_PFZP(Zp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + ZP_RECON_ACCUM; +#else ZM_RECON_ACCUM; +#endif //////////////////////////////// // Tp @@ -87,7 +110,11 @@ basep = st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + TP_PROJMEM(base); +#else TM_PROJMEM(base); +#endif MAYBEPERM(PERMUTE_DIR0,perm); } else { LOAD_CHI(base); @@ -98,7 +125,11 @@ MULT_2SPIN_DIR_PFTP(Tp,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + TP_RECON_ACCUM; +#else TM_RECON_ACCUM; +#endif //////////////////////////////// // Xm @@ -107,7 +138,11 @@ // basep= st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + XM_PROJMEM(base); +#else XP_PROJMEM(base); +#endif MAYBEPERM(PERMUTE_DIR3,perm); } else { LOAD_CHI(base); @@ -118,7 +153,11 @@ MULT_2SPIN_DIR_PFXM(Xm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + XM_RECON_ACCUM; +#else XP_RECON_ACCUM; +#endif //////////////////////////////// // Ym @@ -126,7 +165,11 @@ basep= st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + YM_PROJMEM(base); +#else YP_PROJMEM(base); +#endif MAYBEPERM(PERMUTE_DIR2,perm); } else { LOAD_CHI(base); @@ -137,7 +180,11 @@ MULT_2SPIN_DIR_PFYM(Ym,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + YM_RECON_ACCUM; +#else YP_RECON_ACCUM; +#endif //////////////////////////////// // Zm @@ -145,7 +192,11 @@ basep= st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + ZM_PROJMEM(base); +#else ZP_PROJMEM(base); +#endif MAYBEPERM(PERMUTE_DIR1,perm); } else { LOAD_CHI(base); @@ -156,7 +207,11 @@ MULT_2SPIN_DIR_PFZM(Zm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + ZM_RECON_ACCUM; +#else ZP_RECON_ACCUM; +#endif //////////////////////////////// // Tm @@ -164,7 +219,11 @@ basep= st.GetPFInfo(nent,plocal); nent++; if ( local ) { LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + TM_PROJMEM(base); +#else TP_PROJMEM(base); +#endif MAYBEPERM(PERMUTE_DIR0,perm); } else { LOAD_CHI(base); @@ -175,7 +234,11 @@ MULT_2SPIN_DIR_PFTM(Tm,basep); } LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit +#ifdef KERNEL_DAG + TM_RECON_ACCUM; +#else TP_RECON_ACCUM; +#endif basep= st.GetPFInfo(nent,plocal); nent++; SAVE_RESULT(base,basep); diff --git a/lib/qcd/action/fermion/ZMobiusFermion.h b/lib/qcd/action/fermion/ZMobiusFermion.h new file mode 100644 index 00000000..d0e00657 --- /dev/null +++ b/lib/qcd/action/fermion/ZMobiusFermion.h @@ -0,0 +1,79 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/qcd/action/fermion/MobiusFermion.h + + Copyright (C) 2015 + +Author: Peter Boyle +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef GRID_QCD_ZMOBIUS_FERMION_H +#define GRID_QCD_ZMOBIUS_FERMION_H + +#include + +namespace Grid { + + namespace QCD { + + template + class ZMobiusFermion : public CayleyFermion5D + { + public: + INHERIT_IMPL_TYPES(Impl); + public: + + virtual void Instantiatable(void) {}; + // Constructors + ZMobiusFermion(GaugeField &_Umu, + GridCartesian &FiveDimGrid, + GridRedBlackCartesian &FiveDimRedBlackGrid, + GridCartesian &FourDimGrid, + GridRedBlackCartesian &FourDimRedBlackGrid, + RealD _mass,RealD _M5, + std::vector &gamma, RealD b,RealD c,const ImplParams &p= ImplParams()) : + + CayleyFermion5D(_Umu, + FiveDimGrid, + FiveDimRedBlackGrid, + FourDimGrid, + FourDimRedBlackGrid,_mass,_M5,p) + + { + RealD eps = 1.0; + + std::cout< zgamma(this->Ls); + for(int s=0;sLs;s++){ + zgamma[s] = gamma[s]; + } + + // Call base setter + this->SetCoefficientsInternal(1.0,zgamma,b,c); + } + + }; + + } +} + +#endif diff --git a/tests/debug/Test_cayley_even_odd_vec.cc b/tests/debug/Test_cayley_even_odd_vec.cc index 68315b88..f8600782 100644 --- a/tests/debug/Test_cayley_even_odd_vec.cc +++ b/tests/debug/Test_cayley_even_odd_vec.cc @@ -44,6 +44,7 @@ struct scal { }; typedef DomainWallFermion DomainWallVecFermionR; +typedef ZMobiusFermion ZMobiusVecFermionR; typedef MobiusFermion MobiusVecFermionR; typedef MobiusZolotarevFermion MobiusZolotarevVecFermionR; typedef ScaledShamirFermion ScaledShamirVecFermionR; @@ -117,6 +118,17 @@ int main (int argc, char ** argv) TestWhat(Dmob,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5); TestWhat(sDmob,sFGrid,sFrbGrid,sUGrid,mass,M5,&sRNG4,&sRNG5); + + std::cout< gamma(Ls,std::complex(1.0,0.0)); + ZMobiusFermionR zDmob(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,gamma,b,c); + ZMobiusVecFermionR szDmob(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5,gamma,b,c); + TestMoo(zDmob,szDmob); + TestWhat(zDmob,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5); + TestWhat(szDmob,sFGrid,sFrbGrid,sUGrid,mass,M5,&sRNG4,&sRNG5); + std::cout< Date: Mon, 15 Aug 2016 23:00:40 +0100 Subject: [PATCH 12/29] Instantiate --- lib/qcd/action/fermion/FermionOperatorImpl.h | 4 +- lib/qcd/action/fermion/WilsonKernelsHand.cc | 55 ++++++-------------- 2 files changed, 18 insertions(+), 41 deletions(-) diff --git a/lib/qcd/action/fermion/FermionOperatorImpl.h b/lib/qcd/action/fermion/FermionOperatorImpl.h index b756936b..6ca72b93 100644 --- a/lib/qcd/action/fermion/FermionOperatorImpl.h +++ b/lib/qcd/action/fermion/FermionOperatorImpl.h @@ -488,7 +488,7 @@ PARALLEL_FOR_LOOP typedef WilsonImpl WilsonImplF; // Float typedef WilsonImpl WilsonImplD; // Double - typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec + typedef WilsonImpl ZWilsonImplR; // Real.. whichever prec typedef WilsonImpl ZWilsonImplF; // Float typedef WilsonImpl ZWilsonImplD; // Double @@ -496,7 +496,7 @@ PARALLEL_FOR_LOOP typedef DomainWallVec5dImpl DomainWallVec5dImplF; // Float typedef DomainWallVec5dImpl DomainWallVec5dImplD; // Double - typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec + typedef DomainWallVec5dImpl ZDomainWallVec5dImplR; // Real.. whichever prec typedef DomainWallVec5dImpl ZDomainWallVec5dImplF; // Float typedef DomainWallVec5dImpl ZDomainWallVec5dImplD; // Double diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index d073e677..11b22209 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -839,46 +839,23 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st, ////////////// Wilson ; uses this implementation ///////////////////// // Need Nc=3 though // -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, +#define INSTANTIATE_THEM(A) \ +template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ + std::vector > &buf,\ + int ss,int sU,const FermionField &in, FermionField &out);\ +template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ + std::vector > &buf,\ int ss,int sU,const FermionField &in, FermionField &out); - -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); - - -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int sU,const FermionField &in, FermionField &out); - +INSTANTIATE_THEM(WilsonImplF); +INSTANTIATE_THEM(WilsonImplD); +INSTANTIATE_THEM(ZWilsonImplF); +INSTANTIATE_THEM(ZWilsonImplD); +INSTANTIATE_THEM(GparityWilsonImplF); +INSTANTIATE_THEM(GparityWilsonImplD); +INSTANTIATE_THEM(DomainWallVec5dImplF); +INSTANTIATE_THEM(DomainWallVec5dImplD); +INSTANTIATE_THEM(ZDomainWallVec5dImplF); +INSTANTIATE_THEM(ZDomainWallVec5dImplD); }} From 17097a93ece0df40fbc09877245ac6298d198f92 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 17 Aug 2016 01:33:55 +0100 Subject: [PATCH 13/29] FFTW test ran over 4 mpi processes. --- lib/FFT.h | 198 +++++++++++++++++++++++++++++++++ lib/Grid.h | 4 +- lib/lattice/Lattice_transfer.h | 75 ++++++++++++- lib/simd/Grid_vector_types.h | 6 + lib/simd/Grid_vector_unops.h | 10 +- lib/tensors/Tensor_unary.h | 1 + tests/core/Test_fft.cc | 87 +++++++++++++++ 7 files changed, 372 insertions(+), 9 deletions(-) create mode 100644 lib/FFT.h create mode 100644 tests/core/Test_fft.cc diff --git a/lib/FFT.h b/lib/FFT.h new file mode 100644 index 00000000..262e525b --- /dev/null +++ b/lib/FFT.h @@ -0,0 +1,198 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/Cshift.h + + Copyright (C) 2015 + +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#ifndef _GRID_FFT_H_ +#define _GRID_FFT_H_ + +#include + +namespace Grid { + + + class FFT { + private: + + GridCartesian *vgrid; + GridCartesian *sgrid; + + int Nd; + std::vector dimensions; + std::vector processors; + std::vector processor_coor; + + public: + + static const int forward=FFTW_FORWARD; + static const int backward=FFTW_BACKWARD; + + FFT ( GridCartesian * grid ) : + vgrid(grid), + Nd(grid->_ndimension), + dimensions(grid->_fdimensions), + processors(grid->_processors), + processor_coor(grid->_processor_coor) + { + std::vector layout(Nd,1); + sgrid = new GridCartesian(dimensions,layout,processors); + }; + + ~FFT ( void) { + delete sgrid; + } + + template + void FFT_dim(Lattice &result,const Lattice &source,int dim, int inverse){ + + conformable(result._grid,vgrid); + conformable(source._grid,vgrid); + + int L = vgrid->_ldimensions[dim]; + int G = vgrid->_fdimensions[dim]; + + std::vector layout(Nd,1); + std::vector pencil_gd(vgrid->_fdimensions); + std::vector pencil_ld(processors); + + pencil_gd[dim] = G*processors[dim]; + pencil_ld[dim] = G*processors[dim]; + + // Pencil global vol LxLxGxLxL per node + GridCartesian pencil_g(pencil_gd,layout,processors); + GridCartesian pencil_l(pencil_ld,layout,processors); + + // Construct pencils + typedef typename vobj::scalar_object sobj; + Lattice ssource(vgrid); ssource =source; + Lattice pgsource(&pencil_g); + Lattice pgresult(&pencil_g); + Lattice plsource(&pencil_l); + Lattice plresult(&pencil_l); + + { + + assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD)); + assert(sizeof(fftw_complex)==sizeof(ComplexD)); + assert(sizeof(fftw_complex)==sizeof(ComplexD)); + + int Ncomp = sizeof(sobj)/sizeof(fftw_complex); + + std::cout << "Ncomp = "<lSites();idx++) { + + std::vector lcoor(Nd); + sgrid->LocalIndexToLocalCoor(idx,lcoor); + + sobj s; + + peekLocalSite(s,ssource,lcoor); + + lcoor[dim]+=p*L; + + pokeLocalSite(s,pgsource,lcoor); + } + + ssource = Cshift(ssource,dim,L); + } + + std::cout << " pgsource pencil " << pgsource<lSites();idx++) { + + std::vector pcoor(Nd,0); + std::vector lcoor(Nd); + sgrid->LocalIndexToLocalCoor(idx,lcoor); + + if ( lcoor[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0 + + // Project to local pencil array + for(int l=0;l #include #include #include +#include #include #include #include @@ -78,7 +79,8 @@ Author: paboyle #include #include #include -#include + +#include #include #include diff --git a/lib/lattice/Lattice_transfer.h b/lib/lattice/Lattice_transfer.h index 2fa72014..cc4617de 100644 --- a/lib/lattice/Lattice_transfer.h +++ b/lib/lattice/Lattice_transfer.h @@ -349,7 +349,7 @@ void localConvert(const Lattice &in,Lattice &out) assert(ig->_ldimensions[d] == og->_ldimensions[d]); } -PARALLEL_FOR_LOOP + //PARALLEL_FOR_LOOP for(int idx=0;idxlSites();idx++){ std::vector lcoor(ni); ig->LocalIndexToLocalCoor(idx,lcoor); @@ -446,6 +446,79 @@ void ExtractSlice(Lattice &lowDim, Lattice & higherDim,int slice, in } + +template +void InsertSliceLocal(Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) +{ + typedef typename vobj::scalar_object sobj; + sobj s; + + GridBase *lg = lowDim._grid; + GridBase *hg = higherDim._grid; + int nl = lg->_ndimension; + int nh = hg->_ndimension; + + assert(nl == nh); + assert(orthog=0); + + for(int d=0;d_processors[d] == hg->_processors[d]); + assert(lg->_ldimensions[d] == hg->_ldimensions[d]); + } + + // the above should guarantee that the operations are local + //PARALLEL_FOR_LOOP + for(int idx=0;idxlSites();idx++){ + std::vector lcoor(nl); + std::vector hcoor(nh); + lg->LocalIndexToLocalCoor(idx,lcoor); + if( lcoor[orthog] == slice_lo ) { + hcoor=lcoor; + hcoor[orthog] = slice_hi; + peekLocalSite(s,lowDim,lcoor); + pokeLocalSite(s,higherDim,hcoor); + } + } +} + + +template +void ExtractSliceLocal(Lattice &lowDim, Lattice & higherDim,int slice_lo,int slice_hi, int orthog) +{ + typedef typename vobj::scalar_object sobj; + sobj s; + + GridBase *lg = lowDim._grid; + GridBase *hg = higherDim._grid; + int nl = lg->_ndimension; + int nh = hg->_ndimension; + + assert(nl == nh); + assert(orthog=0); + + for(int d=0;d_processors[d] == hg->_processors[d]); + assert(lg->_ldimensions[d] == hg->_ldimensions[d]); + } + + // the above should guarantee that the operations are local + //PARALLEL_FOR_LOOP + for(int idx=0;idxlSites();idx++){ + std::vector lcoor(nl); + std::vector hcoor(nh); + lg->LocalIndexToLocalCoor(idx,lcoor); + if( lcoor[orthog] == slice_lo ) { + hcoor=lcoor; + hcoor[orthog] = slice_hi; + peekLocalSite(s,higherDim,hcoor); + pokeLocalSite(s,lowDim,lcoor); + } + } +} + + template void Replicate(Lattice &coarse,Lattice & fine) { diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h index 2f2b70c8..30576fb6 100644 --- a/lib/simd/Grid_vector_types.h +++ b/lib/simd/Grid_vector_types.h @@ -388,6 +388,12 @@ class Grid_simd { }; // end of Grid_simd class definition + +inline void permute(ComplexD &y,ComplexD b, int perm) { y=b; } +inline void permute(ComplexF &y,ComplexF b, int perm) { y=b; } +inline void permute(RealD &y,RealD b, int perm) { y=b; } +inline void permute(RealF &y,RealF b, int perm) { y=b; } + //////////////////////////////////////////////////////////////////// // General rotate //////////////////////////////////////////////////////////////////// diff --git a/lib/simd/Grid_vector_unops.h b/lib/simd/Grid_vector_unops.h index a67f4e7d..2afac190 100644 --- a/lib/simd/Grid_vector_unops.h +++ b/lib/simd/Grid_vector_unops.h @@ -67,15 +67,13 @@ template struct AsinRealFunctor { scalar operator()(const scalar &a) const { return asin(real(a)); } }; - template struct LogRealFunctor { scalar operator()(const scalar &a) const { return log(real(a)); } }; - template -struct ExpRealFunctor { - scalar operator()(const scalar &a) const { return exp(real(a)); } +struct ExpFunctor { + scalar operator()(const scalar &a) const { return exp(a); } }; template struct NotFunctor { @@ -85,7 +83,6 @@ template struct AbsRealFunctor { scalar operator()(const scalar &a) const { return std::abs(real(a)); } }; - template struct PowRealFunctor { double y; @@ -135,7 +132,6 @@ template inline Scalar rsqrt(const Scalar &r) { return (RSqrtRealFunctor(), r); } - template inline Grid_simd cos(const Grid_simd &r) { return SimdApply(CosRealFunctor(), r); @@ -162,7 +158,7 @@ inline Grid_simd abs(const Grid_simd &r) { } template inline Grid_simd exp(const Grid_simd &r) { - return SimdApply(ExpRealFunctor(), r); + return SimdApply(ExpFunctor(), r); } template inline Grid_simd Not(const Grid_simd &r) { diff --git a/lib/tensors/Tensor_unary.h b/lib/tensors/Tensor_unary.h index dd05a4a7..92a968df 100644 --- a/lib/tensors/Tensor_unary.h +++ b/lib/tensors/Tensor_unary.h @@ -36,6 +36,7 @@ template inline auto func(const iScalar &z) -> iScalar\ {\ iScalar ret;\ ret._internal = func( (z._internal));\ + std::cout << "Unary "<<#func<<" " << z._internal <<" -> "<< ret._internal <<" "<< typeid(obj).name() < inline auto func(const iVector &z) -> iVector\ diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc new file mode 100644 index 00000000..deed6f0a --- /dev/null +++ b/tests/core/Test_fft.cc @@ -0,0 +1,87 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_cshift.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + std::vector latt_size = GridDefaultLatt(); + std::vector simd_layout( { vComplexD::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + GridCartesian Fine(latt_size,simd_layout,mpi_layout); + + LatticeComplexD one(&Fine); + LatticeComplexD zz(&Fine); + LatticeComplexD C(&Fine); + LatticeComplexD Ctilde(&Fine); + LatticeComplexD coor(&Fine); + + std::vector p({1.0,2.0,3.0,2.0}); + + one = ComplexD(1.0,0.0); + zz = ComplexD(0.0,0.0); + + ComplexD ci(0.0,1.0); + + C=zero; + for(int mu=0;mu<4;mu++){ + RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; + LatticeCoordinate(coor,mu); + C = C - TwoPiL * p[mu] * coor; + } + + std::cout << GridLogMessage<< " C " << C< Date: Thu, 18 Aug 2016 02:23:21 +0100 Subject: [PATCH 14/29] FFT improved and test_FFT passing under MPI 8 processes, 8^4 for LatticeComplexD and LatticeSpinMatrixD --- lib/FFT.h | 18 ++---------- lib/tensors/Tensor_unary.h | 1 - tests/core/Test_fft.cc | 57 ++++++++++++++++++++++++++------------ 3 files changed, 42 insertions(+), 34 deletions(-) diff --git a/lib/FFT.h b/lib/FFT.h index 262e525b..f9470472 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -93,23 +93,20 @@ namespace Grid { Lattice plresult(&pencil_l); { - assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD)); assert(sizeof(fftw_complex)==sizeof(ComplexD)); assert(sizeof(fftw_complex)==sizeof(ComplexD)); int Ncomp = sizeof(sobj)/sizeof(fftw_complex); - std::cout << "Ncomp = "<lSites();idx++) { std::vector lcoor(Nd); @@ -144,8 +139,6 @@ namespace Grid { ssource = Cshift(ssource,dim,L); } - - std::cout << " pgsource pencil " << pgsource<lSites();idx++) { @@ -165,11 +158,6 @@ namespace Grid { pokeLocalSite(s,plsource,pcoor); } - - if ( idx==0) { - std::cout << " plsource pencil " << pgsource< inline auto func(const iScalar &z) -> iScalar\ {\ iScalar ret;\ ret._internal = func( (z._internal));\ - std::cout << "Unary "<<#func<<" " << z._internal <<" -> "<< ret._internal <<" "<< typeid(obj).name() < inline auto func(const iVector &z) -> iVector\ diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index deed6f0a..ed57800c 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -39,6 +39,10 @@ int main (int argc, char ** argv) std::vector simd_layout( { vComplexD::Nsimd(),1,1,1}); std::vector mpi_layout = GridDefaultMpi(); + int vol = 1; + for(int d=0;d p({1.0,2.0,3.0,2.0}); + std::vector p({1,2,3,2}); one = ComplexD(1.0,0.0); zz = ComplexD(0.0,0.0); @@ -58,30 +65,44 @@ int main (int argc, char ** argv) for(int mu=0;mu<4;mu++){ RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; LatticeCoordinate(coor,mu); - C = C - TwoPiL * p[mu] * coor; + C = C - (TwoPiL * p[mu]) * coor; } - std::cout << GridLogMessage<< " C " << C< Date: Mon, 22 Aug 2016 16:21:01 +0100 Subject: [PATCH 15/29] Adding a test for libfftw3 --- configure.ac | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 89bb3568..0ed501f9 100644 --- a/configure.ac +++ b/configure.ac @@ -43,6 +43,7 @@ AC_ARG_WITH([mpfr], AC_ARG_ENABLE([lapack], [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no]) + case ${ac_LAPACK} in no) ;; @@ -54,6 +55,17 @@ case ${ac_LAPACK} in AC_DEFINE([USE_LAPACK],[1],[use LAPACK]) esac +AC_CHECK_LIB([fftw3],[fft_init_threads], + [AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes], + [ac_fftw=no]) +case ${ac_fftw} in + no) + ;; + yes) + AM_LDFLAGS="-lfftw3 $AM_LDFLAGS" +esac + + ################ Get compiler informations AC_LANG([C++]) AX_CXX_COMPILE_STDCXX_11([noext],[mandatory]) @@ -77,7 +89,7 @@ AC_CHECK_LIB([gmp],[__gmpf_init], [have_mpfr=true] [LIBS="$LIBS -lmpfr"], [AC_MSG_ERROR([MPFR library not found])])] - [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])] + [AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])] [have_gmp=true] [LIBS="$LIBS -lgmp"], [AC_MSG_WARN([**** GMP library not found, Grid can still compile but RHMC will not work ****])]) From 73ce476890b303d7f452dc33e91dc93f212167f7 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 22 Aug 2016 16:24:21 +0100 Subject: [PATCH 16/29] Include fftw headers --- lib/fftw/fftw3.h | 412 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 412 insertions(+) create mode 100644 lib/fftw/fftw3.h diff --git a/lib/fftw/fftw3.h b/lib/fftw/fftw3.h new file mode 100644 index 00000000..1bf34396 --- /dev/null +++ b/lib/fftw/fftw3.h @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2003, 2007-14 Matteo Frigo + * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology + * + * The following statement of license applies *only* to this header file, + * and *not* to the other files distributed with FFTW or derived therefrom: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/***************************** NOTE TO USERS ********************************* + * + * THIS IS A HEADER FILE, NOT A MANUAL + * + * If you want to know how to use FFTW, please read the manual, + * online at http://www.fftw.org/doc/ and also included with FFTW. + * For a quick start, see the manual's tutorial section. + * + * (Reading header files to learn how to use a library is a habit + * stemming from code lacking a proper manual. Arguably, it's a + * *bad* habit in most cases, because header files can contain + * interfaces that are not part of the public, stable API.) + * + ****************************************************************************/ + +#ifndef FFTW3_H +#define FFTW3_H + +#include + +#ifdef __cplusplus +extern "C" +{ +#endif /* __cplusplus */ + +/* If is included, use the C99 complex type. Otherwise + define a type bit-compatible with C99 complex */ +#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I) +# define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C +#else +# define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2] +#endif + +#define FFTW_CONCAT(prefix, name) prefix ## name +#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name) +#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name) +#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name) +#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name) + +/* IMPORTANT: for Windows compilers, you should add a line + #define FFTW_DLL + here and in kernel/ifftw.h if you are compiling/using FFTW as a + DLL, in order to do the proper importing/exporting, or + alternatively compile with -DFFTW_DLL or the equivalent + command-line flag. This is not necessary under MinGW/Cygwin, where + libtool does the imports/exports automatically. */ +#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__)) + /* annoying Windows syntax for shared-library declarations */ +# if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */ +# define FFTW_EXTERN extern __declspec(dllexport) +# else /* user is calling FFTW; import symbol */ +# define FFTW_EXTERN extern __declspec(dllimport) +# endif +#else +# define FFTW_EXTERN extern +#endif + +enum fftw_r2r_kind_do_not_use_me { + FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2, + FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6, + FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10 +}; + +struct fftw_iodim_do_not_use_me { + int n; /* dimension size */ + int is; /* input stride */ + int os; /* output stride */ +}; + +#include /* for ptrdiff_t */ +struct fftw_iodim64_do_not_use_me { + ptrdiff_t n; /* dimension size */ + ptrdiff_t is; /* input stride */ + ptrdiff_t os; /* output stride */ +}; + +typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *); +typedef int (*fftw_read_char_func_do_not_use_me)(void *); + +/* + huge second-order macro that defines prototypes for all API + functions. We expand this macro for each supported precision + + X: name-mangling macro + R: real data type + C: complex data type +*/ + +#define FFTW_DEFINE_API(X, R, C) \ + \ +FFTW_DEFINE_COMPLEX(R, C); \ + \ +typedef struct X(plan_s) *X(plan); \ + \ +typedef struct fftw_iodim_do_not_use_me X(iodim); \ +typedef struct fftw_iodim64_do_not_use_me X(iodim64); \ + \ +typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind); \ + \ +typedef fftw_write_char_func_do_not_use_me X(write_char_func); \ +typedef fftw_read_char_func_do_not_use_me X(read_char_func); \ + \ +FFTW_EXTERN void X(execute)(const X(plan) p); \ + \ +FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n, \ + C *in, C *out, int sign, unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign, \ + unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1, \ + C *in, C *out, int sign, unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2, \ + C *in, C *out, int sign, unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n, \ + int howmany, \ + C *in, const int *inembed, \ + int istride, int idist, \ + C *out, const int *onembed, \ + int ostride, int odist, \ + int sign, unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims, \ + int howmany_rank, \ + const X(iodim) *howmany_dims, \ + C *in, C *out, \ + int sign, unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \ + int howmany_rank, \ + const X(iodim) *howmany_dims, \ + R *ri, R *ii, R *ro, R *io, \ + unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank, \ + const X(iodim64) *dims, \ + int howmany_rank, \ + const X(iodim64) *howmany_dims, \ + C *in, C *out, \ + int sign, unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank, \ + const X(iodim64) *dims, \ + int howmany_rank, \ + const X(iodim64) *howmany_dims, \ + R *ri, R *ii, R *ro, R *io, \ + unsigned flags); \ + \ +FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out); \ +FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii, \ + R *ro, R *io); \ + \ +FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n, \ + int howmany, \ + R *in, const int *inembed, \ + int istride, int idist, \ + C *out, const int *onembed, \ + int ostride, int odist, \ + unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n, \ + R *in, C *out, unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1, \ + R *in, C *out, unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1, \ + int n2, \ + R *in, C *out, unsigned flags); \ + \ + \ +FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n, \ + int howmany, \ + C *in, const int *inembed, \ + int istride, int idist, \ + R *out, const int *onembed, \ + int ostride, int odist, \ + unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n, \ + C *in, R *out, unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1, \ + C *in, R *out, unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1, \ + int n2, \ + C *in, R *out, unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims, \ + int howmany_rank, \ + const X(iodim) *howmany_dims, \ + R *in, C *out, \ + unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims, \ + int howmany_rank, \ + const X(iodim) *howmany_dims, \ + C *in, R *out, \ + unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)( \ + int rank, const X(iodim) *dims, \ + int howmany_rank, \ + const X(iodim) *howmany_dims, \ + R *in, R *ro, R *io, \ + unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)( \ + int rank, const X(iodim) *dims, \ + int howmany_rank, \ + const X(iodim) *howmany_dims, \ + R *ri, R *ii, R *out, \ + unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank, \ + const X(iodim64) *dims, \ + int howmany_rank, \ + const X(iodim64) *howmany_dims, \ + R *in, C *out, \ + unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank, \ + const X(iodim64) *dims, \ + int howmany_rank, \ + const X(iodim64) *howmany_dims, \ + C *in, R *out, \ + unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)( \ + int rank, const X(iodim64) *dims, \ + int howmany_rank, \ + const X(iodim64) *howmany_dims, \ + R *in, R *ro, R *io, \ + unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)( \ + int rank, const X(iodim64) *dims, \ + int howmany_rank, \ + const X(iodim64) *howmany_dims, \ + R *ri, R *ii, R *out, \ + unsigned flags); \ + \ +FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out); \ +FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out); \ + \ +FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p, \ + R *in, R *ro, R *io); \ +FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p, \ + R *ri, R *ii, R *out); \ + \ +FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n, \ + int howmany, \ + R *in, const int *inembed, \ + int istride, int idist, \ + R *out, const int *onembed, \ + int ostride, int odist, \ + const X(r2r_kind) *kind, unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out, \ + const X(r2r_kind) *kind, unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out, \ + X(r2r_kind) kind, unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out, \ + X(r2r_kind) kind0, X(r2r_kind) kind1, \ + unsigned flags); \ +FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2, \ + R *in, R *out, X(r2r_kind) kind0, \ + X(r2r_kind) kind1, X(r2r_kind) kind2, \ + unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims, \ + int howmany_rank, \ + const X(iodim) *howmany_dims, \ + R *in, R *out, \ + const X(r2r_kind) *kind, unsigned flags); \ + \ +FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims, \ + int howmany_rank, \ + const X(iodim64) *howmany_dims, \ + R *in, R *out, \ + const X(r2r_kind) *kind, unsigned flags); \ + \ +FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out); \ + \ +FFTW_EXTERN void X(destroy_plan)(X(plan) p); \ +FFTW_EXTERN void X(forget_wisdom)(void); \ +FFTW_EXTERN void X(cleanup)(void); \ + \ +FFTW_EXTERN void X(set_timelimit)(double t); \ + \ +FFTW_EXTERN void X(plan_with_nthreads)(int nthreads); \ +FFTW_EXTERN int X(init_threads)(void); \ +FFTW_EXTERN void X(cleanup_threads)(void); \ + \ +FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename); \ +FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file); \ +FFTW_EXTERN char *X(export_wisdom_to_string)(void); \ +FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char, \ + void *data); \ +FFTW_EXTERN int X(import_system_wisdom)(void); \ +FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename); \ +FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file); \ +FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string); \ +FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \ + \ +FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file); \ +FFTW_EXTERN void X(print_plan)(const X(plan) p); \ +FFTW_EXTERN char *X(sprint_plan)(const X(plan) p); \ + \ +FFTW_EXTERN void *X(malloc)(size_t n); \ +FFTW_EXTERN R *X(alloc_real)(size_t n); \ +FFTW_EXTERN C *X(alloc_complex)(size_t n); \ +FFTW_EXTERN void X(free)(void *p); \ + \ +FFTW_EXTERN void X(flops)(const X(plan) p, \ + double *add, double *mul, double *fmas); \ +FFTW_EXTERN double X(estimate_cost)(const X(plan) p); \ +FFTW_EXTERN double X(cost)(const X(plan) p); \ + \ +FFTW_EXTERN int X(alignment_of)(R *p); \ +FFTW_EXTERN const char X(version)[]; \ +FFTW_EXTERN const char X(cc)[]; \ +FFTW_EXTERN const char X(codelet_optim)[]; + + +/* end of FFTW_DEFINE_API macro */ + +FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex) +FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex) +FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex) + +/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64 + for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */ +#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \ + && !(defined(__ICC) || defined(__INTEL_COMPILER)) \ + && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__)) +# if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I) +/* note: __float128 is a typedef, which is not supported with the _Complex + keyword in gcc, so instead we use this ugly __attribute__ version. + However, we can't simply pass the __attribute__ version to + FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer + types. Hence redefining FFTW_DEFINE_COMPLEX. Ugh. */ +# undef FFTW_DEFINE_COMPLEX +# define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C +# endif +FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex) +#endif + +#define FFTW_FORWARD (-1) +#define FFTW_BACKWARD (+1) + +#define FFTW_NO_TIMELIMIT (-1.0) + +/* documented flags */ +#define FFTW_MEASURE (0U) +#define FFTW_DESTROY_INPUT (1U << 0) +#define FFTW_UNALIGNED (1U << 1) +#define FFTW_CONSERVE_MEMORY (1U << 2) +#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */ +#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */ +#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */ +#define FFTW_ESTIMATE (1U << 6) +#define FFTW_WISDOM_ONLY (1U << 21) + +/* undocumented beyond-guru flags */ +#define FFTW_ESTIMATE_PATIENT (1U << 7) +#define FFTW_BELIEVE_PCOST (1U << 8) +#define FFTW_NO_DFT_R2HC (1U << 9) +#define FFTW_NO_NONTHREADED (1U << 10) +#define FFTW_NO_BUFFERING (1U << 11) +#define FFTW_NO_INDIRECT_OP (1U << 12) +#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */ +#define FFTW_NO_RANK_SPLITS (1U << 14) +#define FFTW_NO_VRANK_SPLITS (1U << 15) +#define FFTW_NO_VRECURSE (1U << 16) +#define FFTW_NO_SIMD (1U << 17) +#define FFTW_NO_SLOW (1U << 18) +#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19) +#define FFTW_ALLOW_PRUNING (1U << 20) + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* FFTW3_H */ From 356e7940fda0dc48f8a6d21e0138ca6e116699d2 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 22 Aug 2016 16:24:49 +0100 Subject: [PATCH 17/29] fftw can be switched off --- lib/FFT.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/FFT.h b/lib/FFT.h index f9470472..cf012e79 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -113,12 +113,17 @@ namespace Grid { int sign = FFTW_FORWARD; if (inverse) sign = FFTW_BACKWARD; +#ifdef HAVE_FFTW fftw_plan p = fftw_plan_many_dft(rank,n,howmany, in,inembed, istride,idist, out,onembed, ostride, odist, sign,FFTW_ESTIMATE); +#else + fftw_plan p ; + assert(0); +#endif // Barrel shift and collect global pencil for(int p=0;p Date: Tue, 23 Aug 2016 14:41:44 +0100 Subject: [PATCH 18/29] first commit for QPX intrinsics --- configure.ac | 5 +- lib/simd/Grid_qpx.h | 376 +++++++++++++++++++++++--------------------- 2 files changed, 198 insertions(+), 183 deletions(-) diff --git a/configure.ac b/configure.ac index b14f9748..9c56971e 100644 --- a/configure.ac +++ b/configure.ac @@ -125,11 +125,14 @@ case ${ax_cv_cxx_compiler_vendor} in AC_DEFINE([AVX512],[1],[AVX512 intrinsics]) SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';; IMCI|KNC) - AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner]) + AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner]) SIMD_FLAGS='';; GEN) AC_DEFINE([GENERIC_VEC],[1],[generic vector code]) SIMD_FLAGS='';; + QPX|BGQ) + AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) + SIMD_FLAGS='';; *) AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);; esac;; diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 10ac3841..6ac83290 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -1,300 +1,312 @@ - /************************************************************************************* - - Grid physics library, www.github.com/paboyle/Grid - - Source file: ./lib/simd/Grid_qpx.h - - Copyright (C) 2015 - -Author: neo - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ -//---------------------------------------------------------------------- -/*! @file Grid_qpx.h - @brief Optimization libraries for QPX instructions set for BG/Q - - Using intrinsics -*/ -// Time-stamp: <2015-05-27 11:30:21 neo> -//---------------------------------------------------------------------- - -// lot of undefined functions +/******************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./lib/simd/Grid_qpx.h + + Copyright (C) 2016 + + Author: Antonin Portelli + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + ******************************************************************************/ +namespace Grid { namespace Optimization { + inline std::ostream & operator<<(std::ostream& stream, const vector4double a) + { + stream << "{"< - struct Reduce{ - //Need templated class to overload output type - //General form must generate error if compiled - inline Out_type operator()(In_type in){ - printf("Error, using wrong Reduce function\n"); - exit(1); - return 0; - } - }; - - - - + struct Reduce{ + //Need templated class to overload output type + //General form must generate error if compiled + inline Out_type operator()(In_type in){ + printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } + }; + ///////////////////////////////////////////////////// // Arithmetic operations ///////////////////////////////////////////////////// struct Sum{ - //Complex/Real float - inline float operator()(float a, float b){ -#error - } //Complex/Real double inline vector4double operator()(vector4double a, vector4double b){ - return vec_add(a,b); + return vec_add(a, b); } //Integer inline int operator()(int a, int b){ -#error + return a + b; } }; - + struct Sub{ - //Complex/Real float - inline float operator()(float a, float b){ -#error - } //Complex/Real double inline vector4double operator()(vector4double a, vector4double b){ -#error + return vec_sub(a, b); } //Integer - inline floati operator()(int a, int b){ -#error + inline int operator()(int a, int b){ + return a - b; } }; - - + struct MultComplex{ - // Complex float - inline float operator()(float a, float b){ -#error - } // Complex double inline vector4double operator()(vector4double a, vector4double b){ -#error + return vec_xxnpmadd(a, b, vec_xmul(b, a)); } }; - + struct Mult{ - // Real float - inline float operator()(float a, float b){ -#error - } // Real double inline vector4double operator()(vector4double a, vector4double b){ -#error + return vec_mul(a, b); } // Integer inline int operator()(int a, int b){ -#error + return a*b; } }; - - + struct Conj{ - // Complex single - inline float operator()(float in){ - assert(0); - } // Complex double - inline vector4double operator()(vector4double in){ - assert(0); + inline vector4double operator()(vector4double v){ + return vec_mul(v, (vector4double){1., -1., 1., -1.}); } - // do not define for integer input }; - + struct TimesMinusI{ - //Complex single - inline float operator()(float in, float ret){ - assert(0); - } //Complex double - inline vector4double operator()(vector4double in, vector4double ret){ - assert(0); + inline vector4double operator()(vector4double v, vector4double ret){ + return vec_xxcpnmadd(v, (vector4double){1., 1., 1., 1.}, + (vector4double){0., 0., 0., 0.}); } - - }; - + struct TimesI{ - //Complex single - inline float operator()(float in, float ret){ - - } //Complex double - inline vector4double operator()(vector4double in, vector4double ret){ - + inline vector4double operator()(vector4double v, vector4double ret){ + return vec_xxcpnmadd(v, (vector4double){-1., -1., -1., -1.}, + (vector4double){0., 0., 0., 0.}); } - - }; - + struct Permute{ + static inline vector4double Permute0(vector4double v){ //0123 -> 2301 + return vec_perm(v, v, vec_gpci(02301)); + }; + static inline vector4double Permute1(vector4double v){ //0123 -> 1032 + return vec_perm(v, v, vec_gpci(01032)); + }; + static inline vector4double Permute2(vector4double v){ + return v; + }; + static inline vector4double Permute3(vector4double v){ + return v; + }; + }; - - - ////////////////////////////////////////////// - // Some Template specialization + struct Rotate{ + static inline vector4double rotate(vector4double v, int n){ + switch(n){ + case 0: + return v; + break; + case 1: + return vec_perm(v, v, vec_gpci(01230)); + break; + case 2: + return vec_perm(v, v, vec_gpci(02301)); + break; + case 3: + return vec_perm(v, v, vec_gpci(03012)); + break; + default: assert(0); + } + } + }; //Complex float Reduce template<> - inline Grid::ComplexF Reduce::operator()(float in){ - assert(0); + inline Grid::ComplexF + Reduce::operator()(vector4double v) { //2 complex + vector4double v1,v2; + + v1 = Optimization::Permute::Permute0(v); + v1 = vec_add(v1, v); + + return Grid::ComplexF((float)vec_extract(v1, 0), (float)vec_extract(v1, 1)); } //Real float Reduce template<> - inline Grid::RealF Reduce::operator()(float in){ - assert(0); + inline Grid::RealF + Reduce::operator()(vector4double v){ //4 floats + vector4double v1,v2; + + v1 = Optimization::Permute::Permute0(v); + v1 = vec_add(v1, v); + v2 = Optimization::Permute::Permute1(v1); + v1 = vec_add(v1, v2); + + return (float)vec_extract(v1, 0); } //Complex double Reduce template<> - inline Grid::ComplexD Reduce::operator()(vector4double in){ - assert(0); + inline Grid::ComplexD + Reduce::operator()(vector4double v){ //2 complex + vector4double v1; + + v1 = Optimization::Permute::Permute0(v); + v1 = vec_add(v1, v); + + return Grid::ComplexD(vec_extract(v1, 0), vec_extract(v1, 1)); } //Real double Reduce template<> - inline Grid::RealD Reduce::operator()(vector4double in){ - assert(0); - } + inline Grid::RealD + Reduce::operator()(vector4double v){ //4 doubles + vector4double v1,v2; + + v1 = Optimization::Permute::Permute0(v); + v1 = vec_add(v1, v); + v2 = Optimization::Permute::Permute1(v1); + v1 = vec_add(v1, v2); + return vec_extract(v1, 0); + } + //Integer Reduce template<> - inline Integer Reduce::operator()(float in){ + inline Integer Reduce::operator()(int in){ + // FIXME unimplemented + printf("Reduce : Missing integer implementation -> FIX\n"); assert(0); } - - } -////////////////////////////////////////////////////////////////////////////////////// -// Here assign types -namespace Grid { - typedef float SIMD_Ftype __attribute__ ((vector_size (16))); // Single precision type - typedef vector4double SIMD_Dtype; // Double precision type - typedef int SIMD_Itype; // Integer type +//////////////////////////////////////////////////////////////////////////////// +// Here assign types - inline void v_prefetch0(int size, const char *ptr){}; +typedef vector4double SIMD_Ftype; // Single precision type +typedef vector4double SIMD_Dtype; // Double precision type +typedef int SIMD_Itype; // Integer type - // Function name aliases - typedef Optimization::Vsplat VsplatSIMD; - typedef Optimization::Vstore VstoreSIMD; - typedef Optimization::Vset VsetSIMD; - typedef Optimization::Vstream VstreamSIMD; - template using ReduceSIMD = Optimization::Reduce; +// prefetch utilities +inline void v_prefetch0(int size, const char *ptr){}; +inline void prefetch_HINT_T0(const char *ptr){}; - // Arithmetic operations - typedef Optimization::Sum SumSIMD; - typedef Optimization::Sub SubSIMD; - typedef Optimization::Mult MultSIMD; - typedef Optimization::MultComplex MultComplexSIMD; - typedef Optimization::Conj ConjSIMD; - typedef Optimization::TimesMinusI TimesMinusISIMD; - typedef Optimization::TimesI TimesISIMD; +// Function name aliases +typedef Optimization::Vsplat VsplatSIMD; +typedef Optimization::Vstore VstoreSIMD; +typedef Optimization::Vset VsetSIMD; +typedef Optimization::Vstream VstreamSIMD; +template using ReduceSIMD = Optimization::Reduce; +// Arithmetic operations +typedef Optimization::Sum SumSIMD; +typedef Optimization::Sub SubSIMD; +typedef Optimization::Mult MultSIMD; +typedef Optimization::MultComplex MultComplexSIMD; +typedef Optimization::Conj ConjSIMD; +typedef Optimization::TimesMinusI TimesMinusISIMD; +typedef Optimization::TimesI TimesISIMD; + } From ff6da364e807b965ba49df55fd5dba58723d0a51 Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 24 Aug 2016 15:05:00 +0100 Subject: [PATCH 19/29] FFT double and single precision gives good performance now in multithreaded code. --- configure.ac | 47 +++-- lib/FFT.h | 199 +++++++++++++------ lib/Init.cc | 2 +- lib/fftw/fftw3.h | 412 ---------------------------------------- tests/core/Test_fft.cc | 19 +- tests/core/Test_fftf.cc | 111 +++++++++++ 6 files changed, 298 insertions(+), 492 deletions(-) delete mode 100644 lib/fftw/fftw3.h create mode 100644 tests/core/Test_fftf.cc diff --git a/configure.ac b/configure.ac index b14f9748..98489162 100644 --- a/configure.ac +++ b/configure.ac @@ -10,8 +10,19 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AC_LANG(C++) CXXFLAGS="-O3 $CXXFLAGS" AC_PROG_CXX + +############ openmp ############### AC_OPENMP + +ac_openmp=no + +if test "${OPENMP_CXXFLAGS}X" != "X"; then +ac_openmp=yes AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS" +AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS" +fi + +############ libtool ############### LT_INIT ############### Checks for header files @@ -29,7 +40,7 @@ AC_TYPE_SIZE_T AC_TYPE_UINT32_T AC_TYPE_UINT64_T -############### Options +############### GMP and MPFR ################# AC_ARG_WITH([gmp], [AS_HELP_STRING([--with-gmp=prefix], [try this for a non-standard install prefix of the GMP library])], @@ -40,6 +51,8 @@ AC_ARG_WITH([mpfr], [try this for a non-standard install prefix of the MPFR library])], [AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"] [AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"]) + +################## lapack #################### AC_ARG_ENABLE([lapack], [AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])], [ac_LAPACK=${enable_lapack}],[ac_LAPACK=no]) @@ -55,14 +68,27 @@ case ${ac_LAPACK} in AC_DEFINE([USE_LAPACK],[1],[use LAPACK]) esac -AC_CHECK_LIB([fftw3],[fft_init_threads], +################## FFTW3 #################### +AC_ARG_WITH([fftw], + [AS_HELP_STRING([--with-fftw=prefix], + [try this for a non-standard install prefix of the FFTW3 library])], + [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"] + [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"]) + +# +# What about the MKL library replacement for fftw3 ? How do we know if fftw_execute +# can be found in MKL? +# +AC_CHECK_LIB([fftw3],[fftw_execute], [AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes], [ac_fftw=no]) + case ${ac_fftw} in no) + echo WARNING libfftw3 not found FFT routines will not work ;; yes) - AM_LDFLAGS="-lfftw3 $AM_LDFLAGS" + AM_LDFLAGS="$AM_LDFLAGS -lfftw3 -lfftw3f" esac @@ -304,19 +330,18 @@ Summary of configuration for $PACKAGE v$VERSION - compiler version : ${ax_cv_gxx_version} ----- BUILD OPTIONS ----------------------------------- - SIMD : ${ac_SIMD} -- communications type : ${ac_COMMS} -- default precision : ${ac_PRECISION} +- Threading : ${ac_openmp} +- Communications type : ${ac_COMMS} +- Default precision : ${ac_PRECISION} - RNG choice : ${ac_RNG} - GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi` - LAPACK : ${ac_LAPACK} +- FFTW : ${ac_fftw} - build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi` - graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi` ----- BUILD FLAGS ------------------------------------- -- CXXFLAGS: -`echo ${AM_CXXFLAGS} ${CXXFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'` -- LDFLAGS: -`echo ${AM_LDFLAGS} ${LDFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'` -- LIBS: -`echo ${LIBS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'` +- CXXFLAGS: "${AM_CXXFLAGS} ${CXXFLAGS}" +- LDFLAGS: "${AM_LDFLAGS} ${LDFLAGS}" +- LIBS: "${LIBS} " ------------------------------------------------------- " diff --git a/lib/FFT.h b/lib/FFT.h index cf012e79..17c8a309 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -1,3 +1,4 @@ + /************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -28,11 +29,70 @@ Author: Peter Boyle #ifndef _GRID_FFT_H_ #define _GRID_FFT_H_ -#include - +#ifdef HAVE_FFTW +#include +#endif namespace Grid { - + template struct FFTW { }; + +#ifdef HAVE_FFTW + template<> struct FFTW { + public: + + typedef fftw_complex FFTW_scalar; + typedef fftw_plan FFTW_plan; + + static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, + FFTW_scalar *in, const int *inembed, + int istride, int idist, + FFTW_scalar *out, const int *onembed, + int ostride, int odist, + int sign, unsigned flags) { + return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); + } + + static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ + ::fftw_flops(p,add,mul,fmas); + } + + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { + ::fftw_execute_dft(p,in,out); + } + inline static void fftw_destroy_plan(const FFTW_plan p) { + ::fftw_destroy_plan(p); + } + }; + + template<> struct FFTW { + public: + + typedef fftwf_complex FFTW_scalar; + typedef fftwf_plan FFTW_plan; + + static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany, + FFTW_scalar *in, const int *inembed, + int istride, int idist, + FFTW_scalar *out, const int *onembed, + int ostride, int odist, + int sign, unsigned flags) { + return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags); + } + + static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){ + ::fftwf_flops(p,add,mul,fmas); + } + + inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) { + ::fftwf_execute_dft(p,in,out); + } + inline static void fftw_destroy_plan(const FFTW_plan p) { + ::fftwf_destroy_plan(p); + } + }; + +#endif + class FFT { private: @@ -40,6 +100,10 @@ namespace Grid { GridCartesian *sgrid; int Nd; + double flops; + double flops_call; + uint64_t usec; + std::vector dimensions; std::vector processors; std::vector processor_coor; @@ -49,6 +113,9 @@ namespace Grid { static const int forward=FFTW_FORWARD; static const int backward=FFTW_BACKWARD; + double Flops(void) {return flops;} + double MFlops(void) {return flops/usec;} + FFT ( GridCartesian * grid ) : vgrid(grid), Nd(grid->_ndimension), @@ -56,6 +123,8 @@ namespace Grid { processors(grid->_processors), processor_coor(grid->_processor_coor) { + flops=0; + usec =0; std::vector layout(Nd,1); sgrid = new GridCartesian(dimensions,layout,processors); }; @@ -75,55 +144,62 @@ namespace Grid { std::vector layout(Nd,1); std::vector pencil_gd(vgrid->_fdimensions); - std::vector pencil_ld(processors); pencil_gd[dim] = G*processors[dim]; - pencil_ld[dim] = G*processors[dim]; // Pencil global vol LxLxGxLxL per node GridCartesian pencil_g(pencil_gd,layout,processors); - GridCartesian pencil_l(pencil_ld,layout,processors); // Construct pencils typedef typename vobj::scalar_object sobj; + typedef typename sobj::scalar_type scalar; + Lattice ssource(vgrid); ssource =source; Lattice pgsource(&pencil_g); - Lattice pgresult(&pencil_g); - Lattice plsource(&pencil_l); - Lattice plresult(&pencil_l); + Lattice pgresult(&pencil_g); pgresult=zero; + +#ifndef HAVE_FFTW + assert(0); +#else + typedef typename FFTW::FFTW_scalar FFTW_scalar; + typedef typename FFTW::FFTW_plan FFTW_plan; { - assert(sizeof(typename sobj::scalar_type)==sizeof(ComplexD)); - assert(sizeof(fftw_complex)==sizeof(ComplexD)); - assert(sizeof(fftw_complex)==sizeof(ComplexD)); + int Ncomp = sizeof(sobj)/sizeof(scalar); + int Nlow = 1; + for(int d=0;d_ldimensions[d]; + } - int Ncomp = sizeof(sobj)/sizeof(fftw_complex); - - int rank = 1; /* not 2: we are computing 1d transforms */ + int rank = 1; /* 1d transforms */ int n[] = {G}; /* 1d transforms of length G */ int howmany = Ncomp; int odist,idist,istride,ostride; - idist = odist = 1; - istride = ostride = Ncomp; /* distance between two elements in the same column */ + idist = odist = 1; /* Distance between consecutive FT's */ + istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */ int *inembed = n, *onembed = n; - fftw_complex *in = (fftw_complex *)&plsource._odata[0]; - fftw_complex *out= (fftw_complex *)&plresult._odata[0]; int sign = FFTW_FORWARD; if (inverse) sign = FFTW_BACKWARD; -#ifdef HAVE_FFTW - fftw_plan p = fftw_plan_many_dft(rank,n,howmany, - in,inembed, - istride,idist, - out,onembed, - ostride, odist, - sign,FFTW_ESTIMATE); -#else - fftw_plan p ; - assert(0); -#endif + FFTW_plan p; + { + FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0]; + FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0]; + p = FFTW::fftw_plan_many_dft(rank,n,howmany, + in,inembed, + istride,idist, + out,onembed, + ostride, odist, + sign,FFTW_ESTIMATE); + } + + double add,mul,fma; + FFTW::fftw_flops(p,&add,&mul,&fma); + flops_call = add+mul+2.0*fma; + std::cout << "FFT Flops per call "<lSites();idx++) { + int NN=pencil_g.lSites(); + + GridStopWatch Timer; + Timer.Start(); + +PARALLEL_FOR_LOOP + for(int idx=0;idx pcoor(Nd,0); std::vector lcoor(Nd); - sgrid->LocalIndexToLocalCoor(idx,lcoor); + pencil_g.LocalIndexToLocalCoor(idx,lcoor); if ( lcoor[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0 - - // Project to local pencil array - for(int l=0;l::fftw_execute_dft(p,in,out); } } - - fftw_destroy_plan(p); + + Timer.Stop(); + usec += Timer.useconds(); + flops+= flops_call*NN; + + + int pc = processor_coor[dim]; + for(int idx=0;idxlSites();idx++) { + std::vector lcoor(Nd); + sgrid->LocalIndexToLocalCoor(idx,lcoor); + std::vector gcoor = lcoor; + // extract the result + sobj s; + gcoor[dim] = lcoor[dim]+L*pc; + peekLocalSite(s,pgresult,gcoor); + pokeLocalSite(s,result,lcoor); + } + + FFTW::fftw_destroy_plan(p); } +#endif + + } }; diff --git a/lib/Init.cc b/lib/Init.cc index 6543435c..34e503a4 100644 --- a/lib/Init.cc +++ b/lib/Init.cc @@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc, assert(ompthreads.size()==1); GridThread::SetThreads(ompthreads[0]); } + if( GridCmdOptionExists(argv,argv+argc,"--cores") ){ std::vector cores(0); arg= GridCmdOptionPayload(argv,argv+argc,"--cores"); @@ -203,7 +204,6 @@ void Grid_init(int *argc,char ***argv) GridLogConfigure(logstreams); } - if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ Grid_debug_handler_init(); } diff --git a/lib/fftw/fftw3.h b/lib/fftw/fftw3.h deleted file mode 100644 index 1bf34396..00000000 --- a/lib/fftw/fftw3.h +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Copyright (c) 2003, 2007-14 Matteo Frigo - * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology - * - * The following statement of license applies *only* to this header file, - * and *not* to the other files distributed with FFTW or derived therefrom: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS - * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE - * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/***************************** NOTE TO USERS ********************************* - * - * THIS IS A HEADER FILE, NOT A MANUAL - * - * If you want to know how to use FFTW, please read the manual, - * online at http://www.fftw.org/doc/ and also included with FFTW. - * For a quick start, see the manual's tutorial section. - * - * (Reading header files to learn how to use a library is a habit - * stemming from code lacking a proper manual. Arguably, it's a - * *bad* habit in most cases, because header files can contain - * interfaces that are not part of the public, stable API.) - * - ****************************************************************************/ - -#ifndef FFTW3_H -#define FFTW3_H - -#include - -#ifdef __cplusplus -extern "C" -{ -#endif /* __cplusplus */ - -/* If is included, use the C99 complex type. Otherwise - define a type bit-compatible with C99 complex */ -#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I) -# define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C -#else -# define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2] -#endif - -#define FFTW_CONCAT(prefix, name) prefix ## name -#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name) -#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name) -#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name) -#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name) - -/* IMPORTANT: for Windows compilers, you should add a line - #define FFTW_DLL - here and in kernel/ifftw.h if you are compiling/using FFTW as a - DLL, in order to do the proper importing/exporting, or - alternatively compile with -DFFTW_DLL or the equivalent - command-line flag. This is not necessary under MinGW/Cygwin, where - libtool does the imports/exports automatically. */ -#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__)) - /* annoying Windows syntax for shared-library declarations */ -# if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */ -# define FFTW_EXTERN extern __declspec(dllexport) -# else /* user is calling FFTW; import symbol */ -# define FFTW_EXTERN extern __declspec(dllimport) -# endif -#else -# define FFTW_EXTERN extern -#endif - -enum fftw_r2r_kind_do_not_use_me { - FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2, - FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6, - FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10 -}; - -struct fftw_iodim_do_not_use_me { - int n; /* dimension size */ - int is; /* input stride */ - int os; /* output stride */ -}; - -#include /* for ptrdiff_t */ -struct fftw_iodim64_do_not_use_me { - ptrdiff_t n; /* dimension size */ - ptrdiff_t is; /* input stride */ - ptrdiff_t os; /* output stride */ -}; - -typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *); -typedef int (*fftw_read_char_func_do_not_use_me)(void *); - -/* - huge second-order macro that defines prototypes for all API - functions. We expand this macro for each supported precision - - X: name-mangling macro - R: real data type - C: complex data type -*/ - -#define FFTW_DEFINE_API(X, R, C) \ - \ -FFTW_DEFINE_COMPLEX(R, C); \ - \ -typedef struct X(plan_s) *X(plan); \ - \ -typedef struct fftw_iodim_do_not_use_me X(iodim); \ -typedef struct fftw_iodim64_do_not_use_me X(iodim64); \ - \ -typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind); \ - \ -typedef fftw_write_char_func_do_not_use_me X(write_char_func); \ -typedef fftw_read_char_func_do_not_use_me X(read_char_func); \ - \ -FFTW_EXTERN void X(execute)(const X(plan) p); \ - \ -FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n, \ - C *in, C *out, int sign, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1, \ - C *in, C *out, int sign, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2, \ - C *in, C *out, int sign, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n, \ - int howmany, \ - C *in, const int *inembed, \ - int istride, int idist, \ - C *out, const int *onembed, \ - int ostride, int odist, \ - int sign, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - C *in, C *out, \ - int sign, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *ri, R *ii, R *ro, R *io, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank, \ - const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - C *in, C *out, \ - int sign, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank, \ - const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *ri, R *ii, R *ro, R *io, \ - unsigned flags); \ - \ -FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out); \ -FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii, \ - R *ro, R *io); \ - \ -FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n, \ - int howmany, \ - R *in, const int *inembed, \ - int istride, int idist, \ - C *out, const int *onembed, \ - int ostride, int odist, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n, \ - R *in, C *out, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1, \ - R *in, C *out, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1, \ - int n2, \ - R *in, C *out, unsigned flags); \ - \ - \ -FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n, \ - int howmany, \ - C *in, const int *inembed, \ - int istride, int idist, \ - R *out, const int *onembed, \ - int ostride, int odist, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n, \ - C *in, R *out, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1, \ - C *in, R *out, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1, \ - int n2, \ - C *in, R *out, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *in, C *out, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - C *in, R *out, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)( \ - int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *in, R *ro, R *io, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)( \ - int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *ri, R *ii, R *out, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank, \ - const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *in, C *out, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank, \ - const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - C *in, R *out, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)( \ - int rank, const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *in, R *ro, R *io, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)( \ - int rank, const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *ri, R *ii, R *out, \ - unsigned flags); \ - \ -FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out); \ -FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out); \ - \ -FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p, \ - R *in, R *ro, R *io); \ -FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p, \ - R *ri, R *ii, R *out); \ - \ -FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n, \ - int howmany, \ - R *in, const int *inembed, \ - int istride, int idist, \ - R *out, const int *onembed, \ - int ostride, int odist, \ - const X(r2r_kind) *kind, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out, \ - const X(r2r_kind) *kind, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out, \ - X(r2r_kind) kind, unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out, \ - X(r2r_kind) kind0, X(r2r_kind) kind1, \ - unsigned flags); \ -FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2, \ - R *in, R *out, X(r2r_kind) kind0, \ - X(r2r_kind) kind1, X(r2r_kind) kind2, \ - unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims, \ - int howmany_rank, \ - const X(iodim) *howmany_dims, \ - R *in, R *out, \ - const X(r2r_kind) *kind, unsigned flags); \ - \ -FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims, \ - int howmany_rank, \ - const X(iodim64) *howmany_dims, \ - R *in, R *out, \ - const X(r2r_kind) *kind, unsigned flags); \ - \ -FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out); \ - \ -FFTW_EXTERN void X(destroy_plan)(X(plan) p); \ -FFTW_EXTERN void X(forget_wisdom)(void); \ -FFTW_EXTERN void X(cleanup)(void); \ - \ -FFTW_EXTERN void X(set_timelimit)(double t); \ - \ -FFTW_EXTERN void X(plan_with_nthreads)(int nthreads); \ -FFTW_EXTERN int X(init_threads)(void); \ -FFTW_EXTERN void X(cleanup_threads)(void); \ - \ -FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename); \ -FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file); \ -FFTW_EXTERN char *X(export_wisdom_to_string)(void); \ -FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char, \ - void *data); \ -FFTW_EXTERN int X(import_system_wisdom)(void); \ -FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename); \ -FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file); \ -FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string); \ -FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \ - \ -FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file); \ -FFTW_EXTERN void X(print_plan)(const X(plan) p); \ -FFTW_EXTERN char *X(sprint_plan)(const X(plan) p); \ - \ -FFTW_EXTERN void *X(malloc)(size_t n); \ -FFTW_EXTERN R *X(alloc_real)(size_t n); \ -FFTW_EXTERN C *X(alloc_complex)(size_t n); \ -FFTW_EXTERN void X(free)(void *p); \ - \ -FFTW_EXTERN void X(flops)(const X(plan) p, \ - double *add, double *mul, double *fmas); \ -FFTW_EXTERN double X(estimate_cost)(const X(plan) p); \ -FFTW_EXTERN double X(cost)(const X(plan) p); \ - \ -FFTW_EXTERN int X(alignment_of)(R *p); \ -FFTW_EXTERN const char X(version)[]; \ -FFTW_EXTERN const char X(cc)[]; \ -FFTW_EXTERN const char X(codelet_optim)[]; - - -/* end of FFTW_DEFINE_API macro */ - -FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex) -FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex) -FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex) - -/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64 - for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */ -#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \ - && !(defined(__ICC) || defined(__INTEL_COMPILER)) \ - && (defined(__i386__) || defined(__x86_64__) || defined(__ia64__)) -# if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I) -/* note: __float128 is a typedef, which is not supported with the _Complex - keyword in gcc, so instead we use this ugly __attribute__ version. - However, we can't simply pass the __attribute__ version to - FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer - types. Hence redefining FFTW_DEFINE_COMPLEX. Ugh. */ -# undef FFTW_DEFINE_COMPLEX -# define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C -# endif -FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex) -#endif - -#define FFTW_FORWARD (-1) -#define FFTW_BACKWARD (+1) - -#define FFTW_NO_TIMELIMIT (-1.0) - -/* documented flags */ -#define FFTW_MEASURE (0U) -#define FFTW_DESTROY_INPUT (1U << 0) -#define FFTW_UNALIGNED (1U << 1) -#define FFTW_CONSERVE_MEMORY (1U << 2) -#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */ -#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */ -#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */ -#define FFTW_ESTIMATE (1U << 6) -#define FFTW_WISDOM_ONLY (1U << 21) - -/* undocumented beyond-guru flags */ -#define FFTW_ESTIMATE_PATIENT (1U << 7) -#define FFTW_BELIEVE_PCOST (1U << 8) -#define FFTW_NO_DFT_R2HC (1U << 9) -#define FFTW_NO_NONTHREADED (1U << 10) -#define FFTW_NO_BUFFERING (1U << 11) -#define FFTW_NO_INDIRECT_OP (1U << 12) -#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */ -#define FFTW_NO_RANK_SPLITS (1U << 14) -#define FFTW_NO_VRANK_SPLITS (1U << 15) -#define FFTW_NO_VRECURSE (1U << 16) -#define FFTW_NO_SIMD (1U << 17) -#define FFTW_NO_SLOW (1U << 18) -#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19) -#define FFTW_ALLOW_PRUNING (1U << 20) - -#ifdef __cplusplus -} /* extern "C" */ -#endif /* __cplusplus */ - -#endif /* FFTW3_H */ diff --git a/tests/core/Test_fft.cc b/tests/core/Test_fft.cc index ed57800c..2fdaeed2 100644 --- a/tests/core/Test_fft.cc +++ b/tests/core/Test_fft.cc @@ -35,6 +35,9 @@ int main (int argc, char ** argv) { Grid_init(&argc,&argv); + int threads = GridThread::GetThreads(); + std::cout< latt_size = GridDefaultLatt(); std::vector simd_layout( { vComplexD::Nsimd(),1,1,1}); std::vector mpi_layout = GridDefaultMpi(); @@ -75,10 +78,10 @@ int main (int argc, char ** argv) FFT theFFT(&Fine); - theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; - theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde; - theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde; - theFFT.FFT_dim(Ctilde,C,3,FFT::forward); + theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()< +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include + +using namespace Grid; +using namespace Grid::QCD; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int threads = GridThread::GetThreads(); + std::cout< latt_size = GridDefaultLatt(); + std::vector simd_layout( { vComplexF::Nsimd(),1,1,1}); + std::vector mpi_layout = GridDefaultMpi(); + + int vol = 1; + for(int d=0;d p({1,2,3,2}); + + one = ComplexF(1.0,0.0); + zz = ComplexF(0.0,0.0); + + ComplexF ci(0.0,1.0); + + C=zero; + for(int mu=0;mu<4;mu++){ + RealD TwoPiL = M_PI * 2.0/ latt_size[mu]; + LatticeCoordinate(coor,mu); + C = C - (TwoPiL * p[mu]) * coor; + } + + C = exp(C*ci); + + S=zero; + S = S+C; + + FFT theFFT(&Fine); + + theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()< Date: Wed, 24 Aug 2016 15:05:56 +0100 Subject: [PATCH 20/29] Printing --- lib/FFT.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/FFT.h b/lib/FFT.h index 17c8a309..b25e8198 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -198,7 +198,7 @@ namespace Grid { double add,mul,fma; FFTW::fftw_flops(p,&add,&mul,&fma); flops_call = add+mul+2.0*fma; - std::cout << "FFT Flops per call "< Date: Wed, 24 Aug 2016 16:38:36 +0100 Subject: [PATCH 21/29] tidy up --- lib/FFT.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/FFT.h b/lib/FFT.h index b25e8198..0c79c527 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -244,7 +244,6 @@ PARALLEL_FOR_LOOP usec += Timer.useconds(); flops+= flops_call*NN; - int pc = processor_coor[dim]; for(int idx=0;idxlSites();idx++) { std::vector lcoor(Nd); From 8c89391c02207bb9dbcbf198f8474a6d203aeb8b Mon Sep 17 00:00:00 2001 From: paboyle Date: Wed, 24 Aug 2016 16:41:47 +0100 Subject: [PATCH 22/29] FFTW unresolved fixed when no fftw3.h --- lib/FFT.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/FFT.h b/lib/FFT.h index 0c79c527..4cda6483 100644 --- a/lib/FFT.h +++ b/lib/FFT.h @@ -91,6 +91,11 @@ namespace Grid { } }; +#endif + +#ifndef FFTW_FORWARD +#define FFTW_FORWARD (-1) +#define FFTW_BACKWARD (+1) #endif class FFT { From aa20cc8b52e90bbe0bbc960e2ea8e49b7f0baa5b Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Fri, 9 Sep 2016 02:53:22 -0700 Subject: [PATCH 23/29] Fixing compilation error with AVX512 flag --- bootstrap.sh | 2 +- lib/qcd/action/fermion/WilsonKernels.h | 19 ++++++++++++- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 32 +++++++++++++++++++++- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/bootstrap.sh b/bootstrap.sh index 461eb121..f847b7ab 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -4,7 +4,7 @@ EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2' FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz echo "-- deploying Eigen source..." -wget ${EIGEN_URL} +wget ${EIGEN_URL} --no-check-certificate ./scripts/update_eigen.sh `basename ${EIGEN_URL}` rm `basename ${EIGEN_URL}` diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index b679d3f9..b551319b 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -96,7 +96,24 @@ namespace Grid { WilsonKernels(const ImplParams &p= ImplParams()); }; - + + + template + void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) + { + assert(0); + } + template + void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) + { + assert(0); + } + + } } #endif diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index b443ccf9..ce592540 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -38,6 +38,7 @@ namespace QCD { /////////////////////////////////////////////////////////// // Default to no assembler implementation /////////////////////////////////////////////////////////// + /* template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, @@ -45,6 +46,14 @@ void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & l { assert(0); } +template +void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) +{ + assert(0); +} + */ #if defined(AVX512) @@ -116,7 +125,7 @@ void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st #endif - +/* template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); @@ -136,5 +145,26 @@ template void WilsonKernels::DiracOptAsmDhopSite(StencilIm template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, std::vector > &buf, int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); + +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); + +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); +template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); +*/ }} From f76f281e58e5e703ea0cd9806b1f4c729014a785 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Fri, 9 Sep 2016 11:34:25 +0100 Subject: [PATCH 24/29] Cleaning files after fix --- lib/parallelIO/BinaryIO.h | 166 ++++++++++----------- lib/qcd/action/fermion/WilsonKernels.h | 31 ++-- lib/qcd/action/fermion/WilsonKernelsAsm.cc | 159 ++++++-------------- 3 files changed, 147 insertions(+), 209 deletions(-) diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h index 184209dc..5eddb57d 100644 --- a/lib/parallelIO/BinaryIO.h +++ b/lib/parallelIO/BinaryIO.h @@ -194,22 +194,22 @@ class BinaryIO { std::vector site({x,y,z,t}); - if ( grid->IsBoss() ) { - fin.read((char *)&file_object,sizeof(file_object)); - bytes += sizeof(file_object); - if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object)); - if(ieee32) le32toh_v((void *)&file_object,sizeof(file_object)); - if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object)); - if(ieee64) le64toh_v((void *)&file_object,sizeof(file_object)); + if (grid->IsBoss()) { + fin.read((char *)&file_object, sizeof(file_object)); + bytes += sizeof(file_object); + if (ieee32big) be32toh_v((void *)&file_object, sizeof(file_object)); + if (ieee32) le32toh_v((void *)&file_object, sizeof(file_object)); + if (ieee64big) be64toh_v((void *)&file_object, sizeof(file_object)); + if (ieee64) le64toh_v((void *)&file_object, sizeof(file_object)); - munge(file_object,munged,csum); + munge(file_object, munged, csum); } // The boss who read the file has their value poked pokeSite(munged,Umu,site); }}}} timer.Stop(); std::cout<IsBoss() ) { - - if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object)); - if(ieee32) htole32_v((void *)&file_object,sizeof(file_object)); - if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object)); - if(ieee64) htole64_v((void *)&file_object,sizeof(file_object)); + + if(ieee32big) htobe32_v((void *)&file_object,sizeof(file_object)); + if(ieee32) htole32_v((void *)&file_object,sizeof(file_object)); + if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object)); + if(ieee64) htole64_v((void *)&file_object,sizeof(file_object)); - // NB could gather an xstrip as an optimisation. - fout.write((char *)&file_object,sizeof(file_object)); - bytes+=sizeof(file_object); + // NB could gather an xstrip as an optimisation. + fout.write((char *)&file_object,sizeof(file_object)); + bytes+=sizeof(file_object); } }}}} timer.Stop(); std::cout<ThisRank() ){ - // std::cout << "rank" << rank<<" Getting state for index "<Broadcast(rank,(void *)&saved[0],bytes); if ( grid->IsBoss() ) { - Uint32Checksum((uint32_t *)&saved[0],bytes,csum); - fout.write((char *)&saved[0],bytes); + Uint32Checksum((uint32_t *)&saved[0],bytes,csum); + fout.write((char *)&saved[0],bytes); } } @@ -355,14 +355,14 @@ class BinaryIO { int l_idx=parallel.generator_idx(o_idx,i_idx); if ( grid->IsBoss() ) { - fin.read((char *)&saved[0],bytes); - Uint32Checksum((uint32_t *)&saved[0],bytes,csum); + fin.read((char *)&saved[0],bytes); + Uint32Checksum((uint32_t *)&saved[0],bytes,csum); } grid->Broadcast(0,(void *)&saved[0],bytes); if( rank == grid->ThisRank() ){ - parallel.SetState(saved,l_idx); + parallel.SetState(saved,l_idx); } } @@ -415,15 +415,15 @@ class BinaryIO { if ( d == 0 ) parallel[d] = 0; if (parallel[d]) { - range[d] = grid->_ldimensions[d]; - start[d] = grid->_processor_coor[d]*range[d]; - ioproc[d]= grid->_processor_coor[d]; + range[d] = grid->_ldimensions[d]; + start[d] = grid->_processor_coor[d]*range[d]; + ioproc[d]= grid->_processor_coor[d]; } else { - range[d] = grid->_gdimensions[d]; - start[d] = 0; - ioproc[d]= 0; + range[d] = grid->_gdimensions[d]; + start[d] = 0; + ioproc[d]= 0; - if ( grid->_processor_coor[d] != 0 ) IOnode = 0; + if ( grid->_processor_coor[d] != 0 ) IOnode = 0; } slice_vol = slice_vol * range[d]; } @@ -434,9 +434,9 @@ class BinaryIO { std::cout<< std::dec ; std::cout<< GridLogMessage<< "Parallel read I/O to "<< file << " with " <_ndimension;d++){ - std::cout<< range[d]; - if( d< grid->_ndimension-1 ) - std::cout<< " x "; + std::cout<< range[d]; + if( d< grid->_ndimension-1 ) + std::cout<< " x "; } std::cout << std::endl; } @@ -463,7 +463,7 @@ class BinaryIO { // need to implement these loops in Nd independent way with a lexico conversion for(int tlex=0;tlex tsite(nd); // temporary mixed up site std::vector gsite(nd); std::vector lsite(nd); @@ -472,8 +472,8 @@ class BinaryIO { Lexicographic::CoorFromIndex(tsite,tlex,range); for(int d=0;d_ldimensions[d]; // local site - gsite[d] = tsite[d]+start[d]; // global site + lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site + gsite[d] = tsite[d]+start[d]; // global site } ///////////////////////// @@ -487,29 +487,29 @@ class BinaryIO { // iorank reads from the seek //////////////////////////////// if (myrank == iorank) { - - fin.seekg(offset+g_idx*sizeof(fileObj)); - fin.read((char *)&fileObj,sizeof(fileObj)); - bytes+=sizeof(fileObj); - - if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj)); - if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj)); - if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj)); - if(ieee64) le64toh_v((void *)&fileObj,sizeof(fileObj)); - - munge(fileObj,siteObj,csum); + + fin.seekg(offset+g_idx*sizeof(fileObj)); + fin.read((char *)&fileObj,sizeof(fileObj)); + bytes+=sizeof(fileObj); + + if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj)); + if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj)); + if(ieee64big) be64toh_v((void *)&fileObj,sizeof(fileObj)); + if(ieee64) le64toh_v((void *)&fileObj,sizeof(fileObj)); + + munge(fileObj,siteObj,csum); - } + } // Possibly do transport through pt2pt if ( rank != iorank ) { - if ( (myrank == rank) || (myrank==iorank) ) { - grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj)); - } + if ( (myrank == rank) || (myrank==iorank) ) { + grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj)); + } } // Poke at destination if ( myrank == rank ) { - pokeLocalSite(siteObj,Umu,lsite); + pokeLocalSite(siteObj,Umu,lsite); } grid->Barrier(); // necessary? } @@ -520,7 +520,7 @@ class BinaryIO { timer.Stop(); std::cout<_ndimension-1 ) parallel[d] = 0; if (parallel[d]) { - range[d] = grid->_ldimensions[d]; - start[d] = grid->_processor_coor[d]*range[d]; - ioproc[d]= grid->_processor_coor[d]; + range[d] = grid->_ldimensions[d]; + start[d] = grid->_processor_coor[d]*range[d]; + ioproc[d]= grid->_processor_coor[d]; } else { - range[d] = grid->_gdimensions[d]; - start[d] = 0; - ioproc[d]= 0; + range[d] = grid->_gdimensions[d]; + start[d] = 0; + ioproc[d]= 0; - if ( grid->_processor_coor[d] != 0 ) IOnode = 0; + if ( grid->_processor_coor[d] != 0 ) IOnode = 0; } slice_vol = slice_vol * range[d]; @@ -577,9 +577,9 @@ class BinaryIO { grid->GlobalSum(tmp); std::cout<< GridLogMessage<< "Parallel write I/O from "<< file << " with " <_ndimension;d++){ - std::cout<< range[d]; - if( d< grid->_ndimension-1 ) - std::cout<< " x "; + std::cout<< range[d]; + if( d< grid->_ndimension-1 ) + std::cout<< " x "; } std::cout << std::endl; } @@ -610,7 +610,7 @@ class BinaryIO { // should aggregate a whole chunk and then write. // need to implement these loops in Nd independent way with a lexico conversion for(int tlex=0;tlex tsite(nd); // temporary mixed up site std::vector gsite(nd); std::vector lsite(nd); @@ -619,8 +619,8 @@ class BinaryIO { Lexicographic::CoorFromIndex(tsite,tlex,range); for(int d=0;d_ldimensions[d]; // local site - gsite[d] = tsite[d]+start[d]; // global site + lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site + gsite[d] = tsite[d]+start[d]; // global site } @@ -640,26 +640,26 @@ class BinaryIO { // Pair of nodes may need to do pt2pt send if ( rank != iorank ) { // comms is necessary - if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it - // Send to IOrank - grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj)); - } + if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it + // Send to IOrank + grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj)); + } } grid->Barrier(); // necessary? if (myrank == iorank) { - - munge(siteObj,fileObj,csum); + + munge(siteObj,fileObj,csum); - if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj)); - if(ieee32) htole32_v((void *)&fileObj,sizeof(fileObj)); - if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj)); - if(ieee64) htole64_v((void *)&fileObj,sizeof(fileObj)); - - fout.seekp(offset+g_idx*sizeof(fileObj)); - fout.write((char *)&fileObj,sizeof(fileObj)); - bytes+=sizeof(fileObj); + if(ieee32big) htobe32_v((void *)&fileObj,sizeof(fileObj)); + if(ieee32) htole32_v((void *)&fileObj,sizeof(fileObj)); + if(ieee64big) htobe64_v((void *)&fileObj,sizeof(fileObj)); + if(ieee64) htole64_v((void *)&fileObj,sizeof(fileObj)); + + fout.seekp(offset+g_idx*sizeof(fileObj)); + fout.write((char *)&fileObj,sizeof(fileObj)); + bytes+=sizeof(fileObj); } } @@ -668,7 +668,7 @@ class BinaryIO { timer.Stop(); std::cout< - void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } + void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) + { + assert(0); + } template - void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) - { - assert(0); - } - - + void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) + { + assert(0); + } + } } #endif diff --git a/lib/qcd/action/fermion/WilsonKernelsAsm.cc b/lib/qcd/action/fermion/WilsonKernelsAsm.cc index ce592540..d2cb4285 100644 --- a/lib/qcd/action/fermion/WilsonKernelsAsm.cc +++ b/lib/qcd/action/fermion/WilsonKernelsAsm.cc @@ -1,4 +1,4 @@ - /************************************************************************************* +/************************************************************************************* Grid physics library, www.github.com/paboyle/Grid @@ -26,77 +26,56 @@ Author: paboyle 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. See the full license in the file "LICENSE" in the top level distribution directory - *************************************************************************************/ - /* END LEGAL */ +*************************************************************************************/ +/* END LEGAL */ #include namespace Grid { -namespace QCD { - - - /////////////////////////////////////////////////////////// - // Default to no assembler implementation - /////////////////////////////////////////////////////////// - /* -template -void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) -{ - assert(0); -} -template -void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) -{ - assert(0); -} - */ - + namespace QCD { + #if defined(AVX512) - - - /////////////////////////////////////////////////////////// - // If we are AVX512 specialise the single precision routine - /////////////////////////////////////////////////////////// - + + + /////////////////////////////////////////////////////////// + // If we are AVX512 specialise the single precision routine + /////////////////////////////////////////////////////////// + #include #include - -static Vector signs; - -int setupSigns(void ){ - Vector bother(2); - signs = bother; - vrsign(signs[0]); - visign(signs[1]); - return 1; -} -static int signInit = setupSigns(); - + + static Vector signs; + + int setupSigns(void ){ + Vector bother(2); + signs = bother; + vrsign(signs[0]); + visign(signs[1]); + return 1; + } + static int signInit = setupSigns(); + #define label(A) ilabel(A) #define ilabel(A) ".globl\n" #A ":\n" - + #define MAYBEPERM(A,perm) if (perm) { A ; } #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf) #define FX(A) WILSONASM_ ##A - + #undef KERNEL_DAG -template<> -void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) + template<> + void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include - + #define KERNEL_DAG -template<> -void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) + template<> + void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include - + #undef VMOVIDUP #undef VMOVRDUP #undef MAYBEPERM @@ -107,64 +86,22 @@ void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,Lebesgue #define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C) #define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C) #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) - + #undef KERNEL_DAG -template<> -void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) + template<> + void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include - + #define KERNEL_DAG -template<> -void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) + template<> + void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, + std::vector > &buf, + int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) #include - + #endif - - -/* -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); - -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); - -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); - -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -template void WilsonKernels::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, - std::vector > &buf, - int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out); -*/ -}} + } +} From 5df5d52d417decf8d9a229a674a9d4a082149767 Mon Sep 17 00:00:00 2001 From: Guido Cossu Date: Mon, 12 Sep 2016 17:17:20 +0100 Subject: [PATCH 25/29] Fix for the Intel compiler --- lib/qcd/QCD.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/qcd/QCD.h b/lib/qcd/QCD.h index ec11c837..0e9d3c17 100644 --- a/lib/qcd/QCD.h +++ b/lib/qcd/QCD.h @@ -55,11 +55,14 @@ namespace QCD { ////////////////////////////////////////////////////////////////////////////// // QCD iMatrix types // Index conventions: Lorentz x Spin x Colour + // note: static const int or constexpr will work for type deductions + // with the intel compiler (up to version 17) ////////////////////////////////////////////////////////////////////////////// - static const int ColourIndex = 2; - static const int SpinIndex = 1; - static const int LorentzIndex= 0; + #define ColourIndex 2 + #define SpinIndex 1 + #define LorentzIndex 0 + // Also should make these a named enum type static const int DaggerNo=0; static const int DaggerYes=1; From 2e74520821e4b0549d83ffcf61effd5334a589a2 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Fri, 16 Sep 2016 15:25:49 +0100 Subject: [PATCH 26/29] removed libtool use (BG/Q compatibility) --- configure.ac | 4 +--- lib/Makefile.am | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index d064345b..a987eabb 100644 --- a/configure.ac +++ b/configure.ac @@ -10,6 +10,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AC_LANG(C++) CXXFLAGS="-O3 $CXXFLAGS" AC_PROG_CXX +AC_PROG_RANLIB ############ openmp ############### AC_OPENMP @@ -22,9 +23,6 @@ AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS" AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS" fi -############ libtool ############### -LT_INIT - ############### Checks for header files AC_CHECK_HEADERS(stdint.h) AC_CHECK_HEADERS(mm_malloc.h) diff --git a/lib/Makefile.am b/lib/Makefile.am index a7ef229a..bc752af5 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -17,8 +17,8 @@ endif include Make.inc include Eigen.inc -lib_LTLIBRARIES = libGrid.la +lib_LIBRARIES = libGrid.a -libGrid_la_SOURCES = $(CCFILES) $(extra_sources) -libGrid_ladir = $(pkgincludedir) +libGrid_a_SOURCES = $(CCFILES) $(extra_sources) +libGrid_adir = $(pkgincludedir) nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h From 0724f7af75764f7617dc945a7382ff6d289c1cf4 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Mon, 19 Sep 2016 18:09:12 +0100 Subject: [PATCH 27/29] QPX single precision implementation --- lib/Stencil.h | 2 +- lib/simd/Grid_qpx.h | 151 ++++++++++++++++++++++++++++++++++++++------ tests/Test_simd.cc | 3 +- 3 files changed, 132 insertions(+), 24 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index 47c1b977..79479879 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -265,7 +265,7 @@ // _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0); } inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) { - _mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0); + //_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0); local = _entries[ent]._is_local; perm = _entries[ent]._permute; if (perm) ptype = _permute_type[point]; diff --git a/lib/simd/Grid_qpx.h b/lib/simd/Grid_qpx.h index 6ac83290..07933f52 100644 --- a/lib/simd/Grid_qpx.h +++ b/lib/simd/Grid_qpx.h @@ -27,20 +27,31 @@ namespace Grid { namespace Optimization { + typedef struct + { + float v0,v1,v2,v3; + } vector4float; + inline std::ostream & operator<<(std::ostream& stream, const vector4double a) { stream << "{"< 2301 return vec_perm(v, v, vec_gpci(02301)); }; @@ -198,6 +291,12 @@ namespace Optimization { static inline vector4double Permute3(vector4double v){ return v; }; + + // Complex float + FLOAT_WRAP_1(Permute0, static inline) + FLOAT_WRAP_1(Permute1, static inline) + FLOAT_WRAP_1(Permute2, static inline) + FLOAT_WRAP_1(Permute3, static inline) }; struct Rotate{ @@ -218,31 +317,42 @@ namespace Optimization { default: assert(0); } } + + static inline vector4float rotate(vector4float v, int n){ + vector4double vd, rd; + vector4float r; + + vd = Vset()(v); + rd = rotate(vd, n); + Vstore()(rd, r); + + return r; + } }; //Complex float Reduce template<> inline Grid::ComplexF - Reduce::operator()(vector4double v) { //2 complex - vector4double v1,v2; + Reduce::operator()(vector4float v) { //2 complex + vector4float v1,v2; v1 = Optimization::Permute::Permute0(v); - v1 = vec_add(v1, v); + v1 = Optimization::Sum()(v1, v); - return Grid::ComplexF((float)vec_extract(v1, 0), (float)vec_extract(v1, 1)); + return Grid::ComplexF(v1.v0, v1.v1); } //Real float Reduce template<> inline Grid::RealF - Reduce::operator()(vector4double v){ //4 floats - vector4double v1,v2; + Reduce::operator()(vector4float v){ //4 floats + vector4float v1,v2; v1 = Optimization::Permute::Permute0(v); - v1 = vec_add(v1, v); + v1 = Optimization::Sum()(v1, v); v2 = Optimization::Permute::Permute1(v1); - v1 = vec_add(v1, v2); + v1 = Optimization::Sum()(v1, v2); - return (float)vec_extract(v1, 0); + return v1.v0; } @@ -283,10 +393,9 @@ namespace Optimization { //////////////////////////////////////////////////////////////////////////////// // Here assign types - -typedef vector4double SIMD_Ftype; // Single precision type -typedef vector4double SIMD_Dtype; // Double precision type -typedef int SIMD_Itype; // Integer type +typedef Optimization::vector4float SIMD_Ftype; // Single precision type +typedef vector4double SIMD_Dtype; // Double precision type +typedef int SIMD_Itype; // Integer type // prefetch utilities inline void v_prefetch0(int size, const char *ptr){}; diff --git a/tests/Test_simd.cc b/tests/Test_simd.cc index 5d84b9ef..189f0559 100644 --- a/tests/Test_simd.cc +++ b/tests/Test_simd.cc @@ -157,10 +157,9 @@ void Tester(const functor &func) std::cout << GridLogMessage << " " << func.name() << std::endl; std::cout << GridLogDebug << v_input1 << std::endl; + std::cout << GridLogDebug << v_input2 << std::endl; std::cout << GridLogDebug << v_result << std::endl; - - int ok=0; for(int i=0;i1.0e-7){ From 65ca174dbbb938cef617a40aea7f106f23babab4 Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 20 Sep 2016 11:25:06 +0100 Subject: [PATCH 28/29] gitignore update --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index f972b7f5..e82ecf9c 100644 --- a/.gitignore +++ b/.gitignore @@ -94,6 +94,10 @@ build.sh ################ lib/Eigen/* +# FFTW source # +################ +lib/fftw/* + # libtool macros # ################## m4/lt* From d2573189d8d79d20fb5d3d7ea115d996ad431bca Mon Sep 17 00:00:00 2001 From: Antonin Portelli Date: Tue, 20 Sep 2016 12:30:24 +0100 Subject: [PATCH 29/29] build system: FFTW fix --- configure.ac | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/configure.ac b/configure.ac index a987eabb..db379510 100644 --- a/configure.ac +++ b/configure.ac @@ -73,23 +73,6 @@ AC_ARG_WITH([fftw], [AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"] [AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"]) -# -# What about the MKL library replacement for fftw3 ? How do we know if fftw_execute -# can be found in MKL? -# -AC_CHECK_LIB([fftw3],[fftw_execute], - [AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] [ac_fftw=yes], - [ac_fftw=no]) - -case ${ac_fftw} in - no) - echo WARNING libfftw3 not found FFT routines will not work - ;; - yes) - AM_LDFLAGS="$AM_LDFLAGS -lfftw3 -lfftw3f" -esac - - ################ Get compiler informations AC_LANG([C++]) AX_CXX_COMPILE_STDCXX_11([noext],[mandatory]) @@ -103,7 +86,6 @@ AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"], ############### Checks for library functions CXXFLAGS_CPY=$CXXFLAGS LDFLAGS_CPY=$LDFLAGS -LIBS_CPY=$LIBS CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS" LDFLAGS="$AM_LDFLAGS $LDFLAGS" AC_CHECK_FUNCS([gettimeofday]) @@ -122,6 +104,11 @@ if test "${ac_LAPACK}x" != "nox"; then AC_CHECK_LIB([lapack],[LAPACKE_sbdsdc],[], [AC_MSG_ERROR("LAPACK enabled but library not found")]) fi +AC_CHECK_LIB([fftw3],[fftw_execute], + [AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])] + [have_fftw=true] + [LIBS="$LIBS -lfftw3 -lfftw3f"], + [AC_MSG_WARN([**** FFTW library not found, Grid can still compile but FFT-based routines will not work ****])]) CXXFLAGS=$CXXFLAGS_CPY LDFLAGS=$LDFLAGS_CPY @@ -337,12 +324,15 @@ Summary of configuration for $PACKAGE v$VERSION - RNG choice : ${ac_RNG} - GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi` - LAPACK : ${ac_LAPACK} -- FFTW : ${ac_fftw} +- FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi` - build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi` - graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi` ----- BUILD FLAGS ------------------------------------- -- CXXFLAGS: "${AM_CXXFLAGS} ${CXXFLAGS}" -- LDFLAGS: "${AM_LDFLAGS} ${LDFLAGS}" -- LIBS: "${LIBS} " +- CXXFLAGS: +`echo ${AM_CXXFLAGS} ${CXXFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'` +- LDFLAGS: +`echo ${AM_LDFLAGS} ${LDFLAGS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'` +- LIBS: +`echo ${LIBS} | sed 's/ -/\n\t-/g' | sed 's/^-/\t-/g'` ------------------------------------------------------- "