Mirror of https://github.com/paboyle/Grid.git (synced 2025-04-04 19:25:56 +01:00)

Commit c097fd041a: Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering
.gitignore (vendored)
@@ -49,6 +49,7 @@ config.status
 .deps
 Make.inc
 eigen.inc
+Eigen.inc
 
 # http://www.gnu.org/software/autoconf #
 ########################################
@@ -138,7 +138,7 @@ The following options can be used with the `--enable-comms=` option to target dif
 | `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem `       | Cray SHMEM communications |
 
-For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified, `configure` will scan through a list of default names).
+For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified, `configure` will scan through a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
 
 ### Possible SIMD types
 
benchmarks/Benchmark_mooee.cc (new file, 237 lines)
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./benchmarks/Benchmark_dwf.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
  const int Ls=8;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::cout << GridLogMessage << "Making Vec5d innermost grids"<<std::endl;
  GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
  GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
  GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  std::cout << GridLogMessage << "Seeded"<<std::endl;

  LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(RNG4,Umu);

  std::cout << GridLogMessage << "made random gauge fields"<<std::endl;

  RealD mass=0.1;
  RealD M5  =1.8;
  RealD NP = UGrid->_Nprocessors;

  if (1)
  {
    const int ncall=100;

    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

    GridParallelRNG RNG5(FGrid);
    LatticeFermion src(FGrid); random(RNG5,src);
    LatticeFermion result(FGrid);

    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

    FGrid->Barrier();

    double t0,t1;
    t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Dhop(src,result,0);
    }
    t1=usecond();
    FGrid->Barrier();

    std::cout<<GridLogMessage << "Called Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;

    LatticeFermion r_eo(FGrid);
    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);

    pickCheckerboard(Even,src_e,src);
    pickCheckerboard(Odd,src_o,src);

    setCheckerboard(r_eo,src_o);
    setCheckerboard(r_eo,src_e);

    r_e = zero;
    r_o = zero;

    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.DhopEO(src_o, r_e, DaggerNo);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;

    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.Mooee(src_o, r_o);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;

    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.MooeeInv(src_o, r_o);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;

    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.Meooe(src_o, r_e);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;

  }

  if (1)
  {
    const int ncall=100;

    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionVec5dR::Dhop "<<std::endl;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

    GridParallelRNG RNG5(sFGrid);
    LatticeFermion src(sFGrid); random(RNG5,src);
    LatticeFermion sref(sFGrid);
    LatticeFermion result(sFGrid);

    std::cout<<GridLogMessage << "Constructing Vec5D Dw "<<std::endl;
    DomainWallFermionVec5dR Dw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,M5);

    std::cout<<GridLogMessage << "Calling Dhop "<<std::endl;
    FGrid->Barrier();

    double t0,t1;
    t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Dhop(src,result,0);
    }
    t1=usecond();
    FGrid->Barrier();

    std::cout<<GridLogMessage << "Called Vec5D Dhop "<< (t1-t0)/ncall<<" us"<<std::endl;

    LatticeFermion r_eo(sFGrid);
    LatticeFermion src_e (sFrbGrid);
    LatticeFermion src_o (sFrbGrid);
    LatticeFermion r_e   (sFrbGrid);
    LatticeFermion r_o   (sFrbGrid);

    pickCheckerboard(Even,src_e,src);
    pickCheckerboard(Odd,src_o,src);

    setCheckerboard(r_eo,src_o);
    setCheckerboard(r_eo,src_e);

    r_e = zero;
    r_o = zero;

    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.DhopEO(src_o, r_e, DaggerNo);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Vec5D DhopEO "<< (t1-t0)/ncall<<" us"<<std::endl;

    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.Mooee(src_o, r_o);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Vec5D Mooee "<< (t1-t0)/ncall<<" us"<<std::endl;

    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.MooeeInv(src_o, r_o);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Vec5D MooeeInv "<< (t1-t0)/ncall<<" us"<<std::endl;

    FGrid->Barrier();
    t0=usecond();
    for (int i = 0; i < ncall; i++) {
      Dw.Meooe(src_o, r_e);
    }
    t1=usecond();
    FGrid->Barrier();
    std::cout<<GridLogMessage << "Called Vec5D Meooe "<< (t1-t0)/ncall<<" us"<<std::endl;

  }

  Grid_finalize();
}
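A brief orientation for the operators timed above (standard even-odd language, not text from the commit): Dhop is the full hopping term, while the checkerboarded routines correspond to blocks of the red-black decomposition

\[
  M = \begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix},
  \qquad
  \hat M = M_{oo} - M_{oe}\, M_{ee}^{-1} M_{eo},
\]

so Mooee and MooeeInv time the diagonal block and its inverse, Meooe and DhopEO time the hopping between checkerboards, and \(\hat M\) is the Schur-complement operator used in even-odd preconditioned solves.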
@@ -206,8 +206,8 @@ case ${ax_cv_cxx_compiler_vendor} in
             AC_DEFINE([AVX1],[1],[AVX intrinsics])
             SIMD_FLAGS='-mavx -xavx';;
         AVXFMA)
-            AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
-            SIMD_FLAGS='-mavx -mfma';;
+            AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
+            SIMD_FLAGS='-mavx -fma';;
         AVX2)
             AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
             SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
@@ -290,7 +290,7 @@ esac
 case ${ac_COMMS} in
     *-auto)
         LX_FIND_MPI
-        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
+        if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["The configure could not find the MPI compilation flags. N.B. The -auto mode is not supported by Cray wrappers. Use the non -auto version in this case."]); fi
         AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
         AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
         AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
@@ -244,7 +244,10 @@ namespace Grid {
 	    pokeLocalSite(s,pgbuf,cbuf);
 	  }
 	}
-	result = Cshift(result,dim,L);
+	if (p != processors[dim] - 1)
+	  {
+	    result = Cshift(result,dim,L);
+	  }
       }
 
       // Loop over orthog coords
@@ -287,10 +290,10 @@ namespace Grid {
 	    cgbuf = clbuf;
 	    cgbuf[dim] = clbuf[dim]+L*pc;
 	    peekLocalSite(s,pgbuf,cgbuf);
-	    s = s * div;
 	    pokeLocalSite(s,result,clbuf);
 	  }
 	}
+      result = result*div;
 
       // destroying plan
       FFTW<scalar>::fftw_destroy_plan(p);
@@ -203,6 +203,7 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
 typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
 typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
 typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
+
 typedef MobiusFermion<WilsonImplR> MobiusFermionR;
 typedef MobiusFermion<WilsonImplF> MobiusFermionF;
 typedef MobiusFermion<WilsonImplD> MobiusFermionD;
@@ -211,6 +212,20 @@ typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
 typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
 typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
 
+// Ls vectorised
+typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
+typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
+typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
+
+typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
+typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
+typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
+
+typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
+typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
+typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
+
+
 typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
 typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
 typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@@ -269,6 +284,7 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
 typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
 typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
 
+
 }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
@@ -194,6 +194,11 @@ void WilsonFermion5D<Impl>::Report(void)
     std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
 
+    RealD Fullmflops = 1344*volume*DhopCalls/(DhopComputeTime+DhopCommTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+
+
   }
 
   if ( DerivCalls > 0 ) {
@@ -209,12 +214,15 @@ void WilsonFermion5D<Impl>::Report(void)
     RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
     std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
     std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
-  }
+
+    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; }
 
   if (DerivCalls > 0 || DhopCalls > 0){
-    std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl; Stencil.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil" <<std::endl; Stencil.Report();
     std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
-    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl; StencilOdd.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
   }
 }
 
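For clarity, the "full" figures added in these two hunks use the same flop counts as the per-call numbers (1344 flops per site for Dhop, 144 for the derivative term) but fold the communication time into the denominator; schematically, matching the code,

\[
  \mathrm{mflops}_{\text{full}}
    = \frac{N_{\rm flop/site} \times V \times N_{\rm calls}}{T_{\rm compute} + T_{\rm comm}} \times \frac{1}{2},
\]

with the factor 1/2 accounting for red-black counting (each call acts on one checkerboard), exactly as the in-line comment states.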
@@ -167,7 +167,7 @@ namespace Optimization {
     }
     //Integer
     inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
       __m128i a0,a1;
       __m128i b0,b1;
       a0 = _mm256_extractf128_si256(a,0);
@@ -195,7 +195,7 @@ namespace Optimization {
     }
     //Integer
     inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1) || defined (AVXFMA4)
+#if defined (AVX1) || defined (AVXFMA) || defined (AVXFMA4)
       __m128i a0,a1;
       __m128i b0,b1;
       a0 = _mm256_extractf128_si256(a,0);
@@ -216,7 +216,7 @@ namespace Optimization {
   struct MultComplex{
     // Complex float
     inline __m256 operator()(__m256 a, __m256 b){
 #if defined (AVX1)
       __m256 ymm0,ymm1,ymm2;
       ymm0 = _mm256_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
       ymm0 = _mm256_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
@@ -233,7 +233,7 @@ namespace Optimization {
       a_imag = _mm256_mul_ps( a_imag,tmp ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
       return _mm256_maddsub_ps( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       __m256 a_real = _mm256_moveldup_ps( a ); // Ar Ar
       __m256 a_imag = _mm256_movehdup_ps( a ); // Ai Ai
       a_imag = _mm256_mul_ps( a_imag, _mm256_shuffle_ps( b,b, _MM_SELECT_FOUR_FOUR(2,3,0,1) )); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@@ -264,7 +264,7 @@ namespace Optimization {
 	IF IMM0[3] = 0
 	THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI; // Ox5 r<->i ; 0xC unchanged
       */
 #if defined (AVX1)
       __m256d ymm0,ymm1,ymm2;
       ymm0 = _mm256_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
       ymm0 = _mm256_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
@@ -279,7 +279,7 @@ namespace Optimization {
       a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
       return _mm256_maddsub_pd( a_real, b, a_imag ); // Ar Br , Ar Bi +- Ai Bi = ArBr-AiBi , ArBi+AiBr
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       __m256d a_real = _mm256_movedup_pd( a ); // Ar Ar
       __m256d a_imag = _mm256_shuffle_pd(a,a,0xF);//aiai
       a_imag = _mm256_mul_pd( a_imag, _mm256_permute_pd( b, 0x5 ) ); // (Ai, Ai) * (Bi, Br) = Ai Bi, Ai Br
@@ -320,7 +320,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
       a= _mm256_macc_ps(b,c,a);
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       a= _mm256_fmadd_ps( b, c, a);
 #endif
     }
@@ -332,7 +332,7 @@ namespace Optimization {
 #if defined (AVXFMA4)
       a= _mm256_macc_pd(b,c,a);
 #endif
-#if defined (AVX2)
+#if defined (AVX2) || defined (AVXFMA)
       a= _mm256_fmadd_pd( b, c, a);
 #endif
     }
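For context on the AVXFMA guard changes in the two mult_add hunks above: the AVXFMA target now selects the three-operand FMA3 intrinsics, while AVXFMA4 keeps the four-operand `_mm256_macc_*` forms; both compute a = b*c + a. A minimal standalone sketch of the same dispatch (the macro names mirror the diff, but this file and its plain-AVX fallback branch are illustrative, not part of the commit):

// fma_dispatch_sketch.cc -- illustration only; FMA4 intrinsics may need <x86intrin.h> with GCC/Clang.
#include <immintrin.h>

static inline void mult_add_ps(__m256 &a, __m256 b, __m256 c) {
#if defined (AVXFMA4)
  a = _mm256_macc_ps(b, c, a);               // FMA4 encoding: a <- b*c + a
#elif defined (AVX2) || defined (AVXFMA)
  a = _mm256_fmadd_ps(b, c, a);              // FMA3 encoding: same arithmetic, fused
#else
  a = _mm256_add_ps(_mm256_mul_ps(b, c), a); // plain AVX: separate multiply and add
#endif
}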
@@ -347,7 +347,7 @@ namespace Optimization {
     }
     // Integer
     inline __m256i operator()(__m256i a, __m256i b){
-#if defined (AVX1)
+#if defined (AVX1) || defined (AVXFMA)
       __m128i a0,a1;
       __m128i b0,b1;
       a0 = _mm256_extractf128_si256(a,0);
@@ -244,7 +244,22 @@ namespace Optimization {
       return a*b;
     }
   };
 
+  struct Div{
+    // Real double
+    inline vector4double operator()(vector4double a, vector4double b){
+      return vec_swdiv(a, b);
+    }
+
+    // Real float
+    FLOAT_WRAP_2(operator(), inline)
+
+    // Integer
+    inline int operator()(int a, int b){
+      return a/b;
+    }
+  };
+
   struct Conj{
     // Complex double
     inline vector4double operator()(vector4double v){
@@ -413,6 +428,7 @@ template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
 typedef Optimization::Sum SumSIMD;
 typedef Optimization::Sub SubSIMD;
 typedef Optimization::Mult MultSIMD;
+typedef Optimization::Div DivSIMD;
 typedef Optimization::MultComplex MultComplexSIMD;
 typedef Optimization::Conj ConjSIMD;
 typedef Optimization::TimesMinusI TimesMinusISIMD;
@@ -44,7 +44,7 @@ directory
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif
-#if defined(AVX1) || defined(AVX2) || defined(AVXFMA4)
+#if defined(AVX1) || defined (AVXFMA) || defined(AVX2) || defined(AVXFMA4)
 #include "Grid_avx.h"
 #endif
 #if defined AVX512
@@ -50,6 +50,12 @@ public:
   template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1*i2;}
   std::string name(void) const { return std::string("Times"); }
 };
+class funcDivide {
+public:
+  funcDivide() {};
+  template<class vec> void operator()(vec &rr,vec &i1,vec &i2) const { rr = i1/i2;}
+  std::string name(void) const { return std::string("Divide"); }
+};
 class funcConj {
 public:
   funcConj() {};
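As a side note, the functor interface shared by these test classes can be exercised on plain scalars as well as on Grid's SIMD vectors. A tiny self-contained check of the new funcDivide (not the repository's Tester, just a sketch reusing the class from the hunk above):

// funcDivide_sketch.cc -- illustration only.
#include <cassert>
#include <string>

class funcDivide {
public:
  funcDivide() {};
  template<class vec> void operator()(vec &rr, vec &i1, vec &i2) const { rr = i1/i2; }
  std::string name(void) const { return std::string("Divide"); }
};

int main() {
  double r = 0.0, x = 3.0, y = 1.5;
  funcDivide()(r, x, y);   // scalar path: r = x / y
  assert(r == 2.0);        // 3.0 / 1.5 is exact in binary floating point
  return 0;
}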
@@ -341,6 +347,7 @@ int main (int argc, char ** argv)
   Tester<RealF,vRealF>(funcPlus());
   Tester<RealF,vRealF>(funcMinus());
   Tester<RealF,vRealF>(funcTimes());
+  Tester<RealF,vRealF>(funcDivide());
   Tester<RealF,vRealF>(funcAdj());
   Tester<RealF,vRealF>(funcConj());
   Tester<RealF,vRealF>(funcInnerProduct());
|
|||||||
Tester<RealD,vRealD>(funcPlus());
|
Tester<RealD,vRealD>(funcPlus());
|
||||||
Tester<RealD,vRealD>(funcMinus());
|
Tester<RealD,vRealD>(funcMinus());
|
||||||
Tester<RealD,vRealD>(funcTimes());
|
Tester<RealD,vRealD>(funcTimes());
|
||||||
|
Tester<RealD,vRealD>(funcDivide());
|
||||||
Tester<RealD,vRealD>(funcAdj());
|
Tester<RealD,vRealD>(funcAdj());
|
||||||
Tester<RealD,vRealD>(funcConj());
|
Tester<RealD,vRealD>(funcConj());
|
||||||
Tester<RealD,vRealD>(funcInnerProduct());
|
Tester<RealD,vRealD>(funcInnerProduct());
|
||||||
|
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
   for(int mu=0;mu<4;mu++){
     RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
     LatticeCoordinate(coor,mu);
-    C = C - (TwoPiL * p[mu]) * coor;
+    C = C + (TwoPiL * p[mu]) * coor;
   }
 
   C = exp(C*ci);
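With the sign fixed as above, the test source built in this loop is the plane wave

\[
  C(x) = \exp\!\left( i \sum_{\mu=0}^{3} \frac{2\pi\, p_\mu}{L_\mu}\, x_\mu \right),
\]

whose discrete Fourier transform is supported on a single momentum mode, which is what the comparisons later in the test rely on.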
@@ -78,10 +78,11 @@ int main (int argc, char ** argv)
 
   FFT theFFT(&Fine);
 
-  theFFT.FFT_dim(Ctilde,C,0,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,1,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,2,FFT::forward); C=Ctilde; std::cout << theFFT.MFlops()<<std::endl;
-  theFFT.FFT_dim(Ctilde,C,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  Ctilde = C;
+  theFFT.FFT_dim(Ctilde,Ctilde,0,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,Ctilde,1,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,Ctilde,2,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
+  theFFT.FFT_dim(Ctilde,Ctilde,3,FFT::forward); std::cout << theFFT.MFlops()<<std::endl;
 
   // C=zero;
   // Ctilde = where(abs(Ctilde)<1.0e-10,C,Ctilde);
@@ -93,10 +94,11 @@ int main (int argc, char ** argv)
   C=C-Ctilde;
   std::cout << "diff scalar "<<norm2(C) << std::endl;
 
-  theFFT.FFT_dim(Stilde,S,0,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,1,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,2,FFT::forward); S=Stilde;std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
-  theFFT.FFT_dim(Stilde,S,3,FFT::forward);std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
+  Stilde = S;
+  theFFT.FFT_dim(Stilde,Stilde,0,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,1,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,2,FFT::forward); std::cout << theFFT.MFlops()<< " "<<theFFT.USec() <<std::endl;
+  theFFT.FFT_dim(Stilde,Stilde,3,FFT::forward); std::cout << theFFT.MFlops()<<" "<<theFFT.USec() <<std::endl;
 
   SpinMatrixF Sp;
   Sp = zero; Sp = Sp+cVol;
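The restructured calls above (for both Ctilde and Stilde) compute the 4d transform as a chain of in-place 1d FFTs; this is just the separability of the multidimensional DFT,

\[
  \tilde f(k_0,k_1,k_2,k_3)
    = \sum_{x_0}\sum_{x_1}\sum_{x_2}\sum_{x_3} f(x)\,
      \prod_{\mu=0}^{3} e^{-\tfrac{2\pi i}{L_\mu} k_\mu x_\mu}
  \quad \text{(sign convention as in FFTW's forward transform)},
\]

so transforming dimension 0 into the tilde field and then feeding that result back through dimensions 1, 2 and 3 gives the full transform without the intermediate C=Ctilde / S=Stilde copies of the old code.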