Mirror of https://github.com/paboyle/Grid.git
Synced 2025-06-14 22:07:05 +01:00

Compare commits: chulwoo-de ... v0.5.1
45 commits
SHA1
446c768cd3
bfe14000a9
680645f849
3fc6e03ad1
2d6614f3a1
4e041b5103
712b9a3489
bdaa5b1767
8fcefc021a
1445189361
05c884a62a
a25bec87d9
2d8bb4c594
51cb2d4328
6d58cb2a68
c8b35d960c
532f41dd61
661b0ab45d
4bc08ed995
b2933a0557
db057cc276
22e88eaf54
09fe3caebd
5e02392f9c
17a8f51a9b
1b7f88dd00
d6737e4bd8
d539888e57
86187d7cca
87418e7df1
55f65b81b5
d9408893b3
05acc22920
8ac021de73
e503ef5590
a7682b0060
d4c9d71fc8
786ca52c43
048ac04abc
f78d89bcbe
53d06046b0
5d3a1a025d
139cc5f1ae
1c0e922585
9d5f693cbe
1	.gitignore (vendored)

@@ -62,6 +62,7 @@ stamp-h1
 config.sub
 config.guess
 INSTALL
+.dirstamp
 
 # Packages #
 ############
27	.travis.yml

@@ -1,5 +1,9 @@
 language: cpp
 
+cache:
+  directories:
+    - clang
+
 matrix:
   include:
     - os: osx
@@ -38,29 +42,31 @@ matrix:
       apt:
         sources:
          - ubuntu-toolchain-r-test
-         - llvm-toolchain-precise-3.7
         packages:
-         - clang-3.7
+         - g++-4.8
         - libmpfr-dev
         - libgmp-dev
        - libmpc-dev
        - binutils-dev
-      env: VERSION=-3.7
+      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
      addons:
       apt:
        sources:
         - ubuntu-toolchain-r-test
-        - llvm-toolchain-precise-3.8
       packages:
-        - clang-3.8
+        - g++-4.8
        - libmpfr-dev
        - libgmp-dev
        - libmpc-dev
        - binutils-dev
-      env: VERSION=-3.8
+      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 
 before_install:
+    - export GRIDDIR=`pwd`
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
@@ -68,6 +74,11 @@ before_install:
 install:
     - export CC=$CC$VERSION
     - export CXX=$CXX$VERSION
+    - echo $PATH
+    - which $CC
+    - $CC  --version
+    - which $CXX
+    - $CXX --version
     - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
 
 script:
@@ -77,3 +88,7 @@ script:
     - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1
+    - make clean
+    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=double --enable-simd=SSE4 --enable-comms=none
+    - make -j4
+    - ./benchmarks/Benchmark_dwf --threads 1
4	VERSION (new file)

@@ -0,0 +1,4 @@
+Version : 0.5.0
+
+- AVX512, AVX2, AVX, SSE good
+- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above
@@ -27,6 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 *************************************************************************************/
 /* END LEGAL */
 #include <Grid.h>
+#include <PerfCount.h>
 
 using namespace std;
 using namespace Grid;
@@ -45,6 +46,10 @@ struct scal {
 };
 
 bool overlapComms = false;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+
 
 int main (int argc, char ** argv)
 {
@@ -58,12 +63,18 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
   std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=8;
+  const int Ls=16;
   GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid  = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridCartesian         * sFGrid  = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
 
@@ -78,7 +89,9 @@ int main (int argc, char ** argv)
 
   ColourMatrix cm = Complex(1.0,0.0);
 
-  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid);
+  random(RNG4,Umu);
+
   LatticeGaugeField Umu5d(FGrid);
 
   // replicate across fifth dimension
@@ -119,14 +132,21 @@ int main (int argc, char ** argv)
 
   RealD NP = UGrid->_Nprocessors;
 
+  for(int doasm=1;doasm<2;doasm++){
+
+    QCD::WilsonKernelsStatic::AsmOpt=doasm;
+
   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 
   std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall=100;
-  {
+  int ncall =10;
+  if (1) {
+
     double t0=usecond();
     for(int i=0;i<ncall;i++){
+      __SSC_START;
       Dw.Dhop(src,result,0);
+      __SSC_STOP;
     }
     double t1=usecond();
 
@@ -140,10 +160,121 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
     err = ref-result;
     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
-    Dw.Report();
+    //  Dw.Report();
   }
 
-  exit(0);
+  if (1)
+  {
+    typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+    LatticeFermion ssrc(sFGrid);
+    LatticeFermion sref(sFGrid);
+    LatticeFermion sresult(sFGrid);
+    WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
+
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVector tmp;
+      peekSite(tmp,src,site);
+      pokeSite(tmp,ssrc,site);
+    }}}}}
+
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      __SSC_START;
+      sDw.Dhop(ssrc,sresult,0);
+      __SSC_STOP;
+    }
+    double t1=usecond();
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    //  sDw.Report();
+
+    if(0){
+      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+        sDw.Dhop(ssrc,sresult,0);
+        PerformanceCounter Counter(i);
+        Counter.Start();
+        sDw.Dhop(ssrc,sresult,0);
+        Counter.Stop();
+        Counter.Report();
+      }
+    }
+
+
+
+    RealF sum=0;
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVector normal, simd;
+      peekSite(normal,result,site);
+      peekSite(simd,sresult,site);
+      sum=sum+norm2(normal-simd);
+      //  std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+      //  std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
+      //  std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
+    }}}}}
+    std::cout<<" difference between normal and simd is "<<sum<<std::endl;
+
+
+    if (1) {
+
+      LatticeFermion sr_eo(sFGrid);
+      LatticeFermion serr(sFGrid);
+
+      LatticeFermion ssrc_e (sFrbGrid);
+      LatticeFermion ssrc_o (sFrbGrid);
+      LatticeFermion sr_e   (sFrbGrid);
+      LatticeFermion sr_o   (sFrbGrid);
+
+      pickCheckerboard(Even,ssrc_e,ssrc);
+      pickCheckerboard(Odd,ssrc_o,ssrc);
+
+      setCheckerboard(sr_eo,ssrc_o);
+      setCheckerboard(sr_eo,ssrc_e);
+      serr = sr_eo-ssrc;
+      std::cout<<GridLogMessage << "EO src norm diff "<< norm2(serr)<<std::endl;
+
+      sr_e = zero;
+      sr_o = zero;
+
+      double t0=usecond();
+      for(int i=0;i<ncall;i++){
+        sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      }
+      double t1=usecond();
+
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=(1344.0*volume*ncall)/2;
+
+      std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
+      std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
+
+      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
+      sDw.Dhop  (ssrc  ,sresult,DaggerNo);
+
+      pickCheckerboard(Even,ssrc_e,sresult);
+      pickCheckerboard(Odd ,ssrc_o,sresult);
+      ssrc_e = ssrc_e - sr_e;
+      std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<<std::endl;
+      ssrc_o = ssrc_o - sr_o;
+      std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<<std::endl;
+    }
+
+
+  }
+
   if (1)
   { // Naive wilson dag implementation
@@ -217,5 +348,8 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
 
+
+  }
+
   Grid_finalize();
 }
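The Mflop/s figures printed by the benchmark above come from a fixed nominal flop count, not from hardware counters. A minimal sketch of that bookkeeping, using only the 1344 flops-per-site constant and the microsecond timestamps the benchmark itself uses (the helper name is illustrative, not part of Grid):

  #include <vector>

  // Hedged sketch: reproduces the benchmark's figure of merit.
  double dhop_mflops(const std::vector<int> &latt4, int Ls, int ncall,
                     double t0_us, double t1_us) {
    double volume = Ls;                                  // fifth dimension
    for (int mu = 0; mu < 4; mu++) volume *= latt4[mu];  // 4d volume
    double flops = 1344.0 * volume * ncall;              // total over the timed loop
    return flops / (t1_us - t0_us);                      // Mflop/s, time in microseconds
  }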
154	benchmarks/Benchmark_dwf_ntpf.cc (new file)

@@ -0,0 +1,154 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./benchmarks/Benchmark_dwf.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template<class d>
struct scal {
  d internal;
};

Gamma::GammaMatrix Gmu [] = {
  Gamma::GammaX,
  Gamma::GammaY,
  Gamma::GammaZ,
  Gamma::GammaT
};

bool overlapComms = false;


int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
    overlapComms = true;
  }

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);

  LatticeFermion src (FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=zero;
  LatticeFermion ref(FGrid); ref=zero;
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);

  ColourMatrix cm = Complex(1.0,0.0);

  LatticeGaugeField Umu(UGrid);
  random(RNG4,Umu);

  LatticeGaugeField Umu5d(FGrid);

  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }

  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
  std::vector<LatticeColourMatrix> U(4,FGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
  }

  if (1)
  {
    ref = zero;
    for(int mu=0;mu<Nd;mu++){

      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }

  RealD mass=0.1;
  RealD M5  =1.8;

  typename DomainWallFermionR::ImplParams params;
  params.overlapCommsCompute = overlapComms;

  RealD NP = UGrid->_Nprocessors;


  QCD::WilsonKernelsStatic::AsmOpt=1;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);

  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall =50;
  if (1) {

    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Dhop(src,result,0);
    }
    double t1=usecond();

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result;
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
    //  Dw.Report();
  }
  Grid_finalize();
}
369	benchmarks/Benchmark_dwf_sweep.cc (new file)

@@ -0,0 +1,369 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./benchmarks/Benchmark_dwf.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#include <PerfCount.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template<class d>
struct scal {
  d internal;
};

Gamma::GammaMatrix Gmu [] = {
  Gamma::GammaX,
  Gamma::GammaY,
  Gamma::GammaZ,
  Gamma::GammaT
};

void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=16;
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  if ( getenv("ASMOPT") ) {
    QCD::WilsonKernelsStatic::AsmOpt=1;
  } else {
    QCD::WilsonKernelsStatic::AsmOpt=0;
  }

  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;

  int Lmax=32;
  int dmin=0;
  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
  for (int L=8;L<=Lmax;L*=2){
    std::vector<int> latt4(4,L);
    for(int d=4;d>dmin;d--){
      if ( d<=3 ) latt4[d]*=2;
      std::cout << GridLogMessage <<"\t";
      for(int d=0;d<Nd;d++){
        std::cout<<latt4[d]<<"x";
      }
      std::cout <<Ls<<"\t" ;
      benchDw (latt4,Ls,threads,0);
      benchsDw(latt4,Ls,threads,0);
      std::cout<<std::endl;
    }
  }
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  {
    std::vector<int> latt4(4,16);
    std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
    benchDw (latt4,Ls,threads,1);
    std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
    benchsDw(latt4,Ls,threads,1);
  }

  Grid_finalize();
}

#undef CHECK

void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
{
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

#ifdef CHECK
  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
  LatticeFermion src (FGrid); random(RNG5,src);
  LatticeGaugeField Umu(UGrid);
  random(RNG4,Umu);
#else
  LatticeFermion src (FGrid); src=zero;
  LatticeGaugeField Umu(UGrid); Umu=zero;
#endif

  LatticeFermion result(FGrid); result=zero;
  LatticeFermion ref(FGrid); ref=zero;
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);

  ColourMatrix cm = Complex(1.0,0.0);


  LatticeGaugeField Umu5d(FGrid);

  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }

  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
  std::vector<LatticeColourMatrix> U(4,FGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
  }

#ifdef CHECK
  if (1)
  {
    ref = zero;
    for(int mu=0;mu<Nd;mu++){

      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }
#endif

  RealD mass=0.1;
  RealD M5  =1.8;
  RealD NP = UGrid->_Nprocessors;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

  double t0=usecond();
  Dw.Dhop(src,result,0);
  double t1=usecond();

#ifdef TIMERS_OFF
  int ncall =10;
#else
  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif

  if (ncall < 5 ) exit(0);

  Dw.Dhop(src,result,0);

  PerformanceCounter Counter(8);
  Counter.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.Dhop(src,result,0);
  }
  t1=usecond();
  Counter.Stop();
  if ( report ) {
    Counter.Report();
  }

  if ( ! report )
  {
    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
    std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
  }

#ifdef CHECK
  err = ref-result;
  RealD errd = norm2(err);
  if ( errd> 1.0e-4 ) {
    std::cout<<GridLogMessage << "oops !!! norm diff   "<< norm2(err)<<std::endl;
    exit(-1);
  }
#endif

  LatticeFermion src_e (FrbGrid);
  LatticeFermion src_o (FrbGrid);
  LatticeFermion r_e   (FrbGrid);
  LatticeFermion r_o   (FrbGrid);
  LatticeFermion r_eo  (FGrid);

  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);

  {
    Dw.DhopEO(src_o,r_e,DaggerNo);
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.DhopEO(src_o,r_e,DaggerNo);
    }
    double t1=usecond();

    if(!report){
      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
      double flops=(1344.0*volume*ncall)/2;
      std::cout<< flops/(t1-t0);
    }
  }

}

#undef CHECK_SDW
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
{

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
  GridCartesian         * sUGrid  = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
  GridCartesian         * sFGrid  = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

#ifdef CHECK_SDW
  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
  LatticeFermion src (FGrid); random(RNG5,src);
  LatticeGaugeField Umu(UGrid);
  random(RNG4,Umu);
#else
  LatticeFermion src (FGrid); src=zero;
  LatticeGaugeField Umu(UGrid); Umu=zero;
#endif

  LatticeFermion result(FGrid); result=zero;
  LatticeFermion ref(FGrid); ref=zero;
  LatticeFermion tmp(FGrid);
  LatticeFermion err(FGrid);

  ColourMatrix cm = Complex(1.0,0.0);

  LatticeGaugeField Umu5d(FGrid);

  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }


  RealD mass=0.1;
  RealD M5  =1.8;

  typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
  LatticeFermion ssrc(sFGrid);
  LatticeFermion sref(sFGrid);
  LatticeFermion sresult(sFGrid);
  WilsonFermion5DR sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5);

  for(int x=0;x<latt4[0];x++){
  for(int y=0;y<latt4[1];y++){
  for(int z=0;z<latt4[2];z++){
  for(int t=0;t<latt4[3];t++){
  for(int s=0;s<Ls;s++){
    std::vector<int> site({s,x,y,z,t});
    SpinColourVector tmp;
    peekSite(tmp,src,site);
    pokeSite(tmp,ssrc,site);
  }}}}}

  double t0=usecond();
  sDw.Dhop(ssrc,sresult,0);
  double t1=usecond();

#ifdef TIMERS_OFF
  int ncall =10;
#else
  int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif

  PerformanceCounter Counter(8);
  Counter.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    sDw.Dhop(ssrc,sresult,0);
  }
  t1=usecond();
  Counter.Stop();

  if ( report ) {
    Counter.Report();
  } else {

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;
    std::cout<<"\t"<< flops/(t1-t0);
  }


  LatticeFermion sr_eo(sFGrid);
  LatticeFermion serr(sFGrid);

  LatticeFermion ssrc_e (sFrbGrid);
  LatticeFermion ssrc_o (sFrbGrid);
  LatticeFermion sr_e   (sFrbGrid);
  LatticeFermion sr_o   (sFrbGrid);

  pickCheckerboard(Even,ssrc_e,ssrc);
  pickCheckerboard(Odd,ssrc_o,ssrc);

  setCheckerboard(sr_eo,ssrc_o);
  setCheckerboard(sr_eo,ssrc_e);

  sr_e = zero;
  sr_o = zero;

  sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
  PerformanceCounter CounterSdw(8);
  CounterSdw.Start();
  t0=usecond();
  for(int i=0;i<ncall;i++){
    __SSC_START;
    sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
    __SSC_STOP;
  }
  t1=usecond();
  CounterSdw.Stop();

  if ( report ) {
    CounterSdw.Report();
  } else {

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=(1344.0*volume*ncall)/2;
    std::cout<<"\t"<< flops/(t1-t0);
  }
}
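Benchmark_dwf_sweep sizes its timing loops at run time: it times a single Dhop call, then picks ncall so the measured region lasts a few seconds. A minimal sketch of that calibration, assuming TIMERS_OFF is the macro defined by the new --enable-timers=no configure option (the helper name is illustrative, not part of Grid):

  // Hedged sketch of the ncall calibration used by benchDw/benchsDw above.
  inline int calibrate_ncall(double t0_us, double t1_us) {
  #ifdef TIMERS_OFF
    return 10;                      // timers compiled out: keep the run short and fixed
  #else
    // aim for roughly five seconds of measured work
    return 1 + (int)((5.0 * 1000 * 1000) / (t1_us - t0_us));
  #endif
  }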
@@ -119,7 +119,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
   mfc = flops*ncall/(t1-t0);
   std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
 
-  QCD::WilsonFermion5DStatic::AsmOptDslash=1;
+  QCD::WilsonKernelsStatic::AsmOpt=1;
   t0=usecond();
   for(int i=0;i<ncall;i++){
     Dw.DhopOE(srce,resulta,0);
@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_dwf_sweep Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 
 
 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -10,6 +10,14 @@ Benchmark_dwf_SOURCES=Benchmark_dwf.cc
 Benchmark_dwf_LDADD=-lGrid
 
 
+Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
+Benchmark_dwf_ntpf_LDADD=-lGrid
+
+
+Benchmark_dwf_sweep_SOURCES=Benchmark_dwf_sweep.cc
+Benchmark_dwf_sweep_LDADD=-lGrid
+
+
 Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 Benchmark_memory_asynch_LDADD=-lGrid
 
28	configure.ac

@@ -55,6 +55,15 @@ echo :::::::::::::::::::::::::::::::::::::::::::
 
 AC_CHECK_FUNCS([gettimeofday])
 
+#AC_CHECK_LIB([gmp],[__gmpf_init],,
+#    [AC_MSG_ERROR(GNU Multiple Precision GMP library was not found in your system.
+#Please install or provide the correct path to your installation
+#Info at: http://www.gmplib.org)])
+
+#AC_CHECK_LIB([mpfr],[mpfr_init],,
+#    [AC_MSG_ERROR(GNU Multiple Precision MPFR library was not found in your system.
+#Please install or provide the correct path to your installation
+#Info at: http://www.mpfr.org/)])
 
 #
 # SIMD instructions selection
@@ -199,6 +208,25 @@ case ${ac_RNG} in
      AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
      ;;
 esac
 
+#
+# SDE timing mode
+#
+AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
+	[Enable system dependent high res timers])],\
+	[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
+case ${ac_TIMERS} in
+     yes)
+       AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
+     ;;
+     no)
+       AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
+     ;;
+     *)
+       AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
+     ;;
+esac
+
 #
 # Chroma regression tests
 #
0	lib/.dirstamp (new empty file)
@@ -211,8 +211,7 @@ void Grid_init(int *argc,char ***argv)
     Grid_quiesce_nodes();
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
-    QCD::WilsonFermionStatic::HandOptDslash=1;
-    QCD::WilsonFermion5DStatic::HandOptDslash=1;
+    QCD::WilsonKernelsStatic::HandOpt=1;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
@@ -276,11 +275,6 @@ void Grid_finalize(void)
   Grid_unquiesce_nodes();
 #endif
 }
-double usecond(void) {
-  struct timeval tv;
-  gettimeofday(&tv,NULL);
-  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
-}
 
 void * Grid_backtrace_buffer[_NBACKTRACE];
 
File diff suppressed because one or more lines are too long
@@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 
 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
+#define RawConfig(A,B) (A<<8|B)
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 #ifdef __linux__
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES,            "CPUCYCLES.........." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS,          "INSTRUCTIONS......." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES,      "CACHE_REFERENCES..." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES,          "CACHE_MISSES......." },
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS),         "L1D_READ_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS),       "L1D_READ_ACCESS...."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS),        "L1D_WRITE_MISS....."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS),      "L1D_WRITE_ACCESS..."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS),     "L1D_PREFETCH_MISS.."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS),   "L1D_PREFETCH_ACCESS"},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS),          "LL_READ_MISS......."},
-  //  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS),    "LL_READ_ACCESS....."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS),         "LL_WRITE_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS),       "LL_WRITE_ACCESS...."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS),      "LL_PREFETCH_MISS..."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS),    "LL_PREFETCH_ACCESS."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS),         "L1I_READ_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS),       "L1I_READ_ACCESS...."}
-#endif
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES,      "CACHE_REFERENCES...", INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES,          "CACHE_MISSES.......", CACHE_REFERENCES},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES,            "CPUCYCLES..........", INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS,          "INSTRUCTIONS.......", CPUCYCLES },
+  //  4
+#ifdef AVX512
+  { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
+  { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
+  //  11
+#else
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS),       "L1D_READ_ACCESS....", INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS),         "L1D_READ_MISS......", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS),        "L1D_WRITE_MISS.....", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS),      "L1D_WRITE_ACCESS...", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS),     "L1D_PREFETCH_MISS..", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS),   "L1D_PREFETCH_ACCESS", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS),   "L1D_PREFETCH_ACCESS", L1D_READ_ACCESS},
+  //  11
+#endif
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS),          "LL_READ_MISS.......", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS),        "LL_READ_ACCESS.....", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS),         "LL_WRITE_MISS......", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS),       "LL_WRITE_ACCESS....", L1D_READ_ACCESS},
+  //15
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS),      "LL_PREFETCH_MISS...", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS),    "LL_PREFETCH_ACCESS.", L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS),         "L1I_READ_MISS......", INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS),       "L1I_READ_ACCESS....", INSTRUCTIONS}
+  //19
   //  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
+#endif
 };
 }
@@ -58,6 +58,27 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
 }
 #endif
 
+#ifdef TIMERS_OFF
+
+
+inline uint64_t cyclecount(void){
+  return 0;
+}
+#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
+#define __SSC_STOP  __SSC_MARK(0x110)
+#define __SSC_START __SSC_MARK(0x111)
+
+
+#else
+
+#define __SSC_MARK(mark)
+#define __SSC_STOP
+#define __SSC_START
+
+/*
+ * cycle counters arch dependent
+ */
+
 #ifdef __bgq__
 inline uint64_t cyclecount(void){
   uint64_t tmp;
@@ -65,18 +86,20 @@ inline uint64_t cyclecount(void){
   return tmp;
 }
 #elif defined __x86_64__
-#include <immintrin.h>
-#ifndef __INTEL_COMPILER
 #include <x86intrin.h>
-#endif
 inline uint64_t cyclecount(void){
   return __rdtsc();
+  //  unsigned int dummy;
+  //  return __rdtscp(&dummy);
 }
 #else
-#warning No cycle counter implemented for this architecture
 inline uint64_t cyclecount(void){
   return 0;
 }
+
+#endif
+
 #endif
 
 class PerformanceCounter {
@@ -87,6 +110,7 @@ private:
     uint32_t type;
     uint64_t config;
     const char *name;
+    int normalisation;
   } PerformanceCounterConfig;
 
   static const PerformanceCounterConfig PerformanceCounterConfigs [];
@@ -94,26 +118,12 @@ private:
 public:
 
   enum PerformanceCounterType {
-    CPUCYCLES=0,
-    INSTRUCTIONS,
-    //    STALL_CYCLES,
-    CACHE_REFERENCES,
-    CACHE_MISSES,
-    L1D_READ_MISS,
-    L1D_READ_ACCESS,
-    L1D_WRITE_MISS,
-    L1D_WRITE_ACCESS,
-    L1D_PREFETCH_MISS,
-    L1D_PREFETCH_ACCESS,
-    LL_READ_MISS,
-    //    LL_READ_ACCESS,
-    LL_WRITE_MISS,
-    LL_WRITE_ACCESS,
-    LL_PREFETCH_MISS,
-    LL_PREFETCH_ACCESS,
-    L1I_READ_MISS,
-    L1I_READ_ACCESS,
-    PERFORMANCE_COUNTER_NUM_TYPES
+    CACHE_REFERENCES=0,
+    CACHE_MISSES=1,
+    CPUCYCLES=2,
+    INSTRUCTIONS=3,
+    L1D_READ_ACCESS=4,
+    PERFORMANCE_COUNTER_NUM_TYPES=19
   };
 
 public:
@@ -121,7 +131,9 @@ public:
   int PCT;
 
   long long count;
+  long long cycles;
   int fd;
+  int cyclefd;
   unsigned long long elapsed;
   uint64_t begin;
 
@@ -134,7 +146,9 @@ public:
     assert(_pct>=0);
     assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
     fd=-1;
+    cyclefd=-1;
     count=0;
+    cycles=0;
     PCT =_pct;
     Open();
 #endif
@@ -159,6 +173,15 @@ public:
       fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
       perror("Error is");
     }
+    int norm = PerformanceCounterConfigs[PCT].normalisation;
+    pe.type  = PerformanceCounterConfigs[norm].type;
+    pe.config= PerformanceCounterConfigs[norm].config;
+    name     = PerformanceCounterConfigs[norm].name;
+    cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
+    if (cyclefd == -1) {
+      fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
+      perror("Error is");
+    }
 #endif
   }
 
@@ -168,6 +191,8 @@ public:
     if ( fd!= -1) {
       ::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
       ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+      ::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
+      ::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
     }
     begin  =cyclecount();
 #else
@@ -177,10 +202,13 @@ public:
 
   void Stop(void) {
     count=0;
+    cycles=0;
 #ifdef __linux__
     if ( fd!= -1) {
       ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+      ::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
       ::read(fd, &count, sizeof(long long));
+      ::read(cyclefd, &cycles, sizeof(long long));
     }
     elapsed = cyclecount() - begin;
 #else
@@ -190,7 +218,11 @@ public:
   }
   void Report(void) {
 #ifdef __linux__
-    std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
+    int N = PerformanceCounterConfigs[PCT].normalisation;
+    const char * sn = PerformanceCounterConfigs[N].name ;
+    const char * sc = PerformanceCounterConfigs[PCT].name;
+    std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
+                sc, count, sc,sn, (double)count/(double)cycles);
 #else
     std::printf("%llu cycles \n", elapsed );
 #endif
@@ -199,7 +231,7 @@ public:
   ~PerformanceCounter()
   {
 #ifdef __linux__
-    ::close(fd);
+    ::close(fd);  ::close(cyclefd);
 #endif
   }
 
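Each entry in the counter table now carries a normalisation partner event, read through a second perf_event file descriptor, and Report() prints the raw count, the partner count and their ratio. A hedged usage sketch, with kernel() standing in for any region of interest such as Dw.Dhop(src,result,0):

  // Hedged sketch: cycle through every configured counter pair for the same kernel.
  template<class Kernel>
  void profile_all_counters(Kernel &&kernel) {
    for (int i = 0; i < PerformanceCounter::NumTypes(); i++) {
      kernel();                       // warm caches and TLBs before counting
      PerformanceCounter Counter(i);  // opens the event and its normalisation partner
      Counter.Start();
      kernel();
      Counter.Stop();
      Counter.Report();               // e.g. misses per access, instructions per cycle
    }
  }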
1704	lib/Stencil.h
(File diff suppressed because it is too large.)
14	lib/Timer.h

@@ -39,7 +39,13 @@ namespace Grid {
   // Dress the output; use std::chrono
 
   // C++11 time facilities better?
-  double usecond(void);
+  inline double usecond(void) {
+    struct timeval tv;
+#ifdef TIMERS_ON
+    gettimeofday(&tv,NULL);
+#endif
+    return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
+  }
 
   typedef std::chrono::system_clock          GridClock;
   typedef std::chrono::time_point<GridClock> GridTimePoint;
@@ -63,17 +69,23 @@ public:
   }
   void Start(void) {
     assert(running == false);
+#ifdef TIMERS_ON
     start = GridClock::now();
+#endif
     running = true;
   }
   void Stop(void) {
     assert(running == true);
+#ifdef TIMERS_ON
     accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
+#endif
     running = false;
   };
   void Reset(void){
     running = false;
+#ifdef TIMERS_ON
     start = GridClock::now();
+#endif
     accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
   }
   GridTime Elapsed(void) {
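With usecond() now inlined and guarded by TIMERS_ON, a build configured with --enable-timers=no skips the gettimeofday() call entirely, so a timed difference is meaningless (tv is left uninitialised) but costs nothing. A hedged sketch of the usual caller-side pattern, which is unchanged (the helper is illustrative, not part of Grid):

  // Hedged sketch: time an arbitrary region with the inlined usecond().
  template<class Body>
  double time_region_us(Body &&body) {
    double t0 = usecond();   // real timestamp only when TIMERS_ON is defined
    body();
    double t1 = usecond();
    return t1 - t0;          // meaningful only in an --enable-timers=yes build
  }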
0	lib/algorithms/approx/.dirstamp (new empty file)
@@ -170,9 +170,15 @@ public:
         // Use a reduced simd grid
         _simd_layout[d] = simd_layout[d];
         _rdimensions[d]= _ldimensions[d]/_simd_layout[d];
+        assert(_rdimensions[d]>0);
 
         // all elements of a simd vector must have same checkerboard.
-        if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0);
+        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
+        if ( _simd_layout[d]>1 ) {
+          if ( d != _checker_dim ) {
+            assert( (_rdimensions[d]&0x1) == 0 );
+          }
+        }
 
         _osites *= _rdimensions[d];
         _isites *= _simd_layout[d];
0	lib/communicator/.dirstamp (new empty file)

@@ -53,7 +53,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
   _Nprocessors=1;
   _processors = processors;
   _processor_coor.resize(_ndimension);
-  std::cout << processors << std::endl;
 
   MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
   MPI_Comm_rank(communicator,&_processor);
0	lib/qcd/action/fermion/.dirstamp (new empty file)

@@ -63,7 +63,7 @@ namespace Grid {
       Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
       assert(zdata->n==this->Ls);
 
-      std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
+      // std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
       // Call base setter
       this->SetCoefficientsTanh(zdata,1.0,0.0);
 
@ -53,6 +53,8 @@ namespace QCD {
 StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even
 StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd
 mass(_mass),
+Lebesgue(_grid),
+LebesgueEvenOdd(_cbgrid),
 Umu(&Fgrid),
 UmuEven(&Hgrid),
 UmuOdd (&Hgrid)
@ -228,7 +230,7 @@ PARALLEL_FOR_LOOP

 out.checkerboard = in.checkerboard;

-DhopInternal(Stencil,Umu,in,out,dag);
+DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
 }

 template<class Impl>
@ -239,7 +241,7 @@ PARALLEL_FOR_LOOP
 assert(in.checkerboard==Even);
 out.checkerboard = Odd;

-DhopInternal(StencilEven,UmuOdd,in,out,dag);
+DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
 }

 template<class Impl>
@ -250,7 +252,7 @@ PARALLEL_FOR_LOOP
 assert(in.checkerboard==Odd);
 out.checkerboard = Even;

-DhopInternal(StencilOdd,UmuEven,in,out,dag);
+DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
 }

 template<class Impl>
@ -285,43 +287,23 @@ PARALLEL_FOR_LOOP
 };

 template<class Impl>
-void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,LebesgueOrder& lo,DoubledGaugeField & U,
 const FermionField &in, FermionField &out,int dag)
 {
-DhopInternalCommsThenCompute(st,U,in,out,dag);
-}
-template<class Impl>
-void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
-const FermionField &in, FermionField &out,int dag) {
-
 assert((dag==DaggerNo) ||(dag==DaggerYes));

 Compressor compressor(dag);
 st.HaloExchange(in,compressor);

 if ( dag == DaggerYes ) {
-if( HandOptDslash ) {
 PARALLEL_FOR_LOOP
 for(int sss=0;sss<in._grid->oSites();sss++){
-Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
+Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
 }
-} else {
-PARALLEL_FOR_LOOP
-for(int sss=0;sss<in._grid->oSites();sss++){
-Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
-}
-}
 } else {
-if( HandOptDslash ) {
 PARALLEL_FOR_LOOP
 for(int sss=0;sss<in._grid->oSites();sss++){
-Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
+Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sss,sss,1,1,in,out);
 }
-} else {
-PARALLEL_FOR_LOOP
-for(int sss=0;sss<in._grid->oSites();sss++){
-Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
-}
-}
 }
 }
 };
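The hunks above thread a LebesgueOrder argument from WilsonFermion::DhopInternal down into the per-site kernel calls. As a hedged illustration of the underlying idea only (a reordering table that decides in which order the outer sites are visited before a per-site stencil is applied), here is a small self-contained sketch; SiteOrder and applyStencil are invented names for this example and are not the Grid API.

// Illustrative sketch: a site-reordering table driving a per-site kernel,
// in the spirit of the LebesgueOrder argument passed through DhopInternal.
#include <cstdio>
#include <numeric>
#include <vector>

struct SiteOrder {
  std::vector<int> map;                       // visit order over outer sites
  explicit SiteOrder(int sites) : map(sites) {
    std::iota(map.begin(), map.end(), 0);     // identity order as a stand-in
  }
  int Reorder(int ss) const { return map[ss]; }
};

static void applyStencil(int site, std::vector<double> &out,
                         const std::vector<double> &in) {
  out[site] = 2.0 * in[site];                 // placeholder for the Dirac stencil
}

int main() {
  const int osites = 8;
  SiteOrder lo(osites);
  std::vector<double> in(osites, 1.0), out(osites, 0.0);
  for (int ss = 0; ss < osites; ss++) {
    applyStencil(lo.Reorder(ss), out, in);    // kernel sees the reordered site
  }
  std::printf("out[0]=%g\n", out[0]);
  return 0;
}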
@ -111,12 +111,9 @@ namespace Grid {
 const FermionField &B,
 int dag);

-void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
+void DhopInternal(StencilImpl & st,LebesgueOrder & lo,DoubledGaugeField & U,
 const FermionField &in, FermionField &out,int dag) ;

-void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
-const FermionField &in, FermionField &out,int dag) ;
-
 // Constructor
 WilsonFermion(GaugeField &_Umu,
 GridCartesian &Fgrid,
@ -149,6 +146,10 @@ namespace Grid {
 DoubledGaugeField Umu;
 DoubledGaugeField UmuEven;
 DoubledGaugeField UmuOdd;

+LebesgueOrder Lebesgue;
+LebesgueOrder LebesgueEvenOdd;
+
+
 };

@ -1,5 +1,4 @@
-
 /*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid

@ -39,8 +38,6 @@ namespace QCD {
 // S-direction is INNERMOST and takes no part in the parity.
 const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
 const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
-int WilsonFermion5DStatic::HandOptDslash;
-int WilsonFermion5DStatic::AsmOptDslash;

 // 5d lattice for DWF.
 template<class Impl>
@ -98,34 +95,27 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,

 // Allocate the required comms buffer
 ImportGauge(_Umu);
-alltime=0;
-commtime=0;
-jointime=0;
-dslashtime=0;
-dslash1time=0;
 }

 template<class Impl>
-WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
+WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
 GridCartesian &FiveDimGrid,
 GridRedBlackCartesian &FiveDimRedBlackGrid,
 GridCartesian &FourDimGrid,
-GridRedBlackCartesian &FourDimRedBlackGrid,
 RealD _M5,const ImplParams &p) :
 Kernels(p),
 _FiveDimGrid (&FiveDimGrid),
 _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
 _FourDimGrid (&FourDimGrid),
-_FourDimRedBlackGrid(&FourDimRedBlackGrid),
 Stencil (_FiveDimGrid,npoint,Even,directions,displacements),
 StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
 StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
 M5(_M5),
 Umu(_FourDimGrid),
-UmuEven(_FourDimRedBlackGrid),
-UmuOdd (_FourDimRedBlackGrid),
+UmuEven(_FourDimGrid),
+UmuOdd (_FourDimGrid),
 Lebesgue(_FourDimGrid),
-LebesgueEvenOdd(_FourDimRedBlackGrid)
+LebesgueEvenOdd(_FourDimGrid)
 {
 int nsimd = Simd::Nsimd();

@ -134,7 +124,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
 assert(FiveDimRedBlackGrid._ndimension==5);
 assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
 assert(FourDimGrid._ndimension==4);
-assert(FourDimRedBlackGrid._ndimension==4);

 // Dimension zero of the five-d is the Ls direction
 Ls=FiveDimGrid._fdimensions[0];
@ -147,15 +136,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,

 // Other dimensions must match the decomposition of the four-D fields
 for(int d=0;d<4;d++){
-assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
 assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);

-assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
 assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);

 assert(FourDimGrid._simd_layout[d]=1);
-assert(FourDimRedBlackGrid._simd_layout[d] ==1);
-assert(FourDimRedBlackGrid._simd_layout[d] ==1);
 assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);

 assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
@ -163,8 +147,13 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
 assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
 }

-// Allocate the required comms buffer
-ImportGauge(_Umu);
+{
+GaugeField HUmu(_Umu._grid);
+HUmu = _Umu*(-0.5);
+Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
+UmuEven=Umu;// Really want a reference.
+UmuOdd =Umu;
+}
 }

@ -297,30 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
 }


-template<class Impl>
-void WilsonFermion5D<Impl>::Report(void)
-{
-std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
-std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
-std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
-std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
-std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
-std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "********************"<<std::endl;
-std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "********************"<<std::endl;
-std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
-std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
-std::cout<<GridLogMessage << "********************"<<std::endl;
-}
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 const FermionField &A,
@ -342,90 +307,30 @@ template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
 DoubledGaugeField & U,
 const FermionField &in, FermionField &out,int dag)
-{
-DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
-}
-
-template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
-DoubledGaugeField & U,
-const FermionField &in, FermionField &out,int dag)
 {
 // assert((dag==DaggerNo) ||(dag==DaggerYes));
-alltime-=usecond();
 Compressor compressor(dag);

-// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
 int LLs = in._grid->_rdimensions[0];

-commtime -=usecond();
-// auto handle = st.HaloExchangeBegin(in,compressor);
-// st.HaloExchangeComplete(handle);
 st.HaloExchange(in,compressor);
-commtime +=usecond();

-jointime -=usecond();
-jointime +=usecond();

 // Dhop takes the 4d grid from U, and makes a 5d index for fermion
-// Not loop ordering and data layout.
-// Designed to create
-// - per thread reuse in L1 cache for U
-// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
-dslashtime -=usecond();
 if ( dag == DaggerYes ) {
-if( this->HandOptDslash ) {
 PARALLEL_FOR_LOOP
 for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-} else {
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-}
+int sU=ss;
+int sF=LLs*sU;
+Kernels::DiracOptDhopSiteDag(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
 }
 } else {
-if( this->AsmOptDslash ) {
 PARALLEL_FOR_LOOP
 for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-} else if( this->HandOptDslash ) {
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-} else {
-PARALLEL_FOR_LOOP
-for(int ss=0;ss<U._grid->oSites();ss++){
-for(int s=0;s<LLs;s++){
-int sU=ss;
-int sF = s+LLs*sU;
-Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
-}
-}
-}
+int sU=ss;
+int sF=LLs*sU;
+Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out);
 }
 }
-dslashtime +=usecond();
-alltime+=usecond();
 }


@ -473,7 +378,7 @@ FermOpTemplateInstantiate(WilsonFermion5D);
 GparityFermOpTemplateInstantiate(WilsonFermion5D);
 template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
 template class WilsonFermion5D<DomainWallRedBlack5dImplD>;

 }}

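In the 5d constructor hunk above, the call to ImportGauge is replaced by an inline block that scales the incoming gauge field by -1/2 (HUmu = _Umu*(-0.5)) and double-stores it into the operator's own Umu. A minimal sketch of that scale-and-store idea, with plain arrays standing in for Grid's GaugeField/DoubledGaugeField; the names below are illustrative only.

// Illustrative only: fold the -1/2 Wilson factor into an operator-owned,
// "doubled" copy of the gauge links (forward and backward slots per site).
#include <cstdio>
#include <vector>

int main() {
  const int sites = 4;
  std::vector<double> Umu_in(sites, 1.0);   // incoming links
  std::vector<double> Udoubled(2 * sites);  // forward + backward slot per site

  for (int s = 0; s < sites; s++) {
    double h = -0.5 * Umu_in[s];            // HUmu = _Umu * (-0.5)
    Udoubled[2 * s + 0] = h;                // forward direction
    Udoubled[2 * s + 1] = h;                // backward direction (sketch only)
  }
  std::printf("Udoubled[0]=%g\n", Udoubled[0]);
  return 0;
}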
@ -49,8 +49,6 @@ namespace Grid {
 class WilsonFermion5DStatic {
 public:
 // S-direction is INNERMOST and takes no part in the parity.
-static int AsmOptDslash; // these are a temporary hack
-static int HandOptDslash; // these are a temporary hack
 static const std::vector<int> directions;
 static const std::vector<int> displacements;
 const int npoint = 8;
@ -62,11 +60,7 @@ namespace Grid {
 public:
 INHERIT_IMPL_TYPES(Impl);
 typedef WilsonKernels<Impl> Kernels;
-double alltime;
-double jointime;
-double commtime;
-double dslashtime;
-double dslash1time;
 ///////////////////////////////////////////////////////////////
 // Implement the abstract base
 ///////////////////////////////////////////////////////////////
@ -122,13 +116,6 @@ namespace Grid {
 FermionField &out,
 int dag);

-void DhopInternalCommsThenCompute(StencilImpl & st,
-LebesgueOrder &lo,
-DoubledGaugeField &U,
-const FermionField &in,
-FermionField &out,
-int dag);
-
 // Constructors
 WilsonFermion5D(GaugeField &_Umu,
 GridCartesian &FiveDimGrid,
@ -143,13 +130,11 @@ namespace Grid {
 GridCartesian &FiveDimGrid,
 GridRedBlackCartesian &FiveDimRedBlackGrid,
 GridCartesian &FourDimGrid,
-GridRedBlackCartesian &FourDimRedBlackGrid,
 double _M5,const ImplParams &p= ImplParams());

 // DoubleStore
 void ImportGauge(const GaugeField &_Umu);

-void Report(void);
 ///////////////////////////////////////////////////////////////
 // Data members require to support the functionality
 ///////////////////////////////////////////////////////////////
@ -31,12 +31,63 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {
 namespace QCD {

+int WilsonKernelsStatic::HandOpt;
+int WilsonKernelsStatic::AsmOpt;

 template<class Impl>
 WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};

-// Need controls to do interior, exterior, or both
 template<class Impl>
-void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
+{
+#ifdef AVX512
+if ( AsmOpt ) {
+WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
+} else {
+#else
+{
+#endif
+for(int site=0;site<Ns;site++) {
+for(int s=0;s<Ls;s++) {
+if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
+else WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
+sF++;
+}
+sU++;
+}
+}
+}
+
+template<class Impl>
+void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
+{
+// No asm implementation yet.
+// if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+// else
+for(int site=0;site<Ns;site++) {
+for(int s=0;s<Ls;s++) {
+if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+else WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+sF++;
+}
+sU++;
+}
+}
+
+////////////////////////////////////////////
+// Generic implementation; move to different file?
+////////////////////////////////////////////
+
+template<class Impl>
+void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
@ -214,9 +265,9 @@ void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField

 // Need controls to do interior, exterior, or both
 template<class Impl>
-void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
 SiteHalfSpinor tmp;
 SiteHalfSpinor chi;
@ -518,17 +569,9 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 vstream(out._odata[sF],result);
 }

-#if ( ! defined(AVX512) )
-template<class Impl>
-void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-}
-#endif

 FermOpTemplateInstantiate(WilsonKernels);

 template class WilsonKernels<DomainWallRedBlack5dImplF>;
 template class WilsonKernels<DomainWallRedBlack5dImplD>;

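The new DiracOptDhopSite/DiracOptDhopSiteDag entry points shown above select an assembler, hand-unrolled, or generic per-site kernel at run time via the static AsmOpt/HandOpt flags, looping over Ns outer sites and Ls fifth-dimension slices. Below is a self-contained sketch of that dispatch pattern only; the flag and kernel names are stand-ins and not the Grid API.

// Sketch of the run-time kernel dispatch pattern; not Grid code.
#include <cstdio>

struct Kernels {
  static int AsmOpt;   // prefer an assembler kernel when available
  static int HandOpt;  // otherwise a hand-unrolled kernel, else generic

  static void AsmBlock(int sF, int sU, int Ls, int Ns) {
    std::printf("asm block: sF=%d sU=%d Ls=%d Ns=%d\n", sF, sU, Ls, Ns);
  }
  static void HandSite(int sF, int sU)    { std::printf("hand sF=%d sU=%d\n", sF, sU); }
  static void GenericSite(int sF, int sU) { std::printf("gen  sF=%d sU=%d\n", sF, sU); }

  // Entry point: Ns outer (4d) sites, each with Ls fifth-dimension slices.
  static void DhopSite(int sF, int sU, int Ls, int Ns) {
    if (AsmOpt) { AsmBlock(sF, sU, Ls, Ns); return; }  // whole block at once
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
        if (HandOpt) HandSite(sF, sU);
        else         GenericSite(sF, sU);
        sF++;
      }
      sU++;
    }
  }
};
int Kernels::AsmOpt  = 0;
int Kernels::HandOpt = 1;

int main() { Kernels::DhopSite(0, 0, 2, 2); return 0; }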
@ -38,37 +38,56 @@ namespace Grid {
 // Helper routines that implement Wilson stencil for a single site.
 // Common to both the WilsonFermion and WilsonFermion5D
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class WilsonKernelsStatic {
+public:
+// S-direction is INNERMOST and takes no part in the parity.
+static int AsmOpt; // these are a temporary hack
+static int HandOpt; // these are a temporary hack
+};
+
-template<class Impl> class WilsonKernels : public FermionOperator<Impl> {
+template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
 public:

 INHERIT_IMPL_TYPES(Impl);
 typedef FermionOperator<Impl> Base;

 public:
-void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void DiracOptDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out);
+int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);

-void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void DiracOptDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in,FermionField &out);
+int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);

 void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);

-void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
+private:
+// Specialised variants
+void DiracOptGenericDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+int sF,int sU, const FermionField &in, FermionField &out);
+
+void DiracOptGenericDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out);
+int sF,int sU,const FermionField &in,FermionField &out);

-int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
+std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
+int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
+
+void DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out);

-int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out);
+public:

 WilsonKernels(const ImplParams &p= ImplParams());

@ -2,6 +2,8 @@

 Grid physics library, www.github.com/paboyle/Grid

+
+
 Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc

 Copyright (C) 2015
||||||
@ -26,237 +28,93 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#include <Grid.h>
|
#include <Grid.h>
|
||||||
#if defined(AVX512)
|
|
||||||
//#if defined (IMCI)
|
|
||||||
|
|
||||||
#include <simd/Intel512wilson.h>
|
|
||||||
|
|
||||||
#include <simd/Intel512single.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// Default to no assembler implementation
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out)
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
uint64_t now;
|
assert(0);
|
||||||
uint64_t first ;
|
|
||||||
int offset,local,perm, ptype;
|
|
||||||
const SiteHalfSpinor *pbuf = & buf[0];
|
|
||||||
const SiteSpinor *plocal = & in._odata[0];
|
|
||||||
void *pf;
|
|
||||||
int osites = in._grid->oSites();
|
|
||||||
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
|
|
||||||
//#define STAMP(i) timers[i] = cyclecount() ;
|
|
||||||
#define STAMP(i) //timers[i] = cyclecount() ;
|
|
||||||
|
|
||||||
MASK_REGS;
|
|
||||||
|
|
||||||
first = cyclecount();
|
|
||||||
|
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
|
||||||
|
|
||||||
// Xm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Ym,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
XP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFXM(Xm,pf);
|
|
||||||
}
|
|
||||||
XP_RECON;
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Zm,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
YP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFYM(Ym,pf);
|
|
||||||
}
|
|
||||||
YP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Tm,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
ZP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFZM(Zm,pf);
|
|
||||||
}
|
|
||||||
ZP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
SE=st.GetEntry(ptype,Tp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
TP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFTM(Tm,pf);
|
|
||||||
}
|
|
||||||
TP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Zp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
TM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFTP(Tp,pf);
|
|
||||||
}
|
|
||||||
TM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Zp
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Yp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
ZM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFZP(Zp,pf);
|
|
||||||
}
|
|
||||||
ZM_RECON_ACCUM;
|
|
||||||
|
|
||||||
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Xp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
YM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFYP(Yp,pf);
|
|
||||||
}
|
|
||||||
YM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Xp
|
|
||||||
perm = SE->_permute;
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
XM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFXP(Xp,pf);
|
|
||||||
}
|
|
||||||
XM_RECON_ACCUM;
|
|
||||||
|
|
||||||
debug:
|
|
||||||
SAVE_RESULT(&out._odata[ss]);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template class WilsonKernels<WilsonImplF>;
|
#if defined(AVX512)
|
||||||
template class WilsonKernels<WilsonImplD>;
|
|
||||||
template class WilsonKernels<GparityWilsonImplF>;
|
|
||||||
template class WilsonKernels<GparityWilsonImplD>;
|
///////////////////////////////////////////////////////////
|
||||||
template class WilsonKernels<DomainWallRedBlack5dImplF>;
|
// If we are AVX512 specialise the single precision routine
|
||||||
template class WilsonKernels<DomainWallRedBlack5dImplD>;
|
///////////////////////////////////////////////////////////
|
||||||
}}
|
|
||||||
|
#include <simd/Intel512wilson.h>
|
||||||
|
#include <simd/Intel512single.h>
|
||||||
|
|
||||||
|
static Vector<vComplexF> signs;
|
||||||
|
|
||||||
|
int setupSigns(void ){
|
||||||
|
Vector<vComplexF> bother(2);
|
||||||
|
signs = bother;
|
||||||
|
vrsign(signs[0]);
|
||||||
|
visign(signs[1]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
static int signInit = setupSigns();
|
||||||
|
|
||||||
|
#define label(A) ilabel(A)
|
||||||
|
#define ilabel(A) ".globl\n" #A ":\n"
|
||||||
|
|
||||||
|
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||||
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||||
|
#define FX(A) WILSONASM_ ##A
|
||||||
|
template<>
|
||||||
|
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
|
#undef VMOVIDUP
|
||||||
|
#undef VMOVRDUP
|
||||||
|
#undef MAYBEPERM
|
||||||
|
#undef MULT_2SPIN
|
||||||
|
#undef FX
|
||||||
|
#define FX(A) DWFASM_ ## A
|
||||||
|
#define MAYBEPERM(A,B)
|
||||||
|
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
||||||
|
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||||
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||||
|
template<>
|
||||||
|
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
}}
|
||||||
|
|
||||||
|
lib/qcd/action/fermion/WilsonKernelsAsmBody.h (new file, 186 lines)
@ -0,0 +1,186 @@
|
|||||||
|
{
|
||||||
|
int local,perm, ptype;
|
||||||
|
uint64_t base;
|
||||||
|
uint64_t basep;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
int nmax=U._grid->oSites();
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
int sU =lo.Reorder(ssU);
|
||||||
|
int ssn=ssU+1;
|
||||||
|
if(ssn>=nmax) ssn=0;
|
||||||
|
int sUn=lo.Reorder(ssn);
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss =sU*Ls+s;
|
||||||
|
ssn=sUn*Ls+s;
|
||||||
|
////////////////////////////////
|
||||||
|
// Xp
|
||||||
|
////////////////////////////////
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
int nent=ssn*8;
|
||||||
|
|
||||||
|
PF_GAUGE(Xp);
|
||||||
|
base = st.GetInfo(ptype,local,perm,Xp,ent,plocal); ent++;
|
||||||
|
PREFETCH1_CHIMU(base);
|
||||||
|
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Yp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXP(Xp,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_RECON;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Yp
|
||||||
|
////////////////////////////////
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Zp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYP(Yp,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zp
|
||||||
|
////////////////////////////////
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Tp,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZP(Zp,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tp
|
||||||
|
////////////////////////////////
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Xm,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTP(Tp,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xm
|
||||||
|
////////////////////////////////
|
||||||
|
basep= (uint64_t) &out._odata[ss];
|
||||||
|
// basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Ym,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXM(Xm,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Ym
|
||||||
|
////////////////////////////////
|
||||||
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Zm,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYM(Ym,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zm
|
||||||
|
////////////////////////////////
|
||||||
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base = st.GetInfo(ptype,local,perm,Tm,ent,plocal); ent++;
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZM(Zm,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tm
|
||||||
|
////////////////////////////////
|
||||||
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
if ( local ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_PROJMEM(base);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,perm);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(base);
|
||||||
|
}
|
||||||
|
base= (uint64_t) &out._odata[ss];
|
||||||
|
PREFETCH_CHIMU(base);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTM(Tm,basep);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
basep= st.GetPFInfo(nent,plocal); nent++;
|
||||||
|
SAVE_RESULT(base,basep);
|
||||||
|
|
||||||
|
}
|
||||||
|
ssU++;
|
||||||
|
}
|
||||||
|
}
|
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.ab (new file, 161 lines)
@ -0,0 +1,161 @@
|
|||||||
|
{
|
||||||
|
int locala,perma, ptypea;
|
||||||
|
int localb,permb, ptypeb;
|
||||||
|
uint64_t basea, baseb;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
int sU=lo.Reorder(ssU);
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss=sU*Ls+s;
|
||||||
|
////////////////////////////////
|
||||||
|
// Xp
|
||||||
|
////////////////////////////////
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_RECON;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Yp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Zp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zp
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Tp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xm,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Ym,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Ym
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Zm,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Tm,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tm
|
||||||
|
////////////////////////////////
|
||||||
|
basea = (uint64_t)&out._odata[ss];
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal);
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
SAVE_RESULT(&out._odata[ss],baseb);
|
||||||
|
|
||||||
|
}
|
||||||
|
ssU++;
|
||||||
|
}
|
||||||
|
}
|
lib/qcd/action/fermion/WilsonKernelsAsmBody.h.abc (new file, 187 lines)
@ -0,0 +1,187 @@
|
|||||||
|
{
|
||||||
|
int locala,perma, ptypea;
|
||||||
|
int localb,permb, ptypeb;
|
||||||
|
int localc,permc, ptypec;
|
||||||
|
uint64_t basea, baseb, basec;
|
||||||
|
uint64_t basex;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
|
||||||
|
  for(int site=0;site<Ns;site++) {
    int sU=lo.Reorder(ssU);

    for(int s=0;s<Ls;s++) {
      ss =sU*Ls+s;

      ////////////////////////////////
      // Xp
      ////////////////////////////////
      int ent=ss*8;// 2*Ndim
      basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
      PREFETCH_CHIMU(basea);
      baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(baseb);
      basec = st.GetInfo(ptypec,localc,permc,Zp,ent,plocal); ent++;
      PREFETCH_CHIMU(basec);

      basex = basea;

      label(FX(XP) );
      if ( locala ) {
        LOAD64(%r10,isigns);
        XM_PROJMEM(basea);
        MAYBEPERM(PERMUTE_DIR3,perma);
      } else {
        LOAD_CHI(basea);
      }
      {
        MULT_2SPIN_DIR_PFXP(Xp,baseb);
      }
      LOAD64(%r10,isigns);
      XM_RECON;

      ////////////////////////////////
      // Yp
      ////////////////////////////////
      basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
      PREFETCH_CHIMU(basea);
      label(FX(YP) );
      if ( localb ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        YM_PROJMEM(baseb);
        MAYBEPERM(PERMUTE_DIR2,permb);
      } else {
        LOAD_CHI(baseb);
      }
      {
        MULT_2SPIN_DIR_PFYP(Yp,basec);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      YM_RECON_ACCUM;

      ////////////////////////////////
      // Zp
      ////////////////////////////////
      baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(baseb);
      label(FX(ZP) );
      if ( localc ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        ZM_PROJMEM(basec);
        MAYBEPERM(PERMUTE_DIR1,permc);
      } else {
        LOAD_CHI(basec);
      }
      {
        MULT_2SPIN_DIR_PFZP(Zp,basea);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      ZM_RECON_ACCUM;

      ////////////////////////////////
      // Tp
      ////////////////////////////////
      basec = st.GetInfo(ptypec,localc,permc,Xp,ent,plocal); ent++;
      PREFETCH_CHIMU(basec);
      label(FX(TP) );
      if ( locala ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        TM_PROJMEM(basea);
        MAYBEPERM(PERMUTE_DIR0,perma);
      } else {
        LOAD_CHI(basea);
      }
      {
        MULT_2SPIN_DIR_PFTP(Tp,baseb);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      TM_RECON_ACCUM;

      ////////////////////////////////
      // Xm
      ////////////////////////////////
      basea = st.GetInfo(ptypea,locala,perma,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(basea);
      label(FX(XM) );
      if ( localb ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        XP_PROJMEM(baseb);
        MAYBEPERM(PERMUTE_DIR3,permb);
      } else {
        LOAD_CHI(baseb);
      }
      {
        MULT_2SPIN_DIR_PFXM(Xm,basec);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      XP_RECON_ACCUM;

      ////////////////////////////////
      // Ym
      ////////////////////////////////
      baseb = st.GetInfo(ptypeb,localb,permb,Xp,ent,plocal); ent++;
      PREFETCH_CHIMU(baseb);
      label(FX(YM) );
      if ( localc ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        YP_PROJMEM(basec);
        MAYBEPERM(PERMUTE_DIR2,permc);
      } else {
        LOAD_CHI(basec);
      }
      {
        MULT_2SPIN_DIR_PFYM(Ym,basea);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      YP_RECON_ACCUM;

      ////////////////////////////////
      // Zm
      ////////////////////////////////
      basec = st.GetInfo(ptypec,localc,permc,Yp,ent,plocal); ent++;
      PREFETCH_CHIMU(basec);
      label(FX(ZM) );
      if ( locala ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        ZP_PROJMEM(basea);
        MAYBEPERM(PERMUTE_DIR1,perma);
      } else {
        LOAD_CHI(basea);
      }
      {
        MULT_2SPIN_DIR_PFZM(Zm,baseb);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      ZP_RECON_ACCUM;

      ////////////////////////////////
      // Tm
      ////////////////////////////////
      basea = (uint64_t)&out._odata[ss];
      PREFETCH_CHIMU(basea);
      label(FX(TM) );
      if ( localb ) {
        LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
        TP_PROJMEM(baseb);
        MAYBEPERM(PERMUTE_DIR0,permb);
      } else {
        LOAD_CHI(baseb);
      }
      {
        MULT_2SPIN_DIR_PFTM(Tm,basec);
      }
      LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
      TP_RECON_ACCUM;

      // PREFETCH_CHIMU(basex);
      label(FX(SAV) );
      SAVE_RESULT(&out._odata[ss]);

    }
    ssU++;
  }
}
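The unrolled body above repeats one stencil leg per direction: spin-project the neighbour (or fetch the pre-projected half spinor from the comms buffer), permute SIMD lanes if the hop wraps the node-local volume, multiply by the gauge link, then reconstruct and accumulate the four-spinor. A minimal C++ sketch of that per-leg pattern follows; the types and the helper are illustrative stand-ins only, not Grid's API beyond the macro names quoted above.

#include <array>
#include <cstdio>

struct HalfSpinor {};   // stand-in for SiteHalfSpinor
struct Spinor     {};   // stand-in for the site result

// One stencil leg, mirroring the PROJ / MAYBEPERM / MULT_2SPIN / RECON steps above.
void stencil_leg(bool local, bool perm, int dir, Spinor &result) {
  HalfSpinor chi;
  if (local) {
    // spin-project the neighbour straight from the local field (XM_PROJMEM, ...)
    if (perm) {
      // permute SIMD lanes when the hop wraps the node-local volume (MAYBEPERM)
    }
  } else {
    // neighbour already projected on the sending node; just load it (LOAD_CHI)
  }
  // multiply the half spinor by the gauge link for this direction (MULT_2SPIN_DIR_*)
  // reconstruct the four-spinor and accumulate into the result (*_RECON_ACCUM)
  (void)chi; (void)dir; (void)result;
}

int main() {
  Spinor result;
  std::array<bool, 8> local{}, perm{};
  for (int dir = 0; dir < 8; ++dir)   // Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm, as unrolled above
    stencil_leg(local[dir], perm[dir], dir, result);
  std::puts("eight stencil legs applied");
  return 0;
}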
@@ -312,7 +312,7 @@ namespace QCD {

 template<class Impl>
-int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -552,12 +552,10 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
   vstream(ref()(3)(1),result_31);
   vstream(ref()(3)(2),result_32);
  }
-  return 0;
 }

 template<class Impl>
-int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out)
 {
@@ -798,7 +796,6 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
   vstream(ref()(3)(1),result_31);
   vstream(ref()(3)(2),result_32);
  }
-  return 0;
 }

@@ -806,125 +803,80 @@ int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeFi
 // Specialise Gparity to simple implementation
 ////////////////////////////////////////////////
 template<>
-int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
-  return 0;
+  assert(0);
 }

 template<>
-int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
+  assert(0);
 }

 template<>
-int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
+  assert(0);
 }

 template<>
-int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int sF,int sU,const FermionField &in, FermionField &out)
 {
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
+  assert(0);
 }

-//////////////
-/*
-template<>
-int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
-  return 0;
-}
-
-template<>
-int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
-}
-
-template<>
-int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-  DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
-}
-
-template<>
-int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
-std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
-int sF,int sU,const FermionField &in, FermionField &out)
-{
-  DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
-  return 0;
-}
-
-*/
-
-template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+////////////// Wilson ; uses this implementation /////////////////////
+// Need Nc=3 though //
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);

-template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);

-template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);
-template int WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
+template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,
 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
 int ss,int sU,const FermionField &in, FermionField &out);

0  lib/qcd/hmc/.dirstamp  Normal file
0  lib/qcd/spin/.dirstamp  Normal file
0  lib/qcd/utils/.dirstamp  Normal file
@@ -410,6 +410,7 @@ namespace Optimization {
       break;
     default: assert(0);
     }
+    return out;
   }
   static inline u128d rotate(u128d in,int n){
     u128d out;
@@ -424,6 +425,7 @@ namespace Optimization {
       break;
     default: assert(0);
     }
+    return out;
   }
 };

@@ -367,6 +367,9 @@ namespace Grid {
 template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret) { vsplat(ret,S(0.0,0.0)); }// use xor?
 template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}

+template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));}
+template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));}
+
 // if not complex overload here
 template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
 template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
@@ -87,14 +87,39 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define VMOVRDUPd(OFF,A,DEST) "vpshufd $0x44," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
 #define VMOVIDUPd(OFF,A,DEST) "vpshufd $0xee," #OFF "*64(" #A ")," #DEST ";\n" // 32 bit level: 3,2,3,2

 #define VMOVRDUPf(OFF,PTR,DEST) "vmovsldup " #OFF "*64(" #PTR "), " #DEST ";\n"
 #define VMOVIDUPf(OFF,PTR,DEST) "vmovshdup " #OFF "*64(" #PTR "), " #DEST ";\n"

+#define VRDUPd(SRC,DEST) "vpshufd $0x44," #SRC"," #DEST ";\n" // 32 bit level: 1,0,3,2
+#define VRDUPf(SRC,DEST) "vmovsldup " #SRC ", " #DEST ";\n"
+#define VIDUPd(SRC,DEST) "vpshufd $0xee," #SRC"," #DEST ";\n" // 32 bit level: 3,2,3,2
+#define VIDUPf(SRC,DEST) "vmovshdup " #SRC ", " #DEST ";\n"
+
+#define VBCASTRDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST ";\n"
+#define VBCASTIDUPd(OFF,A,DEST) "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST ";\n"
+#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST ";\n"
+#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST ";\n"
+
 #define VMADDSUBf(A,B,accum) "vfmaddsub231ps " #A "," #B "," #accum ";\n"
 #define VMADDSUBd(A,B,accum) "vfmaddsub231pd " #A "," #B "," #accum ";\n"
+#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum ";\n"
+#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum ";\n"
+
+#define VMADDSUBRDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
+#define VMADDSUBIDUPf(O,P,B,accum) "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
+#define VMULRDUPf(O,P,B,accum) "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum ";\n"
+#define VMULIDUPf(O,P,B,accum) "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum ";\n"
+
+#define VMADDSUBRDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
+#define VMADDSUBIDUPd(O,P,B,accum) "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
+#define VMULRDUPd(O,P,B,accum) "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum ";\n"
+#define VMULIDUPd(O,P,B,accum) "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum ";\n"
+
+/*
+ * TimesI is used only in the XP recon
+ * Could zero the regs and use RECON_ACCUM
+ */
+
 #define VTIMESI0f(A,DEST, Z) VSHUFf(A,DEST)
 #define VTIMESI1f(A,DEST, Z) "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
 #define VTIMESI2f(A,DEST, Z) "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"
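These broadcast and fmaddsub macros assemble a packed complex multiply: the imaginary-part duplicate multiplies a real/imaginary-swapped operand, then the real-part duplicate is fused in with vfmaddsub, which subtracts on even (real) lanes and adds on odd (imaginary) lanes. A scalar C++ check of that arithmetic, with one lane pair modelled by two doubles; nothing below is Grid code, only the identity being exploited.

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main() {
  std::complex<double> u(0.3, -1.2), chi(0.7, 2.5);

  // VSHUF: swap (re,im) of the operand
  const double shuf_re = chi.imag(), shuf_im = chi.real();

  // VMULIDUP: broadcast Im(u) against the shuffled operand
  const double tmp_re = u.imag() * shuf_re;   // ui*ci
  const double tmp_im = u.imag() * shuf_im;   // ui*cr

  // VMADDSUBRDUP: broadcast Re(u), fused multiply with chi;
  // the even (real) lane subtracts the accumulator, the odd (imag) lane adds it
  const double res_re = u.real() * chi.real() - tmp_re;  // ur*cr - ui*ci
  const double res_im = u.real() * chi.imag() + tmp_im;  // ur*ci + ui*cr

  const std::complex<double> ref = u * chi;
  assert(std::abs(res_re - ref.real()) < 1e-12);
  assert(std::abs(res_im - ref.imag()) < 1e-12);
  std::printf("u*chi = (%g, %g)\n", res_re, res_im);
  return 0;
}

One shuffle, one multiply and one fmaddsub per complex product is exactly the budget the SU(3) multiply macros below are built around.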
@@ -111,6 +136,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
 #define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"

+#if 0
+
 #define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
 #define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 #define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"
@@ -127,6 +154,35 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
 #define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

+#else
+
+// o_p must point to floating 1.0f/d
+//
+// Ai, Ar -> tmp (r i)
+// tmp *1.0
+// ACC i - Ar ; ACC r + Ai
+#define VACCTIMESMINUSI0f(A,ACC,tmp) VSHUFf(A,tmp)
+#define VACCTIMESMINUSI1f(A,ACC,tmp) VMADDMEMf(1,%r10,tmp,ACC)
+#define VACCTIMESMINUSI2f(A,ACC,tmp)
+
+#define VACCTIMESMINUSI0d(A,ACC,tmp) VSHUFd(A,tmp)
+#define VACCTIMESMINUSI1d(A,ACC,tmp) VMADDMEMd(1,%r10,tmp,ACC)
+#define VACCTIMESMINUSI2d(A,ACC,tmp)
+
+// Ai, Ar -> tmp (r i)
+// tmp *1.0
+// ACC i + Ar ; ACC r - Ai
+#define VACCTIMESI0f(A,ACC,tmp) VSHUFf(A,tmp)
+#define VACCTIMESI1f(A,ACC,tmp) VMADDMEMf(0,%r10,tmp,ACC)
+#define VACCTIMESI2f(A,ACC,tmp)
+
+#define VACCTIMESI0d(A,ACC,tmp) VSHUFd(A,tmp)
+#define VACCTIMESI1d(A,ACC,tmp) VMADDMEMd(0,%r10,tmp,ACC)
+#define VACCTIMESI2d(A,ACC,tmp)
+
+#endif
+
 #define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
 #define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
 #define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
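The alternative macros above replace the masked add/sub with one shuffle plus one fused multiply-add against a small sign table in memory (the pointer the kernels park in %r10 via LOAD64(%r10,isigns); compare the visign/vrsign splats added earlier). A scalar C++ check of the "accumulate A times plus/minus i" arithmetic follows; the exact table layout is assumed only for illustration.

#include <cassert>
#include <cmath>
#include <complex>

int main() {
  // assumed sign table: entry 0 ~ (-1,+1) for "accumulate +i*A",
  //                     entry 1 ~ (+1,-1) for "accumulate -i*A"
  const double signs[2][2] = { {-1.0, +1.0}, {+1.0, -1.0} };

  std::complex<double> A(0.4, -2.0), ACC(1.5, 0.25);
  const std::complex<double> ref_plus  = ACC + std::complex<double>(0.0,  1.0) * A;
  const std::complex<double> ref_minus = ACC + std::complex<double>(0.0, -1.0) * A;

  // VSHUF: tmp = (Ai, Ar) -- the imaginary part lands in the real slot and vice versa
  const double tmp_r = A.imag(), tmp_i = A.real();

  // VMADDMEM(0,%r10,tmp,ACC): ACC += signs[0] * tmp  => ACC r - Ai ; ACC i + Ar
  const double accp_r = ACC.real() + signs[0][0] * tmp_r;
  const double accp_i = ACC.imag() + signs[0][1] * tmp_i;
  assert(std::abs(accp_r - ref_plus.real()) < 1e-12);
  assert(std::abs(accp_i - ref_plus.imag()) < 1e-12);

  // VMADDMEM(1,%r10,tmp,ACC): ACC += signs[1] * tmp  => ACC r + Ai ; ACC i - Ar
  const double accm_r = ACC.real() + signs[1][0] * tmp_r;
  const double accm_i = ACC.imag() + signs[1][1] * tmp_i;
  assert(std::abs(accm_r - ref_minus.real()) < 1e-12);
  assert(std::abs(accm_i - ref_minus.imag()) < 1e-12);
  return 0;
}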
@@ -1,92 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./lib/simd/Avx512Asm.h
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/*  END LEGAL */
-#ifndef GRID_ASM_AV512_ADDSUB_H
-#define GRID_ASM_AV512_ADDSUB_H
-
-////////////////////////////////////////////////////////////////
-// Building blocks for SU3 x 2spinor
-// Load columns of U
-// 18 U DUP's rr/ii
-// 6 Chi shuffles ir,ri
-// 6muls, 30 fmaddsubs
-////////////////////////////////////////////////////////////////
-#define MULT_ADDSUB_2SPIN(ptr) \
-  LOAD64(%r8,ptr) \
-  __asm__ ( \
-  VMOVIDUPf(0,%r8,Z0 ) \
-  VMOVIDUPf(3,%r8,Z1 )\
-  VMOVIDUPf(6,%r8,Z2 )\
-  VSHUFf(Chi_00,T1) \
-  VSHUFf(Chi_10,T2) \
-  \
-  VMULf(Z0,T1,UChi_00) VMOVRDUPf(0,%r8,Z3 ) \
-  VMULf(Z0,T2,UChi_10) VMOVRDUPf(3,%r8,Z4 ) \
-  VMULf(Z1,T1,UChi_01) VMOVRDUPf(6,%r8,Z5 ) \
-  VMULf(Z1,T2,UChi_11) VMOVIDUPf(1,%r8,Z0 ) \
-  VMULf(Z2,T1,UChi_02) VMOVIDUPf(4,%r8,Z1 ) \
-  VMULf(Z2,T2,UChi_12) VMOVIDUPf(7,%r8,Z2 ) \
-  \
-  VMADDSUBf(Z3,Chi_00,UChi_00) VSHUFf(Chi_01,T1) \
-  VMADDSUBf(Z3,Chi_10,UChi_10) VSHUFf(Chi_11,T2) \
-  VMADDSUBf(Z4,Chi_00,UChi_01) VMOVRDUPf(1,%r8,Z3 ) \
-  VMADDSUBf(Z4,Chi_10,UChi_11)\
-  VMADDSUBf(Z5,Chi_00,UChi_02) VMOVRDUPf(4,%r8,Z4 ) \
-  VMADDSUBf(Z5,Chi_10,UChi_12)\
-  \
-  VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(7,%r8,Z5 ) \
-  VMADDSUBf(Z0,T2,UChi_10)\
-  VMADDSUBf(Z1,T1,UChi_01) VMOVIDUPf(2,%r8,Z0 ) \
-  VMADDSUBf(Z1,T2,UChi_11)\
-  VMADDSUBf(Z2,T1,UChi_02) VMOVIDUPf(5,%r8,Z1 ) \
-  VMADDSUBf(Z2,T2,UChi_12) VMOVIDUPf(8,%r8,Z2 ) \
-  \
-  VMADDSUBf(Z3,Chi_01,UChi_00) VSHUFf(Chi_02,T1) \
-  VMADDSUBf(Z3,Chi_11,UChi_10) VSHUFf(Chi_12,T2) \
-  VMADDSUBf(Z4,Chi_01,UChi_01) VMOVRDUPf(2,%r8,Z3 ) \
-  VMADDSUBf(Z4,Chi_11,UChi_11)\
-  VMADDSUBf(Z5,Chi_01,UChi_02) VMOVRDUPf(5,%r8,Z4 ) \
-  VMADDSUBf(Z5,Chi_11,UChi_12)\
-  \
-  VMADDSUBf(Z0,T1,UChi_00) VMOVRDUPf(8,%r8,Z5 ) \
-  VMADDSUBf(Z0,T2,UChi_10)\
-  VMADDSUBf(Z1,T1,UChi_01)\
-  VMADDSUBf(Z1,T2,UChi_11)\
-  VMADDSUBf(Z2,T1,UChi_02)\
-  VMADDSUBf(Z2,T2,UChi_12)\
-  \
-  VMADDSUBf(Z3,Chi_02,UChi_00)\
-  VMADDSUBf(Z3,Chi_12,UChi_10)\
-  VMADDSUBf(Z4,Chi_02,UChi_01)\
-  VMADDSUBf(Z4,Chi_12,UChi_11)\
-  VMADDSUBf(Z5,Chi_02,UChi_02)\
-  VMADDSUBf(Z5,Chi_12,UChi_12)\
-  );
-
-#endif
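The op counts quoted in the deleted header's comment (18 link duplicates, 6 chi shuffles, 6 muls, 30 fmaddsubs) follow from a 3x3 complex link acting on two half-spinor colour columns. A scalar C++ model of that multiply which reproduces the tally; the counting rule is only a reading of the macro above, not Grid code, and the test values are arbitrary.

#include <complex>
#include <cstdio>

int main() {
  std::complex<double> U[3][3], chi[2][3], uchi[2][3];
  for (int a = 0; a < 3; a++)
    for (int b = 0; b < 3; b++) U[a][b] = {0.1 * (a + 1), 0.2 * (b + 1)};
  for (int s = 0; s < 2; s++)
    for (int b = 0; b < 3; b++) chi[s][b] = {1.0 + b, 0.5 - s};

  int dups = 0, shuffles = 0, muls = 0, fmaddsubs = 0;
  for (int a = 0; a < 3; a++)
    for (int b = 0; b < 3; b++) dups += 2;      // VMOVRDUP + VMOVIDUP per link entry
  for (int s = 0; s < 2; s++)
    for (int b = 0; b < 3; b++) shuffles += 1;  // one VSHUF per chi component

  for (int s = 0; s < 2; s++)
    for (int a = 0; a < 3; a++) {
      uchi[s][a] = 0.0;
      for (int b = 0; b < 3; b++) {
        uchi[s][a] += U[a][b] * chi[s][b];
        // per complex MAC: one packed op off the imag dup, one off the real dup;
        // only the first imag-dup product per output has nothing to accumulate into
        if (b == 0) { muls += 1; fmaddsubs += 1; } else { fmaddsubs += 2; }
      }
    }
  std::printf("dups=%d shuffles=%d muls=%d fmaddsubs=%d\n",
              dups, shuffles, muls, fmaddsubs);   // prints 18 6 6 30
  return 0;
}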
@@ -1,4 +1,4 @@
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_ASM_INTEL_COMMON_512_H
 #define GRID_ASM_INTEL_COMMON_512_H

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Peformance options
+////////////////////////////////////////////////////////////////////////////////////////////////////
+#undef AVX512_PF_L2_WRITE
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Opcodes common
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -37,6 +42,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 "mov $0x5555, %%eax \n"\
 "kmovw %%eax, %%k7 \n" : : : "%eax");

+//#define label(B) __asm__ ( __func__ _LINE__ #B ":\n" );
+
 #define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
 #define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"

@@ -86,8 +93,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMOVf(A,DEST) "vmovaps " #A ", " #DEST ";\n"
 #define VMOVd(A,DEST) "vmovapd " #A ", " #DEST ";\n"

-#define VPREFETCHG(O,A)
+#define VPREFETCH1(O,A) "prefetcht0 "#O"*64("#A");\n"
+#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
+#ifdef AVX512_PF_L2_WRITE
+#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
+#else
 #define VPREFETCHW(O,A)
+#endif
+#define VPREFETCHNTA(O,A)
+#define VPREFETCH(O,A)
+
 #define VEVICT(O,A)

 //"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
@@ -123,8 +138,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZLOADf(OFF,PTR,ri,ir) VLOADf(OFF,PTR,ir) VSHUFf(ir,ri)
 #define ZLOADd(OFF,PTR,ri,ir) VLOADd(OFF,PTR,ir) VSHUFd(ir,ri)

-#define VPREFETCHNTA(O,A)
-#define VPREFETCH(O,A)
-
 #define VSTOREf(OFF,PTR,SRC) "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
 #define VSTOREd(OFF,PTR,SRC) "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"
@@ -133,3 +133,22 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
 #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

+#undef VRDUP
+#undef VIDUP
+#undef VMADDSUBMEM
+#undef VMADDMEM
+#undef VMULMEM
+#define VRDUP(SRC,DEST) VRDUPd(SRC,DEST)
+#define VIDUP(SRC,DEST) VIDUPd(SRC,DEST)
+#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMd(O,P,B,accum)
+#define VMADDMEM(O,P,B,accum) VMADDMEMd(O,P,B,accum)
+#define VMULMEM(O,P,B,accum) VMULMEMd(O,P,B,accum)
+#undef VMADDSUBRDUP
+#undef VMADDSUBIDUP
+#undef VMULRDUP
+#undef VMULIDUP
+#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
+#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
+#define VMULRDUP(O,P,B,accum) VMULRDUPd(O,P,B,accum)
+#define VMULIDUP(O,P,B,accum) VMULIDUPd(O,P,B,accum)
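The prefetch macros above emit prefetcht0 / prefetcht1 / prefetchwt1 on 64-byte lines, with the write prefetch gated behind AVX512_PF_L2_WRITE. For experimenting with the same hints outside the asm kernels, __builtin_prefetch (GCC/Clang) is a rough portable stand-in; the loop and the prefetch distance below are illustrative only, not taken from Grid.

#include <cstddef>
#include <vector>

double sum_with_prefetch(const std::vector<double> &v) {
  double s = 0.0;
  const std::size_t lines_ahead = 8;                  // tunable distance, in 64-byte lines
  for (std::size_t i = 0; i < v.size(); ++i) {
    const std::size_t p = i + lines_ahead * 8;        // 8 doubles per cache line
    if (p < v.size())
      __builtin_prefetch(&v[p], /*rw=*/0, /*locality=*/3);  // roughly prefetcht0
    s += v[i];
  }
  return s;
}

int main() {
  std::vector<double> v(1 << 16, 1.0);
  return sum_with_prefetch(v) > 0.0 ? 0 : 1;
}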
@@ -116,7 +116,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define VMADDSUB(A,B,accum) VMADDSUBf(A,B,accum)
 #define VSHUF(A,B) VSHUFf(A,B)

 #undef ZEND1
 #undef ZEND2
 #undef ZLOAD
@@ -133,3 +132,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
 #define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

+#undef VRDUP
+#undef VIDUP
+#undef VMADDSUBMEM
+#undef VMADDMEM
+#undef VMULMEM
+
+#define VRDUP(SRC,DEST) VRDUPf(SRC,DEST)
+#define VIDUP(SRC,DEST) VIDUPf(SRC,DEST)
+#define VMADDSUBMEM(O,P,B,accum) VMADDSUBMEMf(O,P,B,accum)
+#define VMADDMEM(O,P,B,accum) VMADDMEMf(O,P,B,accum)
+#define VMULMEM(O,P,B,accum) VMULMEMf(O,P,B,accum)
+
+#undef VMADDSUBRDUP
+#undef VMADDSUBIDUP
+#undef VMULRDUP
+#undef VMULIDUP
+#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
+#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
+#define VMULRDUP(O,P,B,accum) VMULRDUPf(O,P,B,accum)
+#define VMULIDUP(O,P,B,accum) VMULIDUPf(O,P,B,accum)
@@ -1,4 +1,4 @@
 /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

@@ -27,9 +27,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #ifndef GRID_ASM_INTEL_512_QCD_H
 #define GRID_ASM_INTEL_512_QCD_H

 //////////////////////////////////////////////////////////////////////////////////////////
-// Register allocations for Wilson Kernel are precision and IMCI/AVX512 indept
+// Register allocations for Wilson Kernel are precision indept
 //////////////////////////////////////////////////////////////////////////////////////////
 #define result_00 %zmm0
 #define result_01 %zmm1
@@ -64,7 +64,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define UChi_12 %zmm23

 #define Uir %zmm24
-//#define ONE %zmm24
 #define Uri %zmm25
 #define T1 %zmm24
 #define T2 %zmm25
@@ -92,13 +91,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define Chimu_32 UChi_12

 #include <simd/Intel512common.h>
-#ifdef AVX512
 #include <simd/Intel512avx.h>
-//#include <simd/Intel512avxAddsub.h> // Alternate implementation
-#endif
-#ifdef IMCI
-#include <simd/Intel512imci.h>
-#endif

 //////////////////////////////////////////////////////////////////
 // Macros used to build wilson kernel -- can rationalise and simplify
@@ -111,7 +104,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define LOAD_CHI(PTR)  LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
 #define SAVE_UCHI(PTR) SAVE_UCHIi(PTR)
 #define SAVE_CHI(PTR)  SAVE_CHIi(PTR)
-#define SAVE_RESULT(PTR)  SAVE_RESULTi(PTR)
+#define SAVE_RESULT(PT,R) SAVE_RESULTi(PT,R)

 #define LOAD_CHIMUi \
   LOAD_CHIMU01i \
@@ -176,63 +169,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   VSTORE(5,%r8,Chi_12) \
   );

-#define SAVE_RESULTi(PTR)\
-  LOAD64(%r8,PTR) \
-  __asm__ ( \
-  VSTORE(0,%r8,result_00) \
-  VSTORE(1,%r8,result_01) \
-  VSTORE(2,%r8,result_02) \
-  VSTORE(3,%r8,result_10) \
-  VSTORE(4,%r8,result_11) \
-  VSTORE(5,%r8,result_12) \
-  VSTORE(6,%r8,result_20) \
-  VSTORE(7,%r8,result_21) \
-  VSTORE(8,%r8,result_22) \
-  VSTORE(9,%r8,result_30) \
-  VSTORE(10,%r8,result_31) \
-  VSTORE(11,%r8,result_32) \
-  );
-
-// auto ptr = &U._odata[sU](A);
-// A plan for lifting loads
-//  can use Z2/3/4/5/U/U for U field in first step.
-//  can use Chi_00, Chi_10, U U for U field in second step
-//  can use Chi_00, Chi_10, Chi_01,11, U U for U field in third step
-// Enables to lift ALL loads earlier by a few cycles and alleviate OoO pressure if needed.
-// KNL is DUAL issue for FP, and lifting these loads is potentially important.
-// Need detailed profile data to be sure.
-#if 0
-#define PREFETCH_U(A) \
-  LOAD64(%r8,&U._odata[sU](A)) \
-  __asm__ ( \
-  VPREFETCHG(0,%r8) \
-  VPREFETCHG(1,%r8) \
-  VPREFETCHG(2,%r8) \
-  VPREFETCHG(3,%r8) \
-  VPREFETCHG(4,%r8) \
-  VPREFETCHG(5,%r8) \
-  VPREFETCHG(6,%r8) \
-  VPREFETCHG(7,%r8) \
-  VPREFETCHG(8,%r8) );
-
-#define PREFETCH_R(A) \
-  LOAD64(%r8,&out._odata[ss]) \
-  __asm__ ( \
-  VPREFETCHW(0,%r8) \
-  VPREFETCHW(1,%r8) \
-  VPREFETCHW(2,%r8) \
-  VPREFETCHW(3,%r8) \
-  VPREFETCHW(4,%r8) \
-  VPREFETCHW(5,%r8) \
-  VPREFETCHW(6,%r8) \
-  VPREFETCHW(7,%r8) \
-  VPREFETCHW(8,%r8) \
-  VPREFETCHW(9,%r8) \
-  VPREFETCHW(10,%r8) \
-  VPREFETCHW(11,%r8) );
-#endif
-
-#define MULT_2SPIN_DIR(A) MULT_2SPIN(&U._odata[sU](A))
-
 #define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
@@ -244,131 +180,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
 #define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)

-#if 0
-#define MULT_2SPIN_UNOPT(ptr) \
-  LOAD64(%r8,ptr) \
-  __asm__ ( \
-  ZLOAD (0,%r8,UChi_01,UChi_11) \
-  ZLOAD (3,%r8,UChi_02,UChi_12) \
-  ZLOAD (6,%r8,Uri,Uir) \
-  ZMUL (UChi_01,UChi_11,Chi_00,UChi_00,Z0) \
-  ZMUL (UChi_01,UChi_11,Chi_10,UChi_10,Z1) \
-  ZMUL (UChi_02,UChi_12,Chi_00,UChi_01,Z2) \
-  ZMUL (UChi_02,UChi_12,Chi_10,UChi_11,Z3) \
-  ZMUL (Uri,Uir, Chi_00,UChi_02,Z4) \
-  ZMUL (Uri,Uir, Chi_10,UChi_12,Z5) \
-  \
-  ZLOAD (1,%r8,Uri,Uir) \
-  ZLOAD (4,%r8,Chi_00, Chi_10) \
-  ZMADD (Uri,Uir, Chi_01,UChi_00,Z0) \
-  ZMADD (Uri,Uir, Chi_11,UChi_10,Z1) \
-  ZLOAD (7,%r8,Uri,Uir) \
-  ZMADD (Chi_00, Chi_10,Chi_01,UChi_01,Z2) \
-  ZMADD (Chi_00, Chi_10,Chi_11,UChi_11,Z3) \
-  ZLOAD (2,%r8,Chi_00,Chi_10) \
-  ZMADD(Uri,Uir, Chi_01,UChi_02,Z4) \
-  ZMADD(Uri,Uir, Chi_11,UChi_12,Z5) \
-  \
-  ZLOAD (5,%r8,Uri,Uir) \
-  ZMADD (Chi_00,Chi_10, Chi_02,UChi_00,Z0) \
-  ZMADD (Chi_00,Chi_10, Chi_12,UChi_10,Z1) \
-  ZLOAD (8,%r8,Chi_00,Chi_10) \
-  ZMADD (Uri,Uir, Chi_02,UChi_01,Z2) \
-  ZMADD (Uri,Uir, Chi_12,UChi_11,Z3) \
-  ZMADD(Chi_00,Chi_10, Chi_02,UChi_02,Z4) \
-  ZMADD(Chi_00,Chi_10, Chi_12,UChi_12,Z5) \
-  \
-  ZEND1(UChi_00,Z0,Chi_01) \
-  ZEND1(UChi_10,Z1,Chi_11) \
-  ZEND1(UChi_01,Z2,Chi_00) \
-  ZEND1(UChi_11,Z3,Chi_10) \
-  ZEND1(UChi_02,Z4,Chi_02) \
-  ZEND1(UChi_12,Z5,Chi_12) \
-  ZEND2(UChi_00,Z0,Chi_01) \
-  ZEND2(UChi_10,Z1,Chi_11) \
-  ZEND2(UChi_01,Z2,Chi_00) \
-  ZEND2(UChi_11,Z3,Chi_10) \
-  ZEND2(UChi_02,Z4,Chi_02) \
-  ZEND2(UChi_12,Z5,Chi_12) );
-#endif
-
-#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr)
-#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr)
-
-// MULT_2SPINa(ptr) MULT_2SPIN_PF(ptr,ptr,VPREFETCHG);
-
-#if 0
-#define MULT_2SPIN_PF(ptr,pf,VPF) \
-  LOAD64(%r8,ptr) \
-  LOAD64(%r9,pf) \
-  __asm__ ( \
-  ZMULMEM2SP(0,%r8,Uri,Chi_00,Chi_10,UChi_00,Z0,UChi_10,Z1) \
-  VPF(0,%r9) \
-  ZMULMEM2SP(3,%r8,Uri,Chi_00,Chi_10,UChi_01,Z2,UChi_11,Z3) \
-  VPF(1,%r9) \
-  ZMULMEM2SP(6,%r8,Uri,Chi_00,Chi_10,UChi_02,Z4,UChi_12,Z5) \
-  VPF(2,%r9) \
-  \
-  ZMADDMEM2SP(1,%r8,Uri,Chi_01,Chi_11,UChi_00,Z0,UChi_10,Z1) \
-  VPF(3,%r9) \
-  ZMADDMEM2SP(4,%r8,Uri,Chi_01,Chi_11,UChi_01,Z2,UChi_11,Z3) \
-  VPF(4,%r9) \
-  ZMADDMEM2SP(7,%r8,Uri,Chi_01,Chi_11,UChi_02,Z4,UChi_12,Z5) \
-  VPF(5,%r9) \
-  \
-  ZMADDMEM2SP(2,%r8,Uri,Chi_02,Chi_12,UChi_00,Z0,UChi_10,Z1) \
-  VPF(6,%r9) \
-  ZMADDMEM2SP(5,%r8,Uri,Chi_02,Chi_12,UChi_01,Z2,UChi_11,Z3) \
-  VPF(7,%r9) \
-  ZMADDMEM2SP(8,%r8,Uri,Chi_02,Chi_12,UChi_02,Z4,UChi_12,Z5) \
-  VPF(8,%r9) \
-  \
-  ZEND1(UChi_00,Z0,Chi_01) \
-  ZEND1(UChi_10,Z1,Chi_11) \
-  ZEND1(UChi_01,Z2,Chi_00) \
-  ZEND1(UChi_11,Z3,Chi_10) \
-  VPF(9,%r9) \
-  ZEND1(UChi_02,Z4,Chi_02) \
-  ZEND1(UChi_12,Z5,Chi_12) \
-  ZEND2(UChi_00,Z0,Chi_01) \
-  ZEND2(UChi_10,Z1,Chi_11) \
-  VPF(10,%r9) \
-  ZEND2(UChi_01,Z2,Chi_00) \
-  ZEND2(UChi_11,Z3,Chi_10) \
-  ZEND2(UChi_02,Z4,Chi_02) \
-  VPF(11,%r9) \
-  ZEND2(UChi_12,Z5,Chi_12) );
-#endif
-
-#if 0
-#define MULT_2SPIN_PFNONE(ptr,pf,VPF) \
-  LOAD64(%r8,ptr) \
-  LOAD64(%r9,pf) \
-  __asm__ ( \
-  VPF(0,%r9) \
-  VPF(1,%r9) \
-  VPF(2,%r9) \
-  \
-  VPF(3,%r9) \
-  VPF(4,%r9) \
-  VPF(5,%r9) \
-  \
-  VPF(6,%r9) \
-  VPF(7,%r9) \
-  VPF(8,%r9) \
-  \
-  VPF(9,%r9) \
-  VPF(10,%r9) \
-  VPF(11,%r9) );
-#endif
-
-// Pretty much Perfectly Pipelined
+#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
+#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)

 //////////////////////////////////////////////////////////////////
 // Dirac algebra
@@ -442,8 +261,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define XM_PROJMEM(PTR) \
   LOAD64(%r8,PTR)\
   __asm__ ( \
-  SHUF_CHIMU23i \
   LOAD_CHIi \
+  SHUF_CHIMU23i \
   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
@@ -471,8 +290,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #define ZM_PROJMEM(PTR) \
   LOAD64(%r8,PTR) \
   __asm__ ( \
-  SHUF_CHIMU23i \
   LOAD_CHIi \
+  SHUF_CHIMU23i \
   VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20)\
   VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21)\
   VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22)\
@@ -490,7 +309,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
   LOAD64(%r8,ptr) \
   __asm__ ( \
   LOAD_CHIMU01i \
-  VSUBMEM(6,%r8 ,Chimu_00,Chi_00) \
+  VSUBMEM(6,%r8,Chimu_00,Chi_00) \
   VSUBMEM(7,%r8,Chimu_01,Chi_01) \
   VSUBMEM(8,%r8,Chimu_02,Chi_02) \
   VSUBMEM(9,%r8,Chimu_10,Chi_10) \
@@ -503,18 +322,18 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 // fspin(3)=timesMinusI(hspin(0))
 #define XP_RECON __asm__ ( \
   VZERO(TMP) \
-  VMOV(UChi_00,result_00) \
-  VMOV(UChi_01,result_01) \
-  VMOV(UChi_02,result_02) \
-  VMOV(UChi_10,result_10) \
-  VMOV(UChi_11,result_11) \
-  VMOV(UChi_12,result_12) \
-  VTIMESMINUSI0(UChi_10,result_20,TMP) \
-  VTIMESMINUSI0(UChi_11,result_21,TMP) \
-  VTIMESMINUSI0(UChi_12,result_22,TMP) \
   VTIMESMINUSI0(UChi_00,result_30,TMP) \
+  VTIMESMINUSI0(UChi_10,result_20,TMP) \
   VTIMESMINUSI0(UChi_01,result_31,TMP) \
+  VTIMESMINUSI0(UChi_11,result_21,TMP) \
   VTIMESMINUSI0(UChi_02,result_32,TMP) \
+  VTIMESMINUSI0(UChi_12,result_22,TMP) \
+  VMOV(UChi_00,result_00) \
+  VMOV(UChi_10,result_10) \
+  VMOV(UChi_01,result_01) \
+  VMOV(UChi_11,result_11) \
+  VMOV(UChi_02,result_02) \
+  VMOV(UChi_12,result_12) \
   VTIMESMINUSI1(UChi_10,result_20,TMP) \
   VTIMESMINUSI1(UChi_11,result_21,TMP) \
   VTIMESMINUSI1(UChi_12,result_22,TMP) \
@ -531,24 +350,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
// NB could save 6 ops using addsub => 12 cycles
|
// NB could save 6 ops using addsub => 12 cycles
|
||||||
#define XP_RECON_ACCUM __asm__ ( \
|
#define XP_RECON_ACCUM __asm__ ( \
|
||||||
VZERO(TMP)\
|
VZERO(TMP)\
|
||||||
VADD(UChi_00,result_00,result_00)\
|
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
|
||||||
VADD(UChi_11,result_11,result_11)\
|
|
||||||
VADD(UChi_12,result_12,result_12)\
|
|
||||||
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
|
|
||||||
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
|
|
||||||
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
|
|
||||||
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
|
VACCTIMESMINUSI0(UChi_00,result_30,Z3)\
|
||||||
|
VACCTIMESMINUSI0(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
|
VACCTIMESMINUSI0(UChi_01,result_31,Z4)\
|
||||||
|
VACCTIMESMINUSI0(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
|
VACCTIMESMINUSI0(UChi_02,result_32,Z5)\
|
||||||
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
|
VACCTIMESMINUSI0(UChi_12,result_22,Z2)\
|
||||||
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
|
VACCTIMESMINUSI1(UChi_00,result_30,Z3)\
|
||||||
|
VACCTIMESMINUSI1(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
|
VACCTIMESMINUSI1(UChi_01,result_31,Z4)\
|
||||||
|
VACCTIMESMINUSI1(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
|
VACCTIMESMINUSI1(UChi_02,result_32,Z5)\
|
||||||
|
VACCTIMESMINUSI1(UChi_12,result_22,Z2)\
|
||||||
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
|
VACCTIMESMINUSI2(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
|
VACCTIMESMINUSI2(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
|
VACCTIMESMINUSI2(UChi_12,result_22,Z2)\
|
||||||
@ -559,24 +378,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define XM_RECON __asm__ ( \
|
#define XM_RECON __asm__ ( \
|
||||||
VZERO(TMP)\
|
VZERO(TMP)\
|
||||||
VMOV(UChi_00,result_00)\
|
|
||||||
VMOV(UChi_01,result_01)\
|
|
||||||
VMOV(UChi_02,result_02)\
|
|
||||||
VMOV(UChi_10,result_10)\
|
|
||||||
VMOV(UChi_11,result_11)\
|
|
||||||
VMOV(UChi_12,result_12)\
|
|
||||||
VTIMESI0(UChi_10,result_20,TMP)\
|
|
||||||
VTIMESI0(UChi_11,result_21,TMP)\
|
|
||||||
VTIMESI0(UChi_12,result_22,TMP)\
|
|
||||||
VTIMESI0(UChi_00,result_30,TMP)\
|
VTIMESI0(UChi_00,result_30,TMP)\
|
||||||
|
VTIMESI0(UChi_10,result_20,TMP)\
|
||||||
VTIMESI0(UChi_01,result_31,TMP)\
|
VTIMESI0(UChi_01,result_31,TMP)\
|
||||||
|
VTIMESI0(UChi_11,result_21,TMP)\
|
||||||
VTIMESI0(UChi_02,result_32,TMP)\
|
VTIMESI0(UChi_02,result_32,TMP)\
|
||||||
VTIMESI1(UChi_10,result_20,TMP)\
|
VTIMESI0(UChi_12,result_22,TMP)\
|
||||||
VTIMESI1(UChi_11,result_21,TMP)\
|
VMOV(UChi_00,result_00)\
|
||||||
VTIMESI1(UChi_12,result_22,TMP)\
|
VMOV(UChi_10,result_10)\
|
||||||
|
VMOV(UChi_01,result_01)\
|
||||||
|
VMOV(UChi_11,result_11)\
|
||||||
|
VMOV(UChi_02,result_02)\
|
||||||
|
VMOV(UChi_12,result_12)\
|
||||||
VTIMESI1(UChi_00,result_30,TMP)\
|
VTIMESI1(UChi_00,result_30,TMP)\
|
||||||
|
VTIMESI1(UChi_10,result_20,TMP)\
|
||||||
VTIMESI1(UChi_01,result_31,TMP)\
|
VTIMESI1(UChi_01,result_31,TMP)\
|
||||||
|
VTIMESI1(UChi_11,result_21,TMP)\
|
||||||
VTIMESI1(UChi_02,result_32,TMP)\
|
VTIMESI1(UChi_02,result_32,TMP)\
|
||||||
|
VTIMESI1(UChi_12,result_22,TMP)\
|
||||||
VTIMESI2(UChi_10,result_20,TMP)\
|
VTIMESI2(UChi_10,result_20,TMP)\
|
||||||
VTIMESI2(UChi_11,result_21,TMP)\
|
VTIMESI2(UChi_11,result_21,TMP)\
|
||||||
VTIMESI2(UChi_12,result_22,TMP)\
|
VTIMESI2(UChi_12,result_22,TMP)\
|
||||||
@ -586,23 +405,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
);
|
);
|
||||||
|
|
||||||
#define XM_RECON_ACCUM __asm__ ( \
|
#define XM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
|
||||||
VADD(UChi_11,result_11,result_11)\
|
|
||||||
VADD(UChi_12,result_12,result_12)\
|
|
||||||
VACCTIMESI0(UChi_10,result_20,Z0)\
|
VACCTIMESI0(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESI0(UChi_11,result_21,Z1)\
|
|
||||||
VACCTIMESI0(UChi_12,result_22,Z2)\
|
|
||||||
VACCTIMESI0(UChi_00,result_30,Z3)\
|
VACCTIMESI0(UChi_00,result_30,Z3)\
|
||||||
|
VACCTIMESI0(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESI0(UChi_01,result_31,Z4)\
|
VACCTIMESI0(UChi_01,result_31,Z4)\
|
||||||
|
VACCTIMESI0(UChi_12,result_22,Z2)\
|
||||||
VACCTIMESI0(UChi_02,result_32,Z5)\
|
VACCTIMESI0(UChi_02,result_32,Z5)\
|
||||||
|
\
|
||||||
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_00,result_00,result_00)\
|
||||||
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
|
VADD(UChi_12,result_12,result_12)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
|
\
|
||||||
VACCTIMESI1(UChi_10,result_20,Z0)\
|
VACCTIMESI1(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESI1(UChi_11,result_21,Z1)\
|
|
||||||
VACCTIMESI1(UChi_12,result_22,Z2)\
|
|
||||||
VACCTIMESI1(UChi_00,result_30,Z3)\
|
VACCTIMESI1(UChi_00,result_30,Z3)\
|
||||||
|
VACCTIMESI1(UChi_11,result_21,Z1)\
|
||||||
VACCTIMESI1(UChi_01,result_31,Z4)\
|
VACCTIMESI1(UChi_01,result_31,Z4)\
|
||||||
|
VACCTIMESI1(UChi_12,result_22,Z2)\
|
||||||
VACCTIMESI1(UChi_02,result_32,Z5)\
|
VACCTIMESI1(UChi_02,result_32,Z5)\
|
||||||
VACCTIMESI2(UChi_10,result_20,Z0)\
|
VACCTIMESI2(UChi_10,result_20,Z0)\
|
||||||
VACCTIMESI2(UChi_11,result_21,Z1)\
|
VACCTIMESI2(UChi_11,result_21,Z1)\
|
||||||
@ -614,10 +435,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define YP_RECON_ACCUM __asm__ ( \
|
#define YP_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VADD(UChi_10,result_20,result_20)\
|
VADD(UChi_10,result_20,result_20)\
|
||||||
VADD(UChi_11,result_21,result_21)\
|
VADD(UChi_11,result_21,result_21)\
|
||||||
@ -628,10 +449,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define YM_RECON_ACCUM __asm__ ( \
|
#define YM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VSUB(UChi_10,result_20,result_20)\
|
VSUB(UChi_10,result_20,result_20)\
|
||||||
VSUB(UChi_11,result_21,result_21)\
|
VSUB(UChi_11,result_21,result_21)\
|
||||||
@ -641,23 +462,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VADD(UChi_02,result_32,result_32) );
|
VADD(UChi_02,result_32,result_32) );
|
||||||
|
|
||||||
#define ZP_RECON_ACCUM __asm__ ( \
|
#define ZP_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
|
||||||
VADD(UChi_11,result_11,result_11)\
|
|
||||||
VADD(UChi_12,result_12,result_12)\
|
|
||||||
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
|
VACCTIMESMINUSI0(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
|
|
||||||
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
|
|
||||||
VACCTIMESI0(UChi_10,result_30,Z3)\
|
VACCTIMESI0(UChi_10,result_30,Z3)\
|
||||||
|
VACCTIMESMINUSI0(UChi_01,result_21,Z1)\
|
||||||
VACCTIMESI0(UChi_11,result_31,Z4)\
|
VACCTIMESI0(UChi_11,result_31,Z4)\
|
||||||
|
VACCTIMESMINUSI0(UChi_02,result_22,Z2)\
|
||||||
VACCTIMESI0(UChi_12,result_32,Z5)\
|
VACCTIMESI0(UChi_12,result_32,Z5)\
|
||||||
|
VADD(UChi_00,result_00,result_00)\
|
||||||
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
|
VACCTIMESMINUSI1(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
|
|
||||||
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
|
|
||||||
VACCTIMESI1(UChi_10,result_30,Z3)\
|
VACCTIMESI1(UChi_10,result_30,Z3)\
|
||||||
|
VACCTIMESMINUSI1(UChi_01,result_21,Z1)\
|
||||||
VACCTIMESI1(UChi_11,result_31,Z4)\
|
VACCTIMESI1(UChi_11,result_31,Z4)\
|
||||||
|
VACCTIMESMINUSI1(UChi_02,result_22,Z2)\
|
||||||
VACCTIMESI1(UChi_12,result_32,Z5)\
|
VACCTIMESI1(UChi_12,result_32,Z5)\
|
||||||
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
|
VACCTIMESMINUSI2(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
|
VACCTIMESMINUSI2(UChi_01,result_21,Z1)\
|
||||||
@ -668,23 +489,23 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
);
|
);
|
||||||
|
|
||||||
#define ZM_RECON_ACCUM __asm__ ( \
|
#define ZM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
|
||||||
VADD(UChi_11,result_11,result_11)\
|
|
||||||
VADD(UChi_12,result_12,result_12)\
|
|
||||||
VACCTIMESI0(UChi_00,result_20,Z0)\
|
VACCTIMESI0(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESI0(UChi_01,result_21,Z1)\
|
|
||||||
VACCTIMESI0(UChi_02,result_22,Z2)\
|
|
||||||
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
|
VACCTIMESMINUSI0(UChi_10,result_30,Z3)\
|
||||||
|
VACCTIMESI0(UChi_01,result_21,Z1)\
|
||||||
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
|
VACCTIMESMINUSI0(UChi_11,result_31,Z4)\
|
||||||
|
VACCTIMESI0(UChi_02,result_22,Z2)\
|
||||||
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
|
VACCTIMESMINUSI0(UChi_12,result_32,Z5)\
|
||||||
|
VADD(UChi_00,result_00,result_00)\
|
||||||
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VACCTIMESI1(UChi_00,result_20,Z0)\
|
VACCTIMESI1(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESI1(UChi_01,result_21,Z1)\
|
|
||||||
VACCTIMESI1(UChi_02,result_22,Z2)\
|
|
||||||
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
|
VACCTIMESMINUSI1(UChi_10,result_30,Z3)\
|
||||||
|
VACCTIMESI1(UChi_01,result_21,Z1)\
|
||||||
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
|
VACCTIMESMINUSI1(UChi_11,result_31,Z4)\
|
||||||
|
VACCTIMESI1(UChi_02,result_22,Z2)\
|
||||||
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
|
VACCTIMESMINUSI1(UChi_12,result_32,Z5)\
|
||||||
VACCTIMESI2(UChi_00,result_20,Z0)\
|
VACCTIMESI2(UChi_00,result_20,Z0)\
|
||||||
VACCTIMESI2(UChi_01,result_21,Z1)\
|
VACCTIMESI2(UChi_01,result_21,Z1)\
|
||||||
@ -696,35 +517,121 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define TP_RECON_ACCUM __asm__ ( \
|
#define TP_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VADD(UChi_00,result_20,result_20)\
|
VADD(UChi_00,result_20,result_20)\
|
||||||
VADD(UChi_01,result_21,result_21)\
|
|
||||||
VADD(UChi_02,result_22,result_22)\
|
|
||||||
VADD(UChi_10,result_30,result_30)\
|
VADD(UChi_10,result_30,result_30)\
|
||||||
|
VADD(UChi_01,result_21,result_21)\
|
||||||
VADD(UChi_11,result_31,result_31)\
|
VADD(UChi_11,result_31,result_31)\
|
||||||
|
VADD(UChi_02,result_22,result_22)\
|
||||||
VADD(UChi_12,result_32,result_32) );
|
VADD(UChi_12,result_32,result_32) );
|
||||||
|
|
||||||
#define TM_RECON_ACCUM __asm__ ( \
|
#define TM_RECON_ACCUM __asm__ ( \
|
||||||
VADD(UChi_00,result_00,result_00)\
|
VADD(UChi_00,result_00,result_00)\
|
||||||
VADD(UChi_01,result_01,result_01)\
|
|
||||||
VADD(UChi_02,result_02,result_02)\
|
|
||||||
VADD(UChi_10,result_10,result_10)\
|
VADD(UChi_10,result_10,result_10)\
|
||||||
|
VADD(UChi_01,result_01,result_01)\
|
||||||
VADD(UChi_11,result_11,result_11)\
|
VADD(UChi_11,result_11,result_11)\
|
||||||
|
VADD(UChi_02,result_02,result_02)\
|
||||||
VADD(UChi_12,result_12,result_12)\
|
VADD(UChi_12,result_12,result_12)\
|
||||||
VSUB(UChi_00,result_20,result_20)\
|
VSUB(UChi_00,result_20,result_20)\
|
||||||
VSUB(UChi_01,result_21,result_21)\
|
|
||||||
VSUB(UChi_02,result_22,result_22)\
|
|
||||||
VSUB(UChi_10,result_30,result_30)\
|
VSUB(UChi_10,result_30,result_30)\
|
||||||
|
VSUB(UChi_01,result_21,result_21)\
|
||||||
VSUB(UChi_11,result_31,result_31)\
|
VSUB(UChi_11,result_31,result_31)\
|
||||||
|
VSUB(UChi_02,result_22,result_22)\
|
||||||
VSUB(UChi_12,result_32,result_32) );
|
VSUB(UChi_12,result_32,result_32) );
|
||||||
|
|
||||||
//define PREFETCH_CHIMU(A)
|
#define AVX512_PF_L1
|
||||||
|
#define AVX512_PF_L2_GAUGE
|
||||||
|
#define AVX512_PF_L2_TABLE
|
||||||
|
#undef AVX512_PF_L2_LINEAR
|
||||||
|
|
||||||
#define PERMUTE_DIR0 __asm__ ( \
|
#ifdef AVX512_PF_L2_TABLE
|
||||||
|
// P1 Fetches the base pointer for next link into L1 with P1
|
||||||
|
// M1 Fetches the next site pointer into L2
|
||||||
|
#define VPREFETCH_P1(A,B) VPREFETCH1(A,B)
|
||||||
|
#define VPREFETCH_P2(A,B)
|
||||||
|
#define VPREFETCH_M1(A,B) VPREFETCH2(A,B)
|
||||||
|
#define VPREFETCH_M2(A,B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef AVX512_PF_L2_LINEAR
|
||||||
|
#define VPREFETCH_M1(A,B) VPREFETCH1(A,B)
|
||||||
|
#define VPREFETCH_M2(A,B) VPREFETCH2(A,B)
|
||||||
|
#define VPREFETCH_P1(A,B)
|
||||||
|
#define VPREFETCH_P2(A,B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef AVX512_PF_L2_GAUGE
|
||||||
|
#define VPREFETCH_G1(A,B) VPREFETCH1(A,B)
|
||||||
|
#define VPREFETCH_G2(A,B) VPREFETCH2(A,B)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define PF_GAUGE(A) \
|
||||||
|
LOAD64(%r8,&U._odata[sU](A)) \
|
||||||
|
__asm__ ( \
|
||||||
|
VPREFETCH_G1(0,%r8) VPREFETCH_G1(1,%r8) \
|
||||||
|
VPREFETCH_G1(2,%r8) VPREFETCH_G1(3,%r8) \
|
||||||
|
);
|
||||||
|
|
||||||
|
#define SAVE_RESULTi(PTR,pf) \
|
||||||
|
LOAD64(%r8,PTR) \
|
||||||
|
LOAD64(%r9,pf) \
|
||||||
|
__asm__ ( \
|
||||||
|
VSTORE(0,%r8,result_00) VPREFETCH_M1(0,%r9) \
|
||||||
|
VSTORE(1,%r8,result_01) VPREFETCH_M1(1,%r9) \
|
||||||
|
VSTORE(2,%r8,result_02) VPREFETCH_M1(2,%r9) \
|
||||||
|
VSTORE(3,%r8,result_10) VPREFETCH_M1(3,%r9) \
|
||||||
|
VSTORE(4,%r8,result_11) VPREFETCH_M1(4,%r9) \
|
||||||
|
VSTORE(5,%r8,result_12) VPREFETCH_M1(5,%r9) \
|
||||||
|
VSTORE(6,%r8,result_20) VPREFETCH_M1(6,%r9) \
|
||||||
|
VSTORE(7,%r8,result_21) VPREFETCH_M1(7,%r9) \
|
||||||
|
VSTORE(8,%r8,result_22) VPREFETCH_M1(8,%r9) \
|
||||||
|
VSTORE(9,%r8,result_30) VPREFETCH_M1(9,%r9) \
|
||||||
|
VSTORE(10,%r8,result_31) VPREFETCH_M1(10,%r9) \
|
||||||
|
VSTORE(11,%r8,result_32) VPREFETCH_M1(11,%r9) \
|
||||||
|
);
|
||||||
|
|
||||||
|
#ifdef AVX512_PF_L2_TABLE
|
||||||
|
#define PREFETCH_CHIMU(A) \
|
||||||
|
LOAD64(%r9,A) \
|
||||||
|
__asm__ ( \
|
||||||
|
VPREFETCH_P1(0,%r9) \
|
||||||
|
VPREFETCH_P1(1,%r9) \
|
||||||
|
VPREFETCH_P1(2,%r9) \
|
||||||
|
VPREFETCH_P1(3,%r9) \
|
||||||
|
VPREFETCH_P1(4,%r9) \
|
||||||
|
VPREFETCH_P1(5,%r9) \
|
||||||
|
VPREFETCH_P1(6,%r9) \
|
||||||
|
VPREFETCH_P1(7,%r9) \
|
||||||
|
VPREFETCH_P1(8,%r9) \
|
||||||
|
VPREFETCH_P1(9,%r9) \
|
||||||
|
VPREFETCH_P1(10,%r9) \
|
||||||
|
VPREFETCH_P1(11,%r9));
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define PREFETCH_CHIMU(A)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define PREFETCH1_CHIMU(A) \
|
||||||
|
LOAD64(%r9,A) \
|
||||||
|
__asm__ ( \
|
||||||
|
VPREFETCH_P1(0,%r9) \
|
||||||
|
VPREFETCH_P1(1,%r9) \
|
||||||
|
VPREFETCH_P1(2,%r9) \
|
||||||
|
VPREFETCH_P1(3,%r9) \
|
||||||
|
VPREFETCH_P1(4,%r9) \
|
||||||
|
VPREFETCH_P1(5,%r9) \
|
||||||
|
VPREFETCH_P1(6,%r9) \
|
||||||
|
VPREFETCH_P1(7,%r9) \
|
||||||
|
VPREFETCH_P1(8,%r9) \
|
||||||
|
VPREFETCH_P1(9,%r9) \
|
||||||
|
VPREFETCH_P1(10,%r9) \
|
||||||
|
VPREFETCH_P1(11,%r9));
|
||||||
|
|
||||||
|
#define PERMUTE_DIR0 __asm__ ( \
|
||||||
VPERM0(Chi_00,Chi_00) \
|
VPERM0(Chi_00,Chi_00) \
|
||||||
VPERM0(Chi_01,Chi_01) \
|
VPERM0(Chi_01,Chi_01) \
|
||||||
VPERM0(Chi_02,Chi_02) \
|
VPERM0(Chi_02,Chi_02) \
|
||||||
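The AVX512_PF_* switches above choose, at preprocessor time, which of the VPREFETCH_P*/VPREFETCH_M*/VPREFETCH_G* helpers expand to real prefetch instructions and which compile away to nothing. A minimal stand-alone sketch of the same pattern in plain C intrinsics (the PF_L2_TABLE toggle and PREFETCH_NEXT_SITE name are illustrative, not Grid's actual macros):

/* Minimal sketch, assuming a single PF_L2_TABLE policy toggle: a #define decides
   whether a prefetch helper emits a hint or emits nothing at all. */
#include <xmmintrin.h>

#define PF_L2_TABLE 1

#ifdef PF_L2_TABLE
#define PREFETCH_NEXT_SITE(p) _mm_prefetch((const char *)(p), _MM_HINT_T1) /* pull into L2 */
#else
#define PREFETCH_NEXT_SITE(p) /* compiled away: no instruction emitted */
#endif

void warm_next_site(const double *next_site)
{
  PREFETCH_NEXT_SITE(next_site); /* pure hint: correctness never depends on it */
}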
@@ -756,65 +663,245 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 VPERM3(Chi_11,Chi_11) \
 VPERM3(Chi_12,Chi_12) );

-#define MULT_ADDSUB_2SPIN1(ptr) \
-LOAD64(%r8,ptr)
-/*
- * __asm__ ( \
-  );
-  VMUL(Z0,%zmm2,%zmm3) \
-*/
-#define MULT_ADDSUB_2SPIN(ptr) \
-LOAD64(%r8,ptr) \
-__asm__ ( \
-VMOVIDUP(0,%r8,Z0 ) \
-VMOVIDUP(3,%r8,Z1 )\
-VMOVIDUP(6,%r8,Z2 )\
-VSHUF(Chi_00,T1) \
-VSHUF(Chi_10,T2) \
-\
-VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
-VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
-VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
-VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
-VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
-VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
-\
-VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
-VMADDSUB(Z3,Chi_10,UChi_10) VSHUF(Chi_11,T2) \
-VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
-VMADDSUB(Z4,Chi_10,UChi_11)\
-VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
-VMADDSUB(Z5,Chi_10,UChi_12)\
-\
-VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
-VMADDSUB(Z0,T2,UChi_10)\
-VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
-VMADDSUB(Z1,T2,UChi_11)\
-VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
-VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
-\
-VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
-VMADDSUB(Z3,Chi_11,UChi_10) VSHUF(Chi_12,T2) \
-VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
-VMADDSUB(Z4,Chi_11,UChi_11)\
-VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
-VMADDSUB(Z5,Chi_11,UChi_12)\
-\
-VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
-VMADDSUB(Z0,T2,UChi_10)\
-VMADDSUB(Z1,T1,UChi_01)\
-VMADDSUB(Z1,T2,UChi_11)\
-VMADDSUB(Z2,T1,UChi_02)\
-VMADDSUB(Z2,T2,UChi_12)\
-\
-VMADDSUB(Z3,Chi_02,UChi_00)\
-VMADDSUB(Z3,Chi_12,UChi_10)\
-VMADDSUB(Z4,Chi_02,UChi_01)\
-VMADDSUB(Z4,Chi_12,UChi_11)\
-VMADDSUB(Z5,Chi_02,UChi_02)\
-VMADDSUB(Z5,Chi_12,UChi_12)\
-);
-
-#define MULT_2SPIN(ptr) MULT_ADDSUB_2SPIN(ptr)
+#define MULT_ADDSUB_2SPIN(ptr,pf) \
+LOAD64(%r8,ptr) \
+LOAD64(%r9,pf) \
+__asm__ ( \
+VPREFETCH_G2(9,%r8) \
+VPREFETCH_G2(10,%r8) \
+VPREFETCH_G2(11,%r8) \
+VPREFETCH_G2(12,%r8) \
+VPREFETCH_G2(13,%r8) \
+VPREFETCH_G2(14,%r8) \
+VPREFETCH_G2(15,%r8) \
+VPREFETCH_G2(16,%r8) \
+VPREFETCH_G2(17,%r8) \
+VSHUF(Chi_00,T1) \
+VMOVIDUP(0,%r8,Z0 ) \
+VMOVIDUP(3,%r8,Z1 ) \
+VMOVIDUP(6,%r8,Z2 ) VSHUF(Chi_10,T2) \
+/*6*/ \
+VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3 ) \
+VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4 ) \
+VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5 ) \
+VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0 ) \
+VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1 ) \
+VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2 ) \
+VPREFETCH_M1(0,%r9) \
+VPREFETCH_M1(1,%r9) \
+VPREFETCH_M1(2,%r9) \
+VPREFETCH_M1(3,%r9) \
+/*18*/ \
+VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
+VMADDSUB(Z3,Chi_10,UChi_10) \
+VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3 ) \
+VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
+VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4 ) \
+VMADDSUB(Z5,Chi_10,UChi_12) \
+VPREFETCH_M1(4,%r9) \
+VPREFETCH_M1(5,%r9) \
+VPREFETCH_M1(6,%r9) \
+VPREFETCH_M1(7,%r9) \
+/*28*/ \
+VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5 ) \
+VMADDSUB(Z0,T2,UChi_10) \
+VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0 ) \
+VMADDSUB(Z1,T2,UChi_11) \
+VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1 ) \
+VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2 ) \
+VPREFETCH2(12,%r9) \
+VPREFETCH2(13,%r9) \
+VPREFETCH2(14,%r9) \
+VPREFETCH2(15,%r9) \
+VPREFETCH2(16,%r9) \
+VPREFETCH2(17,%r9) \
+VPREFETCH2(18,%r9) \
+VPREFETCH2(19,%r9) \
+VPREFETCH2(20,%r9) \
+VPREFETCH2(21,%r9) \
+VPREFETCH2(22,%r9) \
+VPREFETCH2(23,%r9) \
+/*38*/ \
+VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
+VMADDSUB(Z3,Chi_11,UChi_10) \
+VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3 ) \
+VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
+VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4 ) \
+VMADDSUB(Z5,Chi_11,UChi_12) \
+VPREFETCH_M1(9,%r8) \
+VPREFETCH_M1(10,%r8) \
+VPREFETCH_M1(11,%r8) \
+VPREFETCH_M1(12,%r8) \
+VPREFETCH_M1(13,%r8) \
+VPREFETCH_M1(14,%r8) \
+VPREFETCH_M1(15,%r8) \
+VPREFETCH_M1(16,%r8) \
+VPREFETCH_M1(17,%r8) \
+/*48*/ \
+VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5 ) \
+VMADDSUB(Z0,T2,UChi_10) \
+VMADDSUB(Z1,T1,UChi_01) \
+VMADDSUB(Z1,T2,UChi_11) \
+VMADDSUB(Z2,T1,UChi_02) \
+VMADDSUB(Z2,T2,UChi_12) \
+VPREFETCH_M1(8,%r9) \
+VPREFETCH_M1(9,%r9) \
+VPREFETCH_M1(10,%r9) \
+VPREFETCH_M1(11,%r9) \
+/*55*/ \
+VMADDSUB(Z3,Chi_02,UChi_00) \
+VMADDSUB(Z3,Chi_12,UChi_10) \
+VMADDSUB(Z4,Chi_02,UChi_01) \
+VMADDSUB(Z4,Chi_12,UChi_11) \
+VMADDSUB(Z5,Chi_02,UChi_02) \
+VMADDSUB(Z5,Chi_12,UChi_12) \
+/*61 insns*/ );
+
+#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
+LOAD64(%r8,ptr) \
+LOAD64(%r9,pf) \
+__asm__ ( \
+VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
+VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
+VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
+VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
+VPREFETCH_M1(0,%r9) \
+VPREFETCH_M1(1,%r9) \
+VPREFETCH_M1(2,%r9) \
+VPREFETCH_M1(3,%r9) \
+/*8*/ \
+VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
+VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
+VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
+VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
+VPREFETCH_M1(4,%r9) \
+VPREFETCH_M1(5,%r9) \
+VPREFETCH_M1(6,%r9) \
+VPREFETCH_M1(7,%r9) \
+/*16*/ \
+VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
+VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
+VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
+VPREFETCH_M1(8,%r9) \
+VPREFETCH_M1(9,%r9) \
+VPREFETCH_M1(10,%r9) \
+VPREFETCH_M1(11,%r9) \
+/*22*/ \
+VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
+VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
+VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
+VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
+VPREFETCH_M2(12,%r9) \
+VPREFETCH_M2(13,%r9) \
+VPREFETCH_M2(14,%r9) \
+VPREFETCH_M2(15,%r9) \
+/*30*/ \
+VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
+VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
+VPREFETCH_M2(16,%r9) \
+VPREFETCH_M2(17,%r9) \
+VPREFETCH_M2(18,%r9) \
+VPREFETCH_M2(19,%r9) \
+VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
+/*36*/ \
+VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
+VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
+VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
+VPREFETCH_M2(20,%r9) \
+VPREFETCH_M2(21,%r9) \
+VPREFETCH_M2(22,%r9) \
+VPREFETCH_M2(23,%r9) \
+VPREFETCH_G1(2,%r8) \
+VPREFETCH_G1(3,%r8) \
+VPREFETCH_G2(4,%r8) \
+VPREFETCH_G2(5,%r8) \
+VPREFETCH_G2(6,%r8) \
+VPREFETCH_G2(7,%r8) \
+/*42 insns*/ );
+
+#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
+LOAD64(%r8,ptr) \
+LOAD64(%r9,pf) \
+__asm__ ( \
+VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
+VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
+VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
+VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
+/*8*/ \
+VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
+VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
+VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
+VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
+/*16*/ \
+VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
+VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
+VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
+/*22*/ \
+VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
+VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
+VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
+VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
+/*30*/ \
+VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
+VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
+VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
+/*36*/ \
+VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
+VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
+VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
+/* VPREFETCH1(2,%r8)*/ \
+/* VPREFETCH1(3,%r8)*/ \
+/*42 insns*/ );
+
+#define Z6 Chi_00
+#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
+LOAD64(%r8,ptr) \
+__asm__ ( \
+VSHUFMEM(0,%r8,Z0) \
+VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
+VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
+VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
+VSHUFMEM(3,%r8,Z0) \
+VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
+VSHUFMEM(6,%r8,Z0) \
+VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
+VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
+VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
+VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
+/*11 cycles*/ \
+VSHUFMEM(1,%r8,Z0) \
+VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
+VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
+VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
+VSHUFMEM(4,%r8,Z0) \
+VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
+VSHUFMEM(7,%r8,Z0) \
+VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
+VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
+VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
+VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
+/*22 cycles*/ \
+VSHUFMEM(2,%r8,Z0) \
+VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
+VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
+VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
+VSHUFMEM(5,%r8,Z0) \
+VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
+VSHUFMEM(8,%r8,Z0) \
+VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
+/*33 cycles*/ \
+VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
+VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
+VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
+/*stall*/ \
+/*stall*/ \
+/*stall*/ \
+VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
+VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
+VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )

 #endif
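The VSHUF / VMULIDUP / VMADDSUBRDUP pairs in the 2-spin multiply macros above perform complex multiplication on interleaved (re,im) lanes using only real-part and imaginary-part broadcasts plus a fused multiply-add/subtract. A stand-alone sketch of that trick with AVX-512 intrinsics (the cmul helper and its data layout are illustrative, not the macros' exact expansion):

/* Sketch: multiply a vector of interleaved complex doubles by one complex
   coefficient (u_re + i*u_im) with a single permute, one multiply and one
   fmaddsub, mirroring the shuffle/duplicate pattern used above. */
#include <immintrin.h>

static inline __m512d cmul(__m512d x, double u_re, double u_im)
{
  __m512d x_swapped = _mm512_permute_pd(x, 0x55);               /* (im,re) in each pair      */
  __m512d t = _mm512_mul_pd(_mm512_set1_pd(u_im), x_swapped);   /* (x.im*u.im, x.re*u.im)    */
  /* even lanes: x.re*u.re - x.im*u.im ; odd lanes: x.im*u.re + x.re*u.im */
  return _mm512_fmaddsub_pd(_mm512_set1_pd(u_re), x, t);
}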
0 lib/stencil/.dirstamp (new file)
@@ -49,16 +49,25 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
 {
   grid = _grid;
   if ( Block[0]==0) ZGraph();
+  else if ( Block[1]==0) NoBlocking();
   else CartesianBlocking();
 }

+void LebesgueOrder::NoBlocking(void)
+{
+  std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;
+  _LebesgueReorder.resize(0);
+  for ( int s = 0 ; s!= grid->oSites();s++){
+    _LebesgueReorder.push_back(s);
+  }
+}
 void LebesgueOrder::CartesianBlocking(void)
 {
   _LebesgueReorder.resize(0);

-  std::cout << GridLogMessage << " CartesianBlocking ";
-  for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
-  std::cout<<std::endl;
+  std::cout << GridLogDebug << " CartesianBlocking ";
+  // for(int d=0;d<Block.size();d++) std::cout <<Block[d]<<" ";
+  // std::cout<<std::endl;

   IndexInteger ND = grid->_ndimension;

@@ -103,7 +112,9 @@ void LebesgueOrder::IterateI(int ND,
   } else {
     for(int d=0;d<ND;d++){
       x[d]=xi[d]+xo[d];
+      // std::cout << x[d]<<" ";
     }
+    // std::cout << "\n";
     IndexInteger index;
     Lexicographic::IndexFromCoor(x,index,grid->_rdimensions);
     _LebesgueReorder.push_back(index);
@@ -114,7 +125,8 @@ void LebesgueOrder::IterateI(int ND,
 void LebesgueOrder::ZGraph(void)
 {
   _LebesgueReorder.resize(0);

+  std::cout << GridLogDebug << " Lebesgue order "<<std::endl;
   // Align up dimensions to power of two.
   const IndexInteger one=1;

@@ -188,6 +200,7 @@ void LebesgueOrder::ZGraph(void)
   }
   assert( _LebesgueReorder.size() == vol );

+  /*
   std::vector<int> coor(4);
   for(IndexInteger asite=0;asite<vol;asite++){
     grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
@@ -198,5 +211,6 @@ void LebesgueOrder::ZGraph(void)
       << coor[3]<<"]"
       <<std::endl;
   }
+  */
 }
 }
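ZGraph builds the Lebesgue (Morton/Z-curve) visit order by interleaving the bits of the power-of-two-padded site coordinates, which tends to keep spatially neighbouring sites close together in the traversal. A minimal 2-D illustration of the bit-interleaving step (stand-alone sketch only; Grid's version is N-dimensional and handles the padding shown above):

/* Illustrative 2-D Morton index: spread the bits of each coordinate apart,
   then interleave them. */
#include <stdint.h>

static uint64_t part1by1(uint32_t x)            /* spread the low 16 bits of x apart */
{
  uint64_t v = x & 0xFFFF;
  v = (v | (v << 8)) & 0x00FF00FF;
  v = (v | (v << 4)) & 0x0F0F0F0F;
  v = (v | (v << 2)) & 0x33333333;
  v = (v | (v << 1)) & 0x55555555;
  return v;
}

static uint64_t morton2d(uint32_t x, uint32_t y)
{
  return part1by1(x) | (part1by1(y) << 1);      /* interleave x and y bits */
}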
@@ -59,6 +59,7 @@ namespace Grid {
   // Cartesian stencil blocking strategy
   /////////////////////////////////
   static std::vector<int> Block;
+  void NoBlocking(void);
   void CartesianBlocking(void);
   void IterateO(int ND,int dim,
                 std::vector<IndexInteger> & xo,
@@ -97,7 +97,7 @@ int main (int argc, char ** argv)
   RealD M5 =1.8;
   typename WilsonFermion5DR::ImplParams params;

-  WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,*sUrbGrid,M5,params);
+  WilsonFermion5DR Dw(1,Umu,*FGrid,*FrbGrid,*sUGrid,M5,params);

   Dw.Dhop(src,result,0);

@@ -27,9 +27,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #include <Grid.h>
 #include <PerfCount.h>

+int main(int argc,char **argv)
+{
+  return 0;
+}
+#if 0
 #include <simd/Intel512wilson.h>


 using namespace Grid;
 using namespace Grid::QCD;

@@ -478,3 +482,4 @@ void WilsonDslashAvx512F(void *ptr1,void *ptr2,void *ptr3)
   return;
 }

+#endif