Grid (mirror of https://github.com/paboyle/Grid.git)
Commit 9cb90f714e: Merge remote-tracking branch 'origin/develop' into temporary-smearing
.gitignore (vendored): 2 changed lines

@@ -94,7 +94,7 @@ Thumbs.db
 
 # build directory #
 ###################
-build/*
+build*/*
 
 # IDE related files #
 #####################
.travis.yml (new file): 90 lines

@@ -0,0 +1,90 @@
+language: cpp
+
+cache:
+  directories:
+    - clang
+
+matrix:
+  include:
+    - os: osx
+      osx_image: xcode7.2
+      compiler: clang
+    - os: osx
+      osx_image: xcode7.2
+      compiler: gcc
+      env: VERSION=-5
+    - compiler: gcc
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.9
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - binutils-dev
+      env: VERSION=-4.9
+    - compiler: gcc
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-5
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - binutils-dev
+      env: VERSION=-5
+    - compiler: clang
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.8
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - binutils-dev
+      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
+    - compiler: clang
+      addons:
+        apt:
+          sources:
+            - ubuntu-toolchain-r-test
+          packages:
+            - g++-4.8
+            - libmpfr-dev
+            - libgmp-dev
+            - libmpc-dev
+            - binutils-dev
+      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
+
+before_install:
+    - export GRIDDIR=`pwd`
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
+    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
+
+install:
+    - export CC=$CC$VERSION
+    - export CXX=$CXX$VERSION
+    - echo $PATH
+    - which $CC
+    - $CC --version
+    - which $CXX
+    - $CXX --version
+    - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
+
+script:
+    - ./scripts/reconfigure_script
+    - mkdir build
+    - cd build
+    - ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none
+    - make -j4
+    - ./benchmarks/Benchmark_dwf --threads 1
README.md

@@ -1,4 +1,4 @@
-# Grid
+# Grid [![Build Status](https://travis-ci.org/paboyle/Grid.svg?branch=master)](https://travis-ci.org/paboyle/Grid)
 Data parallel C++ mathematical object library
 
 Last update 2015/7/30
benchmarks/Benchmark_dwf.cc

@@ -27,6 +27,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 *************************************************************************************/
 /* END LEGAL */
 #include <Grid.h>
+#include <PerfCount.h>
 
 using namespace std;
 using namespace Grid;
@@ -45,6 +46,10 @@ struct scal {
 };
 
 bool overlapComms = false;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplR> WilsonFermion5DR;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+typedef WilsonFermion5D<DomainWallRedBlack5dImplD> WilsonFermion5DD;
+
 
 int main (int argc, char ** argv)
 {
@@ -64,6 +69,12 @@ int main (int argc, char ** argv)
   GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
   GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
 
+  std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
+  GridCartesian         * sUGrid  = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
+  GridCartesian         * sFGrid  = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
+  std::cout << GridLogMessage << "Making s innermost rb grids"<<std::endl;
+  GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
+
   std::vector<int> seeds4({1,2,3,4});
   std::vector<int> seeds5({5,6,7,8});
 
@@ -78,7 +89,9 @@ int main (int argc, char ** argv)
 
   ColourMatrix cm = Complex(1.0,0.0);
 
-  LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
+  LatticeGaugeField Umu(UGrid);
+  random(RNG4,Umu);
+
   LatticeGaugeField Umu5d(FGrid);
 
   // replicate across fifth dimension
@@ -119,14 +132,21 @@ int main (int argc, char ** argv)
 
   RealD NP = UGrid->_Nprocessors;
 
+  for(int doasm=1;doasm<2;doasm++){
+
+  QCD::WilsonKernelsStatic::AsmOpt=doasm;
+
   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
 
   std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall=1000;
-  {
+  int ncall =10;
+  if (1) {
 
     double t0=usecond();
     for(int i=0;i<ncall;i++){
+      __SSC_START;
       Dw.Dhop(src,result,0);
+      __SSC_STOP;
     }
     double t1=usecond();
 
@@ -140,9 +160,121 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
     err = ref-result;
     std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
-    Dw.Report();
+    // Dw.Report();
   }
 
+  if (1)
+  {
+    typedef WilsonFermion5D<DomainWallRedBlack5dImplF> WilsonFermion5DF;
+    LatticeFermionF ssrc(sFGrid);
+    LatticeFermionF sref(sFGrid);
+    LatticeFermionF sresult(sFGrid);
+    WilsonFermion5DF sDw(1,Umu,*sFGrid,*sFrbGrid,*sUGrid,M5,params);
+
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVectorF tmp;
+      peekSite(tmp,src,site);
+      pokeSite(tmp,ssrc,site);
+    }}}}}
+
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      __SSC_START;
+      sDw.Dhop(ssrc,sresult,0);
+      __SSC_STOP;
+    }
+    double t1=usecond();
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called Dw sinner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
+    // sDw.Report();
+
+    if(0){
+      for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+        sDw.Dhop(ssrc,sresult,0);
+        PerformanceCounter Counter(i);
+        Counter.Start();
+        sDw.Dhop(ssrc,sresult,0);
+        Counter.Stop();
+        Counter.Report();
+      }
+    }
+
+
+
+    RealF sum=0;
+    for(int x=0;x<latt4[0];x++){
+    for(int y=0;y<latt4[1];y++){
+    for(int z=0;z<latt4[2];z++){
+    for(int t=0;t<latt4[3];t++){
+    for(int s=0;s<Ls;s++){
+      std::vector<int> site({s,x,y,z,t});
+      SpinColourVectorF normal, simd;
+      peekSite(normal,result,site);
+      peekSite(simd,sresult,site);
+      sum=sum+norm2(normal-simd);
+      // std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
+      // std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<normal<<std::endl;
+      // std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<simd<<std::endl;
+    }}}}}
+    std::cout<<" difference between normal and simd is "<<sum<<std::endl;
+
+
+    if (1) {
+
+      LatticeFermionF sr_eo(sFGrid);
+      LatticeFermionF serr(sFGrid);
+
+      LatticeFermion ssrc_e (sFrbGrid);
+      LatticeFermion ssrc_o (sFrbGrid);
+      LatticeFermion sr_e   (sFrbGrid);
+      LatticeFermion sr_o   (sFrbGrid);
+
+      pickCheckerboard(Even,ssrc_e,ssrc);
+      pickCheckerboard(Odd,ssrc_o,ssrc);
+
+      setCheckerboard(sr_eo,ssrc_o);
+      setCheckerboard(sr_eo,ssrc_e);
+      serr = sr_eo-ssrc;
+      std::cout<<GridLogMessage << "EO src norm diff "<< norm2(serr)<<std::endl;
+
+      sr_e = zero;
+      sr_o = zero;
+
+      double t0=usecond();
+      for(int i=0;i<ncall;i++){
+        sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      }
+      double t1=usecond();
+
+      double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+      double flops=(1344.0*volume*ncall)/2;
+
+      std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
+      std::cout<<GridLogMessage << "sDeo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
+
+      sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
+      sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
+      sDw.Dhop  (ssrc  ,sresult,DaggerNo);
+
+      pickCheckerboard(Even,ssrc_e,sresult);
+      pickCheckerboard(Odd ,ssrc_o,sresult);
+      ssrc_e = ssrc_e - sr_e;
+      std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<<std::endl;
+      ssrc_o = ssrc_o - sr_o;
+      std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<<std::endl;
+    }
+
+
+  }
 
 if (1)
 { // Naive wilson dag implementation
@@ -197,7 +329,6 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
   std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
   }
-
   Dw.DhopEO(src_o,r_e,DaggerNo);
   Dw.DhopOE(src_e,r_o,DaggerNo);
   Dw.Dhop  (src ,result,DaggerNo);
@@ -217,5 +348,8 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
 
+
+  }
+
   Grid_finalize();
 }
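The benchmark's performance figures follow from a fixed cost of 1344 floating-point operations per lattice site per Dhop call: flops = 1344 x (Ls x 4d volume) x ncall, and dividing by the elapsed time in microseconds gives Mflop/s directly. Below is a minimal standalone sketch of that arithmetic; the lattice extents, Ls, and timings are illustrative values, not taken from this commit.

```cpp
// Standalone sketch (not part of the Grid sources) of the mflop/s arithmetic
// used in the benchmark above. All numbers here are hypothetical.
#include <cstdio>

int main () {
  const double flops_per_site = 1344.0;   // Dhop flop count assumed by the benchmark
  int latt[4] = {16,16,16,16};            // hypothetical local 4d lattice
  int Ls      = 16;                       // fifth-dimension extent
  int ncall   = 10;                       // number of timed Dhop calls
  double t0_us = 0.0, t1_us = 250000.0;   // pretend usecond() readings

  double volume = Ls;
  for(int mu=0; mu<4; mu++) volume *= latt[mu];

  double flops = flops_per_site * volume * ncall;
  // flops divided by microseconds is Mflop/s directly (the factor 1e6 cancels)
  std::printf("mflop/s = %f\n", flops/(t1_us-t0_us));
  return 0;
}
```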
benchmarks/Benchmark_dwf_ntpf.cc (new file): 154 lines

@@ -0,0 +1,154 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./benchmarks/Benchmark_dwf.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /* END LEGAL */
+#include <Grid.h>
+#include <PerfCount.h>
+
+using namespace std;
+using namespace Grid;
+using namespace Grid::QCD;
+
+template<class d>
+struct scal {
+  d internal;
+};
+
+  Gamma::GammaMatrix Gmu [] = {
+    Gamma::GammaX,
+    Gamma::GammaY,
+    Gamma::GammaZ,
+    Gamma::GammaT
+  };
+
+bool overlapComms = false;
+
+
+int main (int argc, char ** argv)
+{
+  Grid_init(&argc,&argv);
+
+  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
+    overlapComms = true;
+  }
+
+  int threads = GridThread::GetThreads();
+  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
+
+  std::vector<int> latt4 = GridDefaultLatt();
+  const int Ls=16;
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+
+  LatticeFermion src   (FGrid); random(RNG5,src);
+  LatticeFermion result(FGrid); result=zero;
+  LatticeFermion    ref(FGrid);    ref=zero;
+  LatticeFermion    tmp(FGrid);
+  LatticeFermion    err(FGrid);
+
+  ColourMatrix cm = Complex(1.0,0.0);
+
+  LatticeGaugeField Umu(UGrid);
+  random(RNG4,Umu);
+
+  LatticeGaugeField Umu5d(FGrid);
+
+  // replicate across fifth dimension
+  for(int ss=0;ss<Umu._grid->oSites();ss++){
+    for(int s=0;s<Ls;s++){
+      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
+    }
+  }
+
+  ////////////////////////////////////
+  // Naive wilson implementation
+  ////////////////////////////////////
+  std::vector<LatticeColourMatrix> U(4,FGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
+  }
+
+  if (1)
+  {
+    ref = zero;
+    for(int mu=0;mu<Nd;mu++){
+
+      tmp = U[mu]*Cshift(src,mu+1,1);
+      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
+
+      tmp =adj(U[mu])*src;
+      tmp =Cshift(tmp,mu+1,-1);
+      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
+    }
+    ref = -0.5*ref;
+  }
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+
+  typename DomainWallFermionR::ImplParams params;
+  params.overlapCommsCompute = overlapComms;
+
+  RealD NP = UGrid->_Nprocessors;
+
+
+  QCD::WilsonKernelsStatic::AsmOpt=1;
+
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
+
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  int ncall =50;
+  if (1) {
+
+    double t0=usecond();
+    for(int i=0;i<ncall;i++){
+      Dw.Dhop(src,result,0);
+    }
+    double t1=usecond();
+
+    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+    double flops=1344*volume*ncall;
+
+    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
+    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
+    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    err = ref-result;
+    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+    // Dw.Report();
+  }
+  Grid_finalize();
+}
benchmarks/Benchmark_zmm.cc (new file): 172 lines

@@ -0,0 +1,172 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./tests/Test_zmm.cc
+
+    Copyright (C) 2015
+
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /* END LEGAL */
+#include <Grid.h>
+#include <PerfCount.h>
+
+
+using namespace Grid;
+using namespace Grid::QCD;
+
+
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
+
+int main(int argc,char **argv)
+{
+  Grid_init(&argc,&argv);
+  std::ofstream os("zmm.dat");
+
+  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
+  for(int L=4;L<=32;L+=4){
+    for(int m=1;m<=2;m++){
+      for(int Ls=8;Ls<=16;Ls+=8){
+        std::vector<int> grid({L,L,m*L,m*L});
+        for(int i=0;i<4;i++) {
+          std::cout << grid[i]<<"x";
+        }
+        std::cout << Ls<<std::endl;
+        bench(os,grid,Ls);
+      }
+    }
+  }
+}
+
+int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
+{
+
+  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
+  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
+  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
+  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
+
+  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
+  std::vector<int> mpi_layout  = GridDefaultMpi();
+  int threads = GridThread::GetThreads();
+
+  std::vector<int> seeds4({1,2,3,4});
+  std::vector<int> seeds5({5,6,7,8});
+
+  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
+
+  LatticeFermion src (FGrid);
+  LatticeFermion tmp (FGrid);
+  LatticeFermion srce(FrbGrid);
+
+  LatticeFermion resulto(FrbGrid); resulto=zero;
+  LatticeFermion resulta(FrbGrid); resulta=zero;
+  LatticeFermion junk(FrbGrid); junk=zero;
+  LatticeFermion diff(FrbGrid);
+  LatticeGaugeField Umu(UGrid);
+
+  double mfc, mfa, mfo, mfl1;
+
+  GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
+  GridParallelRNG RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
+  random(RNG5,src);
+#if 1
+  random(RNG4,Umu);
+#else
+  int mmu=2;
+  std::vector<LatticeColourMatrix> U(4,UGrid);
+  for(int mu=0;mu<Nd;mu++){
+    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
+    if ( mu!=mmu ) U[mu] = zero;
+    if ( mu==mmu ) U[mu] = 1.0;
+    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
+  }
+#endif
+  pickCheckerboard(Even,srce,src);
+
+  RealD mass=0.1;
+  RealD M5  =1.8;
+  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+
+  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
+  int ncall=50;
+  double t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulto,0);
+  }
+  double t1=usecond();
+
+  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
+  double flops=1344*volume/2;
+
+  mfc = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s =   "<< mfc<<std::endl;
+
+  QCD::WilsonKernelsStatic::AsmOpt=1;
+  t0=usecond();
+  for(int i=0;i<ncall;i++){
+    Dw.DhopOE(srce,resulta,0);
+  }
+  t1=usecond();
+  mfa = flops*ncall/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s =   "<< mfa<<std::endl;
+  /*
+  int dag=DaggerNo;
+  t0=usecond();
+  for(int i=0;i<1;i++){
+    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
+  }
+  t1=usecond();
+  mfo = flops*100/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;
+
+  t0=usecond();
+  for(int i=0;i<1;i++){
+    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
+  }
+  t1=usecond();
+  mfl1= flops*100/(t1-t0);
+  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
+  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
+     << mfc<<" "
+     << mfa<<" "
+     << mfo<<" "
+     << mfl1<<std::endl;
+  */
+
+#if 0
+  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
+    Dw.DhopOE(srce,resulta,0);
+    PerformanceCounter Counter(i);
+    Counter.Start();
+    Dw.DhopOE(srce,resulta,0);
+    Counter.Stop();
+    Counter.Report();
+  }
+#endif
+  //resulta = (-0.5) * resulta;
+
+  diff = resulto-resulta;
+  std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl;
+  std::cout<<std::endl;
+  return 0;
+}
benchmarks/Makefile.am

@@ -1,5 +1,5 @@
 
-bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson
+bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_dwf_ntpf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
 
 
 Benchmark_comms_SOURCES=Benchmark_comms.cc
@@ -10,6 +10,10 @@ Benchmark_dwf_SOURCES=Benchmark_dwf.cc
 Benchmark_dwf_LDADD=-lGrid
 
 
+Benchmark_dwf_ntpf_SOURCES=Benchmark_dwf_ntpf.cc
+Benchmark_dwf_ntpf_LDADD=-lGrid
+
+
 Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
 Benchmark_memory_asynch_LDADD=-lGrid
 
@@ -25,3 +29,7 @@ Benchmark_su3_LDADD=-lGrid
 Benchmark_wilson_SOURCES=Benchmark_wilson.cc
 Benchmark_wilson_LDADD=-lGrid
 
+
+Benchmark_zmm_SOURCES=Benchmark_zmm.cc
+Benchmark_zmm_LDADD=-lGrid
+
configure (vendored): 109 changed lines

@@ -626,12 +626,18 @@ ac_subst_vars='am__EXEEXT_FALSE
 am__EXEEXT_TRUE
 LTLIBOBJS
 LIBOBJS
+USE_LAPACK_LIB_FALSE
+USE_LAPACK_LIB_TRUE
+USE_LAPACK_FALSE
+USE_LAPACK_TRUE
 BUILD_CHROMA_REGRESSION_FALSE
 BUILD_CHROMA_REGRESSION_TRUE
 BUILD_COMMS_NONE_FALSE
 BUILD_COMMS_NONE_TRUE
 BUILD_COMMS_MPI_FALSE
 BUILD_COMMS_MPI_TRUE
+BUILD_COMMS_SHMEM_FALSE
+BUILD_COMMS_SHMEM_TRUE
 BUILD_ZMM_FALSE
 BUILD_ZMM_TRUE
 EGREP
@@ -751,7 +757,9 @@ enable_simd
 enable_precision
 enable_comms
 enable_rng
+enable_timers
 enable_chroma
+enable_lapack
 '
 ac_precious_vars='build_alias
 host_alias
@@ -1410,7 +1418,9 @@ Optional Features:
   --enable-comms=none|mpi Select communications
   --enable-rng=ranlux48|mt19937
                           Select Random Number Generator to be used
+  --enable-timers=yes|no  Enable system dependent high res timers
   --enable-chroma         Expect chroma compiled under c++11
+  --enable-lapack         Enable lapack yes/no
 
 Some influential environment variables:
   CXX         C++ compiler command
@@ -6410,7 +6420,7 @@ if test "${enable_simd+set}" = set; then :
   enableval=$enable_simd; \
      ac_SIMD=${enable_simd}
 else
-  ac_SIMD=AVX2
+  ac_SIMD=DEBUG
 fi
 
 
@@ -6477,7 +6487,7 @@ $as_echo "#define AVX512 1" >>confdefs.h
 $as_echo "#define IMCI 1" >>confdefs.h
 
      supported="cross compilation"
-     ac_ZMM=yes;
+     ac_ZMM=no;
      ;;
   NEONv8)
      echo Configuring for experimental ARMv8a support
@@ -6561,12 +6571,26 @@ $as_echo "#define GRID_COMMS_NONE 1" >>confdefs.h
 
 $as_echo "#define GRID_COMMS_MPI 1" >>confdefs.h
 
+     ;;
+  shmem)
+     echo Configuring for SHMEM communications
+
+$as_echo "#define GRID_COMMS_SHMEM 1" >>confdefs.h
+
      ;;
   *)
      as_fn_error $? "${ac_COMMS} unsupported --enable-comms option" "$LINENO" 5;
      ;;
 esac
 
+ if test "X${ac_COMMS}X" == "XshmemX" ; then
+  BUILD_COMMS_SHMEM_TRUE=
+  BUILD_COMMS_SHMEM_FALSE='#'
+else
+  BUILD_COMMS_SHMEM_TRUE='#'
+  BUILD_COMMS_SHMEM_FALSE=
+fi
+
  if test "X${ac_COMMS}X" == "XmpiX" ; then
   BUILD_COMMS_MPI_TRUE=
   BUILD_COMMS_MPI_FALSE='#'
@@ -6610,6 +6634,34 @@ $as_echo "#define RNG_MT19937 1" >>confdefs.h
      as_fn_error $? "${ac_RNG} unsupported --enable-rng option" "$LINENO" 5;
      ;;
 esac
+
+#
+# SDE timing mode
+#
+# Check whether --enable-timers was given.
+if test "${enable_timers+set}" = set; then :
+  enableval=$enable_timers; \
+  ac_TIMERS=${enable_timers}
+else
+  ac_TIMERS=yes
+fi
+
+case ${ac_TIMERS} in
+  yes)
+
+$as_echo "#define TIMERS_ON 1" >>confdefs.h
+
+  ;;
+  no)
+
+$as_echo "#define TIMERS_OFF 1" >>confdefs.h
+
+  ;;
+  *)
+     as_fn_error $? "${ac_TIMERS} unsupported --enable-timers option" "$LINENO" 5;
+  ;;
+esac
+
 #
 # Chroma regression tests
 #
@@ -6642,6 +6694,46 @@ else
 fi
 
 
+#
+# Lapack
+#
+# Check whether --enable-lapack was given.
+if test "${enable_lapack+set}" = set; then :
+  enableval=$enable_lapack; ac_LAPACK=${enable_lapack}
+else
+  ac_LAPACK=no
+fi
+
+
+case ${ac_LAPACK} in
+  yes)
+    echo Enabling lapack
+  ;;
+  no)
+    echo Disabling lapack
+  ;;
+  *)
+    echo Enabling lapack at ${ac_LAPACK}
+  ;;
+esac
+
+ if test "X${ac_LAPACK}X" != "XnoX" ; then
+  USE_LAPACK_TRUE=
+  USE_LAPACK_FALSE='#'
+else
+  USE_LAPACK_TRUE='#'
+  USE_LAPACK_FALSE=
+fi
+
+ if test "X${ac_LAPACK}X" != "XyesX" ; then
+  USE_LAPACK_LIB_TRUE=
+  USE_LAPACK_LIB_FALSE='#'
+else
+  USE_LAPACK_LIB_TRUE='#'
+  USE_LAPACK_LIB_FALSE=
+fi
+
+
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
@@ -6809,6 +6901,10 @@ if test -z "${BUILD_ZMM_TRUE}" && test -z "${BUILD_ZMM_FALSE}"; then
   as_fn_error $? "conditional \"BUILD_ZMM\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${BUILD_COMMS_SHMEM_TRUE}" && test -z "${BUILD_COMMS_SHMEM_FALSE}"; then
+  as_fn_error $? "conditional \"BUILD_COMMS_SHMEM\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
   as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -6821,6 +6917,14 @@ if test -z "${BUILD_CHROMA_REGRESSION_TRUE}" && test -z "${BUILD_CHROMA_REGRESSI
   as_fn_error $? "conditional \"BUILD_CHROMA_REGRESSION\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${USE_LAPACK_TRUE}" && test -z "${USE_LAPACK_FALSE}"; then
+  as_fn_error $? "conditional \"USE_LAPACK\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
+if test -z "${USE_LAPACK_LIB_TRUE}" && test -z "${USE_LAPACK_LIB_FALSE}"; then
+  as_fn_error $? "conditional \"USE_LAPACK_LIB\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 
 : "${CONFIG_STATUS=./config.status}"
 ac_write_fail=0
@@ -8167,6 +8271,7 @@ The following features are enabled:
 - communications type            : ${ac_COMMS}
 - default precision              : ${ac_PRECISION}
 - RNG choice                     : ${ac_RNG}
+- LAPACK                         : ${ac_LAPACK}
 
 
 "
configure.ac: 49 changed lines

@@ -71,7 +71,7 @@ AC_CHECK_FUNCS([gettimeofday])
 
 AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
             [Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
-            [ac_SIMD=${enable_simd}],[ac_SIMD=AVX2])
+            [ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG])
 
 supported=no
 
@@ -124,7 +124,7 @@ case ${ac_SIMD} in
      echo Configuring for IMCI
      AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] )
      supported="cross compilation"
-     ac_ZMM=yes;
+     ac_ZMM=no;
      ;;
   NEONv8)
      echo Configuring for experimental ARMv8a support
@@ -178,11 +178,16 @@ case ${ac_COMMS} in
      echo Configuring for MPI communications
      AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
      ;;
+  shmem)
+     echo Configuring for SHMEM communications
+     AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
+     ;;
   *)
      AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
      ;;
 esac
 
+AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 
@@ -203,6 +208,25 @@ case ${ac_RNG} in
      AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
      ;;
 esac
+
+#
+# SDE timing mode
+#
+AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers=yes|no],\
+            [Enable system dependent high res timers])],\
+            [ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
+case ${ac_TIMERS} in
+  yes)
+     AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
+  ;;
+  no)
+     AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
+  ;;
+  *)
+     AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
+  ;;
+esac
+
 #
 # Chroma regression tests
 #
@@ -222,6 +246,26 @@ esac
 
 AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
 
+#
+# Lapack
+#
+AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
+
+case ${ac_LAPACK} in
+  yes)
+    echo Enabling lapack
+  ;;
+  no)
+    echo Disabling lapack
+  ;;
+  *)
+    echo Enabling lapack at ${ac_LAPACK}
+  ;;
+esac
+
+AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ])
+AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ])
+
 ###################################################################
 # Checks for doxygen support
 # if present enables the "make doxyfile" command
@@ -265,6 +309,7 @@ The following features are enabled:
 - communications type            : ${ac_COMMS}
 - default precision              : ${ac_PRECISION}
 - RNG choice                     : ${ac_RNG}
+- LAPACK                         : ${ac_LAPACK}
 
 
 "
lib/AlignedAllocator.h

@@ -36,11 +36,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <malloc.h>
 #endif
 
-#include <immintrin.h>
 #ifdef HAVE_MM_MALLOC_H
 #include <mm_malloc.h>
 #endif
 
+#ifdef GRID_COMMS_SHMEM
+extern "C" {
+#include <mpp/shmem.h>
+extern void * shmem_align(size_t, size_t);
+extern void shmem_free(void *);
+}
+#endif
+
 namespace Grid {
 
 ////////////////////////////////////////////////////////////////////
@@ -72,21 +79,59 @@ public:
 
   size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 
-  pointer allocate(size_type __n, const void* = 0)
+  pointer allocate(size_type __n, const void* _p= 0)
   {
+#ifdef GRID_COMMS_SHMEM
+
+    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+
+
+#define PARANOID_SYMMETRIC_HEAP
+#ifdef PARANOID_SYMMETRIC_HEAP
+    static void * bcast;
+    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
+
+    bcast = (void *) ptr;
+    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
+
+    if ( bcast != ptr ) {
+      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
+      BACKTRACEFILE();
+      exit(0);
+    }
+
+    assert( bcast == (void *) ptr);
+
+#endif
+#else
+
 #ifdef HAVE_MM_MALLOC_H
     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
 
+#endif
+    _Tp tmp;
+#undef FIRST_TOUCH_OPTIMISE
+#ifdef FIRST_TOUCH_OPTIMISE
+#pragma omp parallel for
+    for(int i=0;i<__n;i++){
+      ptr[i]=tmp;
+    }
+#endif
     return ptr;
   }
 
   void deallocate(pointer __p, size_type) {
+#ifdef GRID_COMMS_SHMEM
+    shmem_free((void *)__p);
+#else
 #ifdef HAVE_MM_MALLOC_H
     _mm_free((void *)__p);
 #else
     free((void *)__p);
+#endif
 #endif
   }
   void construct(pointer __p, const _Tp& __val) { };
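These hunks keep the same STL-allocator shape while switching the underlying allocation between shmem_align on the symmetric heap (under GRID_COMMS_SHMEM) and _mm_malloc/memalign. The sketch below, which is not Grid's code, shows how an allocator of that shape plugs into a standard container; plain POSIX posix_memalign stands in here for the back ends above.

```cpp
// Minimal aligned-allocator sketch (assumption: POSIX posix_memalign is available).
#include <cstdint>
#include <cstdlib>
#include <new>
#include <vector>

template<typename T>
struct Aligned128Allocator {
  typedef T value_type;
  Aligned128Allocator() = default;
  template<typename U> Aligned128Allocator(const Aligned128Allocator<U>&) {}

  T* allocate(std::size_t n) {
    void *ptr = nullptr;
    // 128-byte alignment, matching the alignment requested in the hunk above
    if (posix_memalign(&ptr, 128, n*sizeof(T)) != 0) throw std::bad_alloc();
    return static_cast<T*>(ptr);
  }
  void deallocate(T* p, std::size_t) { std::free(p); }
};

template<typename T, typename U>
bool operator==(const Aligned128Allocator<T>&, const Aligned128Allocator<U>&) { return true; }
template<typename T, typename U>
bool operator!=(const Aligned128Allocator<T>&, const Aligned128Allocator<U>&) { return false; }

int main() {
  // A container built on the allocator gets 128-byte aligned storage,
  // which wide SIMD loads/stores in lattice code rely on.
  std::vector<double, Aligned128Allocator<double>> field(1024, 0.0);
  return (reinterpret_cast<std::uintptr_t>(field.data()) % 128) == 0 ? 0 : 1;
}
```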
lib/Config.h.in (deleted): 180 lines removed

@@ -1,180 +0,0 @@
-/* lib/Config.h.in.  Generated from configure.ac by autoheader.  */
-
-/* AVX Intrinsics */
-#undef AVX1
-
-/* AVX2 Intrinsics */
-#undef AVX2
-
-/* AVX512 Intrinsics for Knights Landing */
-#undef AVX512
-
-/* AVX Intrinsics with FMA4 */
-#undef AVXFMA4
-
-/* EMPTY_SIMD only for DEBUGGING */
-#undef EMPTY_SIMD
-
-/* GRID_COMMS_MPI */
-#undef GRID_COMMS_MPI
-
-/* GRID_COMMS_NONE */
-#undef GRID_COMMS_NONE
-
-/* GRID_DEFAULT_PRECISION is DOUBLE */
-#undef GRID_DEFAULT_PRECISION_DOUBLE
-
-/* GRID_DEFAULT_PRECISION is SINGLE */
-#undef GRID_DEFAULT_PRECISION_SINGLE
-
-/* Support Altivec instructions */
-#undef HAVE_ALTIVEC
-
-/* Support AVX (Advanced Vector Extensions) instructions */
-#undef HAVE_AVX
-
-/* Support AVX2 (Advanced Vector Extensions 2) instructions */
-#undef HAVE_AVX2
-
-/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
-   don't. */
-#undef HAVE_DECL_BE64TOH
-
-/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
-   */
-#undef HAVE_DECL_NTOHLL
-
-/* Define to 1 if you have the <endian.h> header file. */
-#undef HAVE_ENDIAN_H
-
-/* Define to 1 if you have the <execinfo.h> header file. */
-#undef HAVE_EXECINFO_H
-
-/* Support FMA3 (Fused Multiply-Add) instructions */
-#undef HAVE_FMA
-
-/* Define to 1 if you have the `gettimeofday' function. */
-#undef HAVE_GETTIMEOFDAY
-
-/* Define to 1 if you have the <gmp.h> header file. */
-#undef HAVE_GMP_H
-
-/* Define to 1 if you have the <inttypes.h> header file. */
-#undef HAVE_INTTYPES_H
-
-/* Define to 1 if you have the <malloc.h> header file. */
-#undef HAVE_MALLOC_H
-
-/* Define to 1 if you have the <malloc/malloc.h> header file. */
-#undef HAVE_MALLOC_MALLOC_H
-
-/* Define to 1 if you have the <memory.h> header file. */
-#undef HAVE_MEMORY_H
-
-/* Support mmx instructions */
-#undef HAVE_MMX
-
-/* Define to 1 if you have the <mm_malloc.h> header file. */
-#undef HAVE_MM_MALLOC_H
-
-/* Support SSE (Streaming SIMD Extensions) instructions */
-#undef HAVE_SSE
-
-/* Support SSE2 (Streaming SIMD Extensions 2) instructions */
-#undef HAVE_SSE2
-
-/* Support SSE3 (Streaming SIMD Extensions 3) instructions */
-#undef HAVE_SSE3
-
-/* Support SSSE4.1 (Streaming SIMD Extensions 4.1) instructions */
-#undef HAVE_SSE4_1
-
-/* Support SSSE4.2 (Streaming SIMD Extensions 4.2) instructions */
-#undef HAVE_SSE4_2
-
-/* Support SSSE3 (Supplemental Streaming SIMD Extensions 3) instructions */
-#undef HAVE_SSSE3
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#undef HAVE_STDINT_H
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#undef HAVE_STDLIB_H
-
-/* Define to 1 if you have the <strings.h> header file. */
-#undef HAVE_STRINGS_H
-
-/* Define to 1 if you have the <string.h> header file. */
-#undef HAVE_STRING_H
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#undef HAVE_SYS_STAT_H
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#undef HAVE_SYS_TYPES_H
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#undef HAVE_UNISTD_H
-
-/* IMCI Intrinsics for Knights Corner */
-#undef IMCI
-
-/* NEON ARMv8 Experimental support */
-#undef NEONv8
-
-/* Name of package */
-#undef PACKAGE
-
-/* Define to the address where bug reports for this package should be sent. */
-#undef PACKAGE_BUGREPORT
-
-/* Define to the full name of this package. */
-#undef PACKAGE_NAME
-
-/* Define to the full name and version of this package. */
-#undef PACKAGE_STRING
-
-/* Define to the one symbol short name of this package. */
-#undef PACKAGE_TARNAME
-
-/* Define to the home page for this package. */
-#undef PACKAGE_URL
-
-/* Define to the version of this package. */
-#undef PACKAGE_VERSION
-
-/* RNG_MT19937 */
-#undef RNG_MT19937
-
-/* RNG_RANLUX */
-#undef RNG_RANLUX
-
-/* SSE4 Intrinsics */
-#undef SSE4
-
-/* Define to 1 if you have the ANSI C header files. */
-#undef STDC_HEADERS
-
-/* Version number of package */
-#undef VERSION
-
-/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
-   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
-   #define below would cause a syntax error. */
-#undef _UINT32_T
-
-/* Define for Solaris 2.5.1 so the uint64_t typedef from <sys/synch.h>,
-   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
-   #define below would cause a syntax error. */
-#undef _UINT64_T
-
-/* Define to `unsigned int' if <sys/types.h> does not define. */
-#undef size_t
-
-/* Define to the type of an unsigned integer type of width exactly 32 bits if
-   such a type exists and the standard includes do not define it. */
-#undef uint32_t
-
-/* Define to the type of an unsigned integer type of width exactly 64 bits if
-   such a type exists and the standard includes do not define it. */
-#undef uint64_t
lib/Cshift.h

@@ -37,4 +37,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI
 #include <cshift/Cshift_mpi.h>
 #endif
+
+#ifdef GRID_COMMS_SHMEM
+#include <cshift/Cshift_mpi.h> // uses same implementation of communicator
+#endif
 #endif
lib/Grid.h

@@ -62,10 +62,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <serialisation/Serialisation.h>
 #include <Config.h>
 #include <Timer.h>
+#include <PerfCount.h>
 #include <Log.h>
 #include <AlignedAllocator.h>
 #include <Simd.h>
 #include <Threads.h>
+#include <Lexicographic.h>
 #include <Communicator.h>
 #include <Cartesian.h>
 #include <Tensors.h>
47
lib/Init.cc
47
lib/Init.cc
@ -45,12 +45,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
|
|
||||||
#define __X86_64
|
|
||||||
|
|
||||||
#ifdef HAVE_EXECINFO_H
|
|
||||||
#include <execinfo.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
@ -150,6 +144,10 @@ void GridParseLayout(char **argv,int argc,
|
|||||||
}
|
}
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
|
if( GridCmdOptionExists(argv,argv+argc,"--threads") ){
|
||||||
std::vector<int> ompthreads(0);
|
std::vector<int> ompthreads(0);
|
||||||
|
#ifndef GRID_OMP
|
||||||
|
std::cout << GridLogWarning << "'--threads' option used but Grid was"
|
||||||
|
<< " not compiled with thread support" << std::endl;
|
||||||
|
#endif
|
||||||
arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
|
arg= GridCmdOptionPayload(argv,argv+argc,"--threads");
|
||||||
GridCmdOptionIntVector(arg,ompthreads);
|
GridCmdOptionIntVector(arg,ompthreads);
|
||||||
assert(ompthreads.size()==1);
|
assert(ompthreads.size()==1);
|
||||||
@ -174,9 +172,8 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
|
|||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
void Grid_init(int *argc,char ***argv)
|
void Grid_init(int *argc,char ***argv)
|
||||||
{
|
{
|
-#ifdef GRID_COMMS_MPI
-  MPI_Init(argc,argv);
-#endif
+  CartesianCommunicator::Init(argc,argv);
   // Parse command line args.

   GridLogger::StopWatch.Start();
@@ -194,9 +191,10 @@ void Grid_init(int *argc,char ***argv)
   std::cout<<GridLogMessage<<"--debug-stdout : print stdout from EVERY node"<<std::endl;
   std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
   std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
-  std::cout<<GridLogMessage<<"--omp n : default number of OMP threads"<<std::endl;
+  std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl;
   std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl;
-  std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl;
+  std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
+  exit(EXIT_SUCCESS);
   }

   if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
@@ -213,8 +211,7 @@ void Grid_init(int *argc,char ***argv)
   Grid_quiesce_nodes();
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
-  QCD::WilsonFermionStatic::HandOptDslash=1;
-  QCD::WilsonFermion5DStatic::HandOptDslash=1;
+  QCD::WilsonKernelsStatic::HandOpt=1;
   }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
   LebesgueOrder::UseLebesgueOrder=1;
@@ -287,13 +284,7 @@ void Grid_finalize(void)
   Grid_unquiesce_nodes();
 #endif
 }
-double usecond(void) {
-  struct timeval tv;
-  gettimeofday(&tv,NULL);
-  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
-}

-#define _NBACKTRACE (256)
 void * Grid_backtrace_buffer[_NBACKTRACE];

 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
@@ -305,11 +296,11 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
   // Linux/Posix
 #ifdef __linux__
   // And x86 64bit
+#ifdef __x86_64__
   ucontext_t * uc= (ucontext_t *)ptr;
   struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
   printf(" instruction %llx\n",(unsigned long long)sc->rip);
 #define REG(A) printf(" %s %lx\n",#A,sc-> A);

   REG(rdi);
   REG(rsi);
   REG(rbp);
@@ -330,17 +321,15 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
   REG(r14);
   REG(r15);
 #endif
-#ifdef HAVE_EXECINFO_H
-  int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);
-  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);
-  for (int i = 0; i < symbols; i++){
-    printf ("%s\n", strings[i]);
-  }
 #endif
+  BACKTRACE();
   exit(0);
   return;
 };
+#ifdef GRID_FPE
+#define _GNU_SOURCE
+#include <fenv.h>
+#endif
 void Grid_debug_handler_init(void)
 {
   struct sigaction sa,osa;
@@ -349,5 +338,9 @@ void Grid_debug_handler_init(void)
   sa.sa_flags = SA_SIGINFO;
   sigaction(SIGSEGV,&sa,NULL);
   sigaction(SIGTRAP,&sa,NULL);
+#ifdef GRID_FPE
+  feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
+  sigaction(SIGFPE,&sa,NULL);
+#endif
 }
 }
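The new GRID_FPE block arms hardware floating-point exceptions and routes the resulting SIGFPE through the same siginfo-style handler as SIGSEGV/SIGTRAP. A minimal standalone sketch of that pattern, assuming glibc (feenableexcept is a GNU extension); the handler name is illustrative, not Grid's:

#define _GNU_SOURCE
#include <fenv.h>      // feenableexcept (glibc extension)
#include <signal.h>
#include <cstdio>
#include <cstdlib>

static void fpe_handler(int sig, siginfo_t *si, void *ctx) {
  std::printf("Caught SIGFPE (si_code=%d) at %p\n", si->si_code, si->si_addr);
  std::exit(EXIT_FAILURE);
}

int main(void) {
  struct sigaction sa = {};
  sa.sa_sigaction = fpe_handler;      // three-argument handler
  sa.sa_flags     = SA_SIGINFO;
  sigaction(SIGFPE, &sa, NULL);

  feenableexcept(FE_INVALID | FE_OVERFLOW | FE_DIVBYZERO);

  volatile double zero = 0.0;
  double x = 1.0 / zero;              // raises FE_DIVBYZERO -> SIGFPE
  std::printf("%f\n", x);             // not reached
  return 0;
}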
32  lib/Lexicographic.h  (new file)
@@ -0,0 +1,32 @@
+#ifndef GRID_LEXICOGRAPHIC_H
+#define GRID_LEXICOGRAPHIC_H
+
+
+namespace Grid{
+
+  class Lexicographic {
+  public:
+
+    static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
+      int nd= dims.size();
+      coor.resize(nd);
+      for(int d=0;d<nd;d++){
+        coor[d] = index % dims[d];
+        index   = index / dims[d];
+      }
+    }
+
+    static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
+      int nd=dims.size();
+      int stride=1;
+      index=0;
+      for(int d=0;d<nd;d++){
+        index = index+stride*coor[d];
+        stride=stride*dims[d];
+      }
+    }
+
+  };
+
+}
+#endif
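The new helper converts between a flat lexicographic site index and a per-dimension coordinate, with the first dimension running fastest. A small round-trip sketch (standalone, hypothetical dims; not part of the commit):

#include <cassert>
#include <vector>
// assumes lib/Lexicographic.h above is on the include path

int main(void) {
  std::vector<int> dims = {4, 4, 4, 8};     // hypothetical local lattice
  int index = 135;

  std::vector<int> coor;
  Grid::Lexicographic::CoorFromIndex(coor, index, dims);  // coor[0] varies fastest

  int back = 0;
  Grid::Lexicographic::IndexFromCoor(coor, back, dims);
  assert(back == index);                    // the two maps are inverses
  return 0;
}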
@@ -73,13 +73,16 @@ void GridLogConfigure(std::vector<std::string> &logstreams)
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void)
 {
+  int me=0;
 #ifdef GRID_COMMS_MPI
-  int me;
   MPI_Comm_rank(MPI_COMM_WORLD,&me);
+#endif
+#ifdef GRID_COMMS_SHMEM
+  me = shmem_my_pe();
+#endif
   if ( me ) {
     std::cout.setstate(std::ios::badbit);
   }
-#endif
 }

 void Grid_unquiesce_nodes(void)
39  lib/Log.h
@@ -32,9 +32,15 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 #ifndef GRID_LOG_H
 #define GRID_LOG_H

+#ifdef HAVE_EXECINFO_H
+#include <execinfo.h>
+#endif

 namespace Grid {

 // Dress the output; use std::chrono for time stamping via the StopWatch class
+int Rank(void); // used for early stage debug before library init
+
 class Colours{
@@ -48,7 +54,6 @@ namespace Grid {
   Active(activate);
 };

-
 void Active(bool activate){
   is_active=activate;

@@ -140,5 +145,37 @@ void GridLogConfigure(std::vector<std::string> &logstreams);
 extern Colours GridLogColours;

+#define _NBACKTRACE (256)
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+
+#define BACKTRACEFILE() {\
+  char string[20]; \
+  std::sprintf(string,"backtrace.%d",Rank()); \
+  std::FILE * fp = std::fopen(string,"w"); \
+  BACKTRACEFP(fp)\
+  std::fclose(fp); \
+}
+
+#ifdef HAVE_EXECINFO_H
+#define BACKTRACEFP(fp) { \
+  int symbols = backtrace (Grid_backtrace_buffer,_NBACKTRACE);\
+  char **strings = backtrace_symbols(Grid_backtrace_buffer,symbols);\
+  for (int i = 0; i < symbols; i++){\
+    std::fprintf (fp,"BackTrace Strings: %d %s\n",i, strings[i]); std::fflush(fp); \
+  }\
+}
+#else
+#define BACKTRACEFP(fp) { \
+  std::fprintf (fp,"BT %d %lx\n",0, __builtin_return_address(0)); std::fflush(fp); \
+  std::fprintf (fp,"BT %d %lx\n",1, __builtin_return_address(1)); std::fflush(fp); \
+  std::fprintf (fp,"BT %d %lx\n",2, __builtin_return_address(2)); std::fflush(fp); \
+  std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp); \
+}
+#endif
+
+#define BACKTRACE() BACKTRACEFP(stdout)
+
 }
 #endif
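BACKTRACEFP wraps the glibc execinfo pattern: capture raw return addresses with backtrace(), then turn them into printable strings with backtrace_symbols(). A standalone sketch of that underlying API (not Grid code; linking with -rdynamic is usually needed for useful symbol names):

#include <execinfo.h>  // backtrace, backtrace_symbols
#include <cstdio>
#include <cstdlib>

void print_backtrace(void) {
  void  *buffer[256];
  int    n       = backtrace(buffer, 256);           // capture the call stack
  char **strings = backtrace_symbols(buffer, n);     // printable frame descriptions
  for (int i = 0; i < n; i++) std::printf("%d %s\n", i, strings[i]);
  std::free(strings);                                // caller owns the array
}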
File diff suppressed because one or more lines are too long
@@ -6,6 +6,10 @@ if BUILD_COMMS_MPI
 extra_sources+=communicator/Communicator_mpi.cc
 endif

+if BUILD_COMMS_SHMEM
+extra_sources+=communicator/Communicator_shmem.cc
+endif
+
 if BUILD_COMMS_NONE
 extra_sources+=communicator/Communicator_none.cc
 endif

BIN  lib/Old/Endeavour.tgz  (new file; binary file not shown)
@@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 namespace Grid {

 #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
+#define RawConfig(A,B) (A<<8|B)
 const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
 #ifdef __linux__
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." },
-  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." },
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS...."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS....."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS..."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS.."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS"},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS......."},
-  // { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS....."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS...."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS..."},
-  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS."},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......"},
-  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS...."}
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." , INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
+  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
+  // 4
+#ifdef AVX512
+  { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
+  { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
+  { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
+  // 11
+#else
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
+  // 11
+#endif
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
+  //15
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS},
+  { PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS}
+  //19
   // { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
+#endif
 };
 }
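Each counter row now carries a trailing normalisation counter, so a report can print a rate (for example misses per access) rather than a bare count. A minimal standalone sketch of the underlying Linux perf_event_open / ioctl / read pattern this table feeds, assuming Linux and omitting error handling; it is an illustration, not Grid's PerformanceCounter class:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <cstring>
#include <cstdio>

static long perf_event_open(perf_event_attr *attr, pid_t pid, int cpu, int group, unsigned long flags) {
  return syscall(__NR_perf_event_open, attr, pid, cpu, group, flags);
}

int main(void) {
  perf_event_attr pe; std::memset(&pe, 0, sizeof(pe));
  pe.size = sizeof(pe); pe.type = PERF_TYPE_HARDWARE; pe.disabled = 1;

  pe.config = PERF_COUNT_HW_CACHE_MISSES;
  int fd     = perf_event_open(&pe, 0, -1, -1, 0);   // this process, any cpu
  pe.config = PERF_COUNT_HW_CACHE_REFERENCES;        // the "normalisation" counter
  int normfd = perf_event_open(&pe, 0, -1, -1, 0);

  ioctl(fd, PERF_EVENT_IOC_RESET, 0);  ioctl(normfd, PERF_EVENT_IOC_RESET, 0);
  ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); ioctl(normfd, PERF_EVENT_IOC_ENABLE, 0);

  volatile double s = 0; for (int i = 0; i < 1000000; i++) s += i * 1e-6;  // timed work

  ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ioctl(normfd, PERF_EVENT_IOC_DISABLE, 0);
  long long misses = 0, refs = 0;
  read(fd, &misses, sizeof(misses));    read(normfd, &refs, sizeof(refs));
  std::printf("misses %lld refs %lld rate %f\n", misses, refs, refs ? (double)misses/refs : 0.0);
  close(fd); close(normfd);
  return 0;
}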
112
lib/PerfCount.h
112
lib/PerfCount.h
@ -34,7 +34,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
#include <sys/ioctl.h>
|
#include <sys/ioctl.h>
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
@ -43,8 +43,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#else
|
#else
|
||||||
#include <sys/syscall.h>
|
#include <sys/syscall.h>
|
||||||
#endif
|
#endif
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
|
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
|
||||||
@ -58,6 +58,49 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef TIMERS_OFF
|
||||||
|
|
||||||
|
|
||||||
|
inline uint64_t cyclecount(void){
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
|
||||||
|
#define __SSC_STOP __SSC_MARK(0x110)
|
||||||
|
#define __SSC_START __SSC_MARK(0x111)
|
||||||
|
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#define __SSC_MARK(mark)
|
||||||
|
#define __SSC_STOP
|
||||||
|
#define __SSC_START
|
||||||
|
|
||||||
|
/*
|
||||||
|
* cycle counters arch dependent
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef __bgq__
|
||||||
|
inline uint64_t cyclecount(void){
|
||||||
|
uint64_t tmp;
|
||||||
|
asm volatile ("mfspr %0,0x10C" : "=&r" (tmp) );
|
||||||
|
return tmp;
|
||||||
|
}
|
||||||
|
#elif defined __x86_64__
|
||||||
|
#include <x86intrin.h>
|
||||||
|
inline uint64_t cyclecount(void){
|
||||||
|
return __rdtsc();
|
||||||
|
// unsigned int dummy;
|
||||||
|
// return __rdtscp(&dummy);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
|
||||||
|
inline uint64_t cyclecount(void){
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
class PerformanceCounter {
|
class PerformanceCounter {
|
||||||
private:
|
private:
|
||||||
@ -67,6 +110,7 @@ private:
|
|||||||
uint32_t type;
|
uint32_t type;
|
||||||
uint64_t config;
|
uint64_t config;
|
||||||
const char *name;
|
const char *name;
|
||||||
|
int normalisation;
|
||||||
} PerformanceCounterConfig;
|
} PerformanceCounterConfig;
|
||||||
|
|
||||||
static const PerformanceCounterConfig PerformanceCounterConfigs [];
|
static const PerformanceCounterConfig PerformanceCounterConfigs [];
|
||||||
@ -74,26 +118,12 @@ private:
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
enum PerformanceCounterType {
|
enum PerformanceCounterType {
|
||||||
CPUCYCLES=0,
|
CACHE_REFERENCES=0,
|
||||||
INSTRUCTIONS,
|
CACHE_MISSES=1,
|
||||||
// STALL_CYCLES,
|
CPUCYCLES=2,
|
||||||
CACHE_REFERENCES,
|
INSTRUCTIONS=3,
|
||||||
CACHE_MISSES,
|
L1D_READ_ACCESS=4,
|
||||||
L1D_READ_MISS,
|
PERFORMANCE_COUNTER_NUM_TYPES=19
|
||||||
L1D_READ_ACCESS,
|
|
||||||
L1D_WRITE_MISS,
|
|
||||||
L1D_WRITE_ACCESS,
|
|
||||||
L1D_PREFETCH_MISS,
|
|
||||||
L1D_PREFETCH_ACCESS,
|
|
||||||
LL_READ_MISS,
|
|
||||||
// LL_READ_ACCESS,
|
|
||||||
LL_WRITE_MISS,
|
|
||||||
LL_WRITE_ACCESS,
|
|
||||||
LL_PREFETCH_MISS,
|
|
||||||
LL_PREFETCH_ACCESS,
|
|
||||||
L1I_READ_MISS,
|
|
||||||
L1I_READ_ACCESS,
|
|
||||||
PERFORMANCE_COUNTER_NUM_TYPES
|
|
||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@ -101,7 +131,9 @@ public:
|
|||||||
int PCT;
|
int PCT;
|
||||||
|
|
||||||
long long count;
|
long long count;
|
||||||
|
long long cycles;
|
||||||
int fd;
|
int fd;
|
||||||
|
int cyclefd;
|
||||||
unsigned long long elapsed;
|
unsigned long long elapsed;
|
||||||
uint64_t begin;
|
uint64_t begin;
|
||||||
|
|
||||||
@ -114,7 +146,9 @@ public:
|
|||||||
assert(_pct>=0);
|
assert(_pct>=0);
|
||||||
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
|
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
|
||||||
fd=-1;
|
fd=-1;
|
||||||
|
cyclefd=-1;
|
||||||
count=0;
|
count=0;
|
||||||
|
cycles=0;
|
||||||
PCT =_pct;
|
PCT =_pct;
|
||||||
Open();
|
Open();
|
||||||
#endif
|
#endif
|
||||||
@ -139,6 +173,15 @@ public:
|
|||||||
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
|
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
|
||||||
perror("Error is");
|
perror("Error is");
|
||||||
}
|
}
|
||||||
|
int norm = PerformanceCounterConfigs[PCT].normalisation;
|
||||||
|
pe.type = PerformanceCounterConfigs[norm].type;
|
||||||
|
pe.config= PerformanceCounterConfigs[norm].config;
|
||||||
|
name = PerformanceCounterConfigs[norm].name;
|
||||||
|
cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
|
||||||
|
if (cyclefd == -1) {
|
||||||
|
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
|
||||||
|
perror("Error is");
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -146,10 +189,12 @@ public:
|
|||||||
{
|
{
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
if ( fd!= -1) {
|
if ( fd!= -1) {
|
||||||
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
|
::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
|
||||||
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
|
::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
|
||||||
|
::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
|
||||||
|
::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
|
||||||
}
|
}
|
||||||
begin =__rdtsc();
|
begin =cyclecount();
|
||||||
#else
|
#else
|
||||||
begin = 0;
|
begin = 0;
|
||||||
#endif
|
#endif
|
||||||
@ -157,12 +202,15 @@ public:
|
|||||||
|
|
||||||
void Stop(void) {
|
void Stop(void) {
|
||||||
count=0;
|
count=0;
|
||||||
|
cycles=0;
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
if ( fd!= -1) {
|
if ( fd!= -1) {
|
||||||
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
|
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
|
||||||
|
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
|
||||||
::read(fd, &count, sizeof(long long));
|
::read(fd, &count, sizeof(long long));
|
||||||
|
::read(cyclefd, &cycles, sizeof(long long));
|
||||||
}
|
}
|
||||||
elapsed = __rdtsc() - begin;
|
elapsed = cyclecount() - begin;
|
||||||
#else
|
#else
|
||||||
elapsed = 0;
|
elapsed = 0;
|
||||||
#endif
|
#endif
|
||||||
@ -170,16 +218,20 @@ public:
|
|||||||
}
|
}
|
||||||
void Report(void) {
|
void Report(void) {
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count);
|
int N = PerformanceCounterConfigs[PCT].normalisation;
|
||||||
|
const char * sn = PerformanceCounterConfigs[N].name ;
|
||||||
|
const char * sc = PerformanceCounterConfigs[PCT].name;
|
||||||
|
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
|
||||||
|
sc, count, sc,sn, (double)count/(double)cycles);
|
||||||
#else
|
#else
|
||||||
printf("%llu cycles \n", elapsed );
|
std::printf("%llu cycles \n", elapsed );
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
~PerformanceCounter()
|
~PerformanceCounter()
|
||||||
{
|
{
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
close(fd);
|
::close(fd); ::close(cyclefd);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -42,10 +42,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>

 #define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))
+#define _MM_SELECT_FOUR_FOUR_STRING(A,B,C,D) "((" #A "<<6)|(" #B "<<4)|(" #C "<<2)|(" #D "))"
 #define _MM_SELECT_EIGHT_TWO(A,B,C,D,E,F,G,H) ((A<<7)|(B<<6)|(C<<5)|(D<<4)|(E<<3)|(F<<2)|(G<<4)|(H))
 #define _MM_SELECT_FOUR_TWO (A,B,C,D) _MM_SELECT_EIGHT_TWO(0,0,0,0,A,B,C,D)
 #define _MM_SELECT_TWO_TWO (A,B) _MM_SELECT_FOUR_TWO(0,0,A,B)

+#define RotateBit (0x100)
+
 namespace Grid {

 typedef uint32_t Integer;
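_MM_SELECT_FOUR_FOUR packs four 2-bit lane selectors into a single immediate byte, the same encoding the Intel intrinsics headers use for shuffle controls. A compile-time check of the arithmetic, assuming nothing beyond the macro definition shown above:

#define _MM_SELECT_FOUR_FOUR(A,B,C,D) ((A<<6)|(B<<4)|(C<<2)|(D))

// Lane order 3,2,1,0 is the identity permutation and encodes as 0xE4,
// which matches _MM_SHUFFLE(3,2,1,0); 0,1,2,3 is the full reversal.
static_assert(_MM_SELECT_FOUR_FOUR(3,2,1,0) == 0xE4, "identity shuffle immediate");
static_assert(_MM_SELECT_FOUR_FOUR(0,1,2,3) == 0x1B, "reversal shuffle immediate");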
218  lib/Stencil.h
@@ -71,13 +71,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 namespace Grid {

 struct StencilEntry {
-  int _offset;
-  int _is_local;
-  int _permute;
-  int _around_the_world;
+  uint32_t _offset;
+  uint32_t _byte_offset;
+  uint16_t _is_local;
+  uint16_t _permute;
+  uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
 };

-template<class vobj,class cobj, class compressor>
+template<class vobj,class cobj>
 class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
 public:

@@ -101,7 +102,16 @@ namespace Grid {

 std::vector<Packet> Packets;

+#define SEND_IMMEDIATE
+#define SERIAL_SENDS
+
 void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
+  comms_bytes+=2.0*bytes;
+#ifdef SEND_IMMEDIATE
+  commtime-=usecond();
+  _grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
+  commtime+=usecond();
+#endif
   Packet p;
   p.send_buf = xmit;
   p.recv_buf = rcv;
@@ -111,20 +121,63 @@ namespace Grid {
   p.done = 0;
   comms_bytes+=2.0*bytes;
   Packets.push_back(p);
+
 }

+#ifdef SERIAL_SENDS
 void Communicate(void ) {
   commtime-=usecond();
   for(int i=0;i<Packets.size();i++){
-    _grid->SendToRecvFrom(Packets[i].send_buf,
+#ifndef SEND_IMMEDIATE
+    _grid->SendToRecvFrom(
+                          Packets[i].send_buf,
                           Packets[i].to_rank,
                           Packets[i].recv_buf,
                           Packets[i].from_rank,
                           Packets[i].bytes);
+#endif
     Packets[i].done = 1;
   }
   commtime+=usecond();
 }
+#else
+void Communicate(void ) {
+  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
+  std::vector<std::vector<CommsRequest_t> > reqs(Packets.size());
+  commtime-=usecond();
+  const int concurrency=2;
+  for(int i=0;i<Packets.size();i+=concurrency){
+    for(int ii=0;ii<concurrency;ii++){
+      int j = i+ii;
+      if ( j<Packets.size() ) {
+#ifndef SEND_IMMEDIATE
+        _grid->SendToRecvFromBegin(reqs[j],
+                                   Packets[j].send_buf,
+                                   Packets[j].to_rank,
+                                   Packets[j].recv_buf,
+                                   Packets[j].from_rank,
+                                   Packets[j].bytes);
+#endif
+      }
+    }
+    for(int ii=0;ii<concurrency;ii++){
+      int j = i+ii;
+      if ( j<Packets.size() ) {
+#ifndef SEND_IMMEDIATE
+        _grid->SendToRecvFromComplete(reqs[i]);
+#endif
+      }
+    }
+    for(int ii=0;ii<concurrency;ii++){
+      int j = i+ii;
+      if ( j<Packets.size() ) {
+        Packets[j].done = 1;
+      }
+    }
+  }
+  commtime+=usecond();
+}
+#endif

 ///////////////////////////////////////////
 // Simd merge queue for asynch comms
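With SEND_IMMEDIATE defined, each packet is exchanged synchronously inside AddPacket and Communicate() only marks packets done; without it, SERIAL_SENDS selects blocking SendToRecvFrom per packet, while the #else branch issues SendToRecvFromBegin/Complete in windows of two (concurrency=2) so a pair of transfers is in flight at once. A compressed sketch of that begin/complete windowing with a single-rank loopback transport standing in for the Grid communicator (the Packet/Request types and functions here are hypothetical, not Grid's API):

#include <cstring>
#include <vector>

struct Packet  { void *send; void *recv; size_t bytes; bool done = false; };
struct Request { Packet *p; };

Request begin_exchange(Packet &p) { return Request{ &p }; }   // would post non-blocking send+recv
void    complete_exchange(Request &r) {                        // would wait on the requests
  std::memcpy(r.p->recv, r.p->send, r.p->bytes);               // loopback so the sketch runs
}

void communicate(std::vector<Packet> &packets, size_t concurrency = 2) {
  for (size_t i = 0; i < packets.size(); i += concurrency) {
    std::vector<Request> reqs;
    for (size_t j = i; j < i + concurrency && j < packets.size(); j++)
      reqs.push_back(begin_exchange(packets[j]));   // start a small window of transfers
    for (size_t k = 0; k < reqs.size(); k++) {
      complete_exchange(reqs[k]);                   // drain the window before the next one
      reqs[k].p->done = true;
    }
  }
}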
@ -144,25 +197,36 @@ namespace Grid {
|
|||||||
m.rpointers= rpointers;
|
m.rpointers= rpointers;
|
||||||
m.buffer_size = buffer_size;
|
m.buffer_size = buffer_size;
|
||||||
m.packet_id = packet_id;
|
m.packet_id = packet_id;
|
||||||
|
#ifdef SEND_IMMEDIATE
|
||||||
|
mergetime-=usecond();
|
||||||
|
PARALLEL_FOR_LOOP
|
||||||
|
for(int o=0;o<m.buffer_size;o++){
|
||||||
|
merge1(m.mpointer[o],m.rpointers,o);
|
||||||
|
}
|
||||||
|
mergetime+=usecond();
|
||||||
|
#else
|
||||||
Mergers.push_back(m);
|
Mergers.push_back(m);
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommsMerge(void ) {
|
void CommsMerge(void ) {
|
||||||
//PARALLEL_NESTED_LOOP2
|
//PARALLEL_NESTED_LOOP2
|
||||||
for(int i=0;i<Mergers.size();i++){
|
for(int i=0;i<Mergers.size();i++){
|
||||||
|
|
||||||
|
|
||||||
spintime-=usecond();
|
spintime-=usecond();
|
||||||
int packet_id = Mergers[i].packet_id;
|
int packet_id = Mergers[i].packet_id;
|
||||||
while(! Packets[packet_id].done ); // spin for completion
|
while(! Packets[packet_id].done ); // spin for completion
|
||||||
spintime+=usecond();
|
spintime+=usecond();
|
||||||
|
|
||||||
|
#ifndef SEND_IMMEDIATE
|
||||||
mergetime-=usecond();
|
mergetime-=usecond();
|
||||||
PARALLEL_FOR_LOOP
|
PARALLEL_FOR_LOOP
|
||||||
for(int o=0;o<Mergers[i].buffer_size;o++){
|
for(int o=0;o<Mergers[i].buffer_size;o++){
|
||||||
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||||
}
|
}
|
||||||
mergetime+=usecond();
|
mergetime+=usecond();
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -182,8 +246,29 @@ PARALLEL_FOR_LOOP
|
|||||||
std::vector<int> _permute_type;
|
std::vector<int> _permute_type;
|
||||||
|
|
||||||
// npoints x Osites() of these
|
// npoints x Osites() of these
|
||||||
std::vector<std::vector<StencilEntry> > _entries;
|
// Flat vector, change layout for cache friendly.
|
||||||
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; }
|
Vector<StencilEntry> _entries;
|
||||||
|
|
||||||
|
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
|
||||||
|
|
||||||
|
void PrecomputeByteOffsets(void){
|
||||||
|
for(int i=0;i<_entries.size();i++){
|
||||||
|
if( _entries[i]._is_local ) {
|
||||||
|
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
|
||||||
|
} else {
|
||||||
|
_entries[i]._byte_offset =(uint64_t)&comm_buf[0]+ _entries[i]._offset*sizeof(cobj);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
|
||||||
|
_mm_prefetch((char *)&_entries[ent+1],_MM_HINT_T0);
|
||||||
|
local = _entries[ent]._is_local;
|
||||||
|
perm = _entries[ent]._permute;
|
||||||
|
if (perm) ptype = _permute_type[point];
|
||||||
|
if (local) return base + _entries[ent]._byte_offset;
|
||||||
|
else return _entries[ent]._byte_offset;
|
||||||
|
}
|
||||||
|
|
||||||
// Comms buffers
|
// Comms buffers
|
||||||
std::vector<Vector<scalar_object> > u_simd_send_buf;
|
std::vector<Vector<scalar_object> > u_simd_send_buf;
|
||||||
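The stencil table is now one flat Vector indexed by point + _npoints*osite, so all entries for a given site sit next to each other, and _byte_offset caches the address arithmetic once (relative to a base pointer for local neighbours, absolute into comm_buf for halo neighbours). A minimal sketch of that flattening, independent of Grid's types and written with hypothetical names:

#include <cstdint>
#include <vector>

// Entries for one site are contiguous: the cache-friendly order when a kernel
// visits every stencil point of a site before moving on.
struct Entry { uint32_t offset; uintptr_t byte_offset; uint16_t is_local; uint16_t permute; };

struct FlatStencil {
  int npoints;
  std::vector<Entry> entries;                    // size = npoints * osites

  FlatStencil(int np, int osites) : npoints(np), entries(np * osites) {}

  Entry &at(int point, int osite) { return entries[point + npoints * osite]; }

  // Precompute byte offsets so the inner loop does no index arithmetic.
  void precompute(size_t site_bytes, uintptr_t halo_base) {
    for (Entry &e : entries)
      e.byte_offset = e.is_local ? uintptr_t(e.offset) * site_bytes
                                 : halo_base + uintptr_t(e.offset) * site_bytes;
  }
};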
@ -215,7 +300,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int checkerboard,
|
int checkerboard,
|
||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints)
|
: _permute_type(npoints), _comm_buf_size(npoints)
|
||||||
{
|
{
|
||||||
#ifdef TIMING_HACK
|
#ifdef TIMING_HACK
|
||||||
gathertime=0;
|
gathertime=0;
|
||||||
@ -237,12 +322,12 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
int osites = _grid->oSites();
|
int osites = _grid->oSites();
|
||||||
|
|
||||||
|
_entries.resize(_npoints* osites);
|
||||||
for(int ii=0;ii<npoints;ii++){
|
for(int ii=0;ii<npoints;ii++){
|
||||||
|
|
||||||
int i = ii; // reverse direction to get SIMD comms done first
|
int i = ii; // reverse direction to get SIMD comms done first
|
||||||
int point = i;
|
int point = i;
|
||||||
|
|
||||||
_entries[i].resize( osites);
|
|
||||||
|
|
||||||
int dimension = directions[i];
|
int dimension = directions[i];
|
||||||
int displacement = distances[i];
|
int displacement = distances[i];
|
||||||
@ -258,6 +343,9 @@ PARALLEL_FOR_LOOP
|
|||||||
int simd_layout = _grid->_simd_layout[dimension];
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
int splice_dim = _grid->_simd_layout[dimension]>1 && (comm_dim);
|
||||||
|
int rotate_dim = _grid->_simd_layout[dimension]>2;
|
||||||
|
|
||||||
|
assert ( (rotate_dim && comm_dim) == false) ; // Do not think spread out is supported
|
||||||
|
|
||||||
int sshift[2];
|
int sshift[2];
|
||||||
|
|
||||||
@ -290,6 +378,8 @@ PARALLEL_FOR_LOOP
|
|||||||
u_send_buf.resize(_unified_buffer_size);
|
u_send_buf.resize(_unified_buffer_size);
|
||||||
comm_buf.resize(_unified_buffer_size);
|
comm_buf.resize(_unified_buffer_size);
|
||||||
|
|
||||||
|
PrecomputeByteOffsets();
|
||||||
|
|
||||||
const int Nsimd = grid->Nsimd();
|
const int Nsimd = grid->Nsimd();
|
||||||
u_simd_send_buf.resize(Nsimd);
|
u_simd_send_buf.resize(Nsimd);
|
||||||
u_simd_recv_buf.resize(Nsimd);
|
u_simd_recv_buf.resize(Nsimd);
|
||||||
@ -305,6 +395,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
int ld = _grid->_ldimensions[dimension];
|
int ld = _grid->_ldimensions[dimension];
|
||||||
int gd = _grid->_gdimensions[dimension];
|
int gd = _grid->_gdimensions[dimension];
|
||||||
|
int ly = _grid->_simd_layout[dimension];
|
||||||
|
|
||||||
// Map to always positive shift modulo global full dimension.
|
// Map to always positive shift modulo global full dimension.
|
||||||
int shift = (shiftpm+fd)%fd;
|
int shift = (shiftpm+fd)%fd;
|
||||||
@ -335,7 +426,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int wrap = sshift/rd;
|
int wrap = sshift/rd;
|
||||||
int num = sshift%rd;
|
int num = sshift%rd;
|
||||||
if ( x< rd-num ) permute_slice=wrap;
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
else permute_slice = 1-wrap;
|
else permute_slice = (wrap+1)%ly;
|
||||||
}
|
}
|
||||||
|
|
||||||
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
CopyPlane(point,dimension,x,sx,cbmask,permute_slice,wraparound);
|
||||||
@ -355,7 +446,6 @@ PARALLEL_FOR_LOOP
|
|||||||
int simd_layout = _grid->_simd_layout[dimension];
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
// assert(simd_layout==1); // Why?
|
|
||||||
assert(comm_dim==1);
|
assert(comm_dim==1);
|
||||||
int shift = (shiftpm + fd) %fd;
|
int shift = (shiftpm + fd) %fd;
|
||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
@ -440,10 +530,11 @@ PARALLEL_FOR_LOOP
|
|||||||
// Simple block stride gather of SIMD objects
|
// Simple block stride gather of SIMD objects
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
int idx=point+(lo+o+b)*_npoints;
|
||||||
_entries[point][lo+o+b]._is_local=1;
|
_entries[idx]._offset =ro+o+b;
|
||||||
_entries[point][lo+o+b]._permute=permute;
|
_entries[idx]._permute=permute;
|
||||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
_entries[idx]._is_local=1;
|
||||||
|
_entries[idx]._around_the_world=wrap;
|
||||||
}
|
}
|
||||||
o +=_grid->_slice_stride[dimension];
|
o +=_grid->_slice_stride[dimension];
|
||||||
}
|
}
|
||||||
@ -460,10 +551,11 @@ PARALLEL_FOR_LOOP
|
|||||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
|
||||||
|
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
_entries[point][lo+o+b]._offset =ro+o+b;
|
int idx = point+(lo+o+b)*_npoints;
|
||||||
_entries[point][lo+o+b]._is_local=1;
|
_entries[idx]._offset =ro+o+b;
|
||||||
_entries[point][lo+o+b]._permute=permute;
|
_entries[idx]._is_local=1;
|
||||||
_entries[point][lo+o+b]._around_the_world=wrap;
|
_entries[idx]._permute=permute;
|
||||||
|
_entries[idx]._around_the_world=wrap;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -486,10 +578,11 @@ PARALLEL_FOR_LOOP
|
|||||||
// Simple block stride gather of SIMD objects
|
// Simple block stride gather of SIMD objects
|
||||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
int idx=point+(so+o+b)*_npoints;
|
||||||
_entries[point][so+o+b]._is_local=0;
|
_entries[idx]._offset =offset+(bo++);
|
||||||
_entries[point][so+o+b]._permute=0;
|
_entries[idx]._is_local=0;
|
||||||
_entries[point][so+o+b]._around_the_world=wrap;
|
_entries[idx]._permute=0;
|
||||||
|
_entries[idx]._around_the_world=wrap;
|
||||||
}
|
}
|
||||||
o +=_grid->_slice_stride[dimension];
|
o +=_grid->_slice_stride[dimension];
|
||||||
}
|
}
|
||||||
@ -505,10 +598,11 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
_entries[point][so+o+b]._offset =offset+(bo++);
|
int idx = point+(so+o+b)*_npoints;
|
||||||
_entries[point][so+o+b]._is_local=0;
|
_entries[idx]._offset =offset+(bo++);
|
||||||
_entries[point][so+o+b]._permute =0;
|
_entries[idx]._is_local=0;
|
||||||
_entries[point][so+o+b]._around_the_world=wrap;
|
_entries[idx]._permute =0;
|
||||||
|
_entries[idx]._around_the_world=wrap;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
o +=_grid->_slice_stride[dimension];
|
o +=_grid->_slice_stride[dimension];
|
||||||
@ -517,19 +611,26 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template<class compressor>
|
||||||
|
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
||||||
|
{
|
||||||
|
Mergers.resize(0);
|
||||||
|
Packets.resize(0);
|
||||||
|
HaloGather(source,compress);
|
||||||
|
this->Communicate();
|
||||||
|
CommsMerge(); // spins
|
||||||
|
}
|
||||||
|
#if 0
|
||||||
|
// Overlapping comms and compute typically slows down compute and is useless
|
||||||
|
// unless memory bandwidth greatly exceeds network
|
||||||
|
template<class compressor>
|
||||||
std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
|
std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
|
||||||
Mergers.resize(0);
|
Mergers.resize(0);
|
||||||
Packets.resize(0);
|
Packets.resize(0);
|
||||||
HaloGather(source,compress);
|
HaloGather(source,compress);
|
||||||
return std::thread([&] { this->Communicate(); });
|
return std::thread([&] { this->Communicate(); });
|
||||||
}
|
}
|
||||||
|
|
||||||
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
|
|
||||||
{
|
|
||||||
auto thr = HaloExchangeBegin(source,compress);
|
|
||||||
HaloExchangeComplete(thr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void HaloExchangeComplete(std::thread &thr)
|
void HaloExchangeComplete(std::thread &thr)
|
||||||
{
|
{
|
||||||
CommsMerge(); // spins
|
CommsMerge(); // spins
|
||||||
@ -537,21 +638,10 @@ PARALLEL_FOR_LOOP
|
|||||||
thr.join();
|
thr.join();
|
||||||
jointime+=usecond();
|
jointime+=usecond();
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
template<class compressor>
|
||||||
|
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point)
|
||||||
{
|
{
|
||||||
// conformable(source._grid,_grid);
|
|
||||||
assert(source._grid==_grid);
|
|
||||||
halogtime-=usecond();
|
|
||||||
|
|
||||||
assert (comm_buf.size() == _unified_buffer_size );
|
|
||||||
u_comm_offset=0;
|
|
||||||
|
|
||||||
// Gather all comms buffers
|
|
||||||
for(int point = 0 ; point < _npoints; point++) {
|
|
||||||
|
|
||||||
compress.Point(point);
|
|
||||||
|
|
||||||
int dimension = _directions[point];
|
int dimension = _directions[point];
|
||||||
int displacement = _distances[point];
|
int displacement = _distances[point];
|
||||||
|
|
||||||
@ -601,10 +691,27 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class compressor>
|
||||||
|
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
|
{
|
||||||
|
// conformable(source._grid,_grid);
|
||||||
|
assert(source._grid==_grid);
|
||||||
|
halogtime-=usecond();
|
||||||
|
|
||||||
|
assert (comm_buf.size() == _unified_buffer_size );
|
||||||
|
u_comm_offset=0;
|
||||||
|
|
||||||
|
// Gather all comms buffers
|
||||||
|
for(int point = 0 ; point < _npoints; point++) {
|
||||||
|
compress.Point(point);
|
||||||
|
HaloGatherDir(source,compress,point);
|
||||||
|
}
|
||||||
|
|
||||||
assert(u_comm_offset==_unified_buffer_size);
|
assert(u_comm_offset==_unified_buffer_size);
|
||||||
halogtime+=usecond();
|
halogtime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class compressor>
|
||||||
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
|
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress)
|
||||||
{
|
{
|
||||||
typedef typename cobj::vector_type vector_type;
|
typedef typename cobj::vector_type vector_type;
|
||||||
@ -653,13 +760,6 @@ PARALLEL_FOR_LOOP
|
|||||||
assert (recv_from_rank != _grid->ThisRank());
|
assert (recv_from_rank != _grid->ThisRank());
|
||||||
|
|
||||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||||
/*
|
|
||||||
_grid->SendToRecvFrom((void *)&send_buf[0],
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)&comm_buf[u_comm_offset],
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
*/
|
|
||||||
AddPacket((void *)&u_send_buf[u_comm_offset],
|
AddPacket((void *)&u_send_buf[u_comm_offset],
|
||||||
(void *) &comm_buf[u_comm_offset],
|
(void *) &comm_buf[u_comm_offset],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
@ -672,6 +772,7 @@ PARALLEL_FOR_LOOP
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template<class compressor>
|
||||||
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
|
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress)
|
||||||
{
|
{
|
||||||
const int Nsimd = _grid->Nsimd();
|
const int Nsimd = _grid->Nsimd();
|
||||||
@ -684,6 +785,7 @@ PARALLEL_FOR_LOOP
|
|||||||
int comm_dim = _grid->_processors[dimension] >1 ;
|
int comm_dim = _grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
assert(comm_dim==1);
|
assert(comm_dim==1);
|
||||||
|
// This will not work with a rotate dim
|
||||||
assert(simd_layout==2);
|
assert(simd_layout==2);
|
||||||
assert(shift>=0);
|
assert(shift>=0);
|
||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
@ -729,6 +831,8 @@ PARALLEL_FOR_LOOP
|
|||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
|
// FIXME
|
||||||
|
// This logic is hard coded to simd_layout ==2 and not allowing >2
|
||||||
// std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
|
// std::cout << "GatherSimd : lane 1st elem " << i << u_simd_send_buf[i ][u_comm_offset]<<std::endl;
|
||||||
|
|
||||||
int inner_bit = (Nsimd>>(permute_type+1));
|
int inner_bit = (Nsimd>>(permute_type+1));
|
||||||
|
27  lib/Timer.h
@@ -39,11 +39,18 @@ namespace Grid {
 // Dress the output; use std::chrono

 // C++11 time facilities better?
-double usecond(void);
+inline double usecond(void) {
+  struct timeval tv;
+#ifdef TIMERS_ON
+  gettimeofday(&tv,NULL);
+#endif
+  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
+}

 typedef std::chrono::system_clock GridClock;
 typedef std::chrono::time_point<GridClock> GridTimePoint;
 typedef std::chrono::milliseconds GridTime;
+typedef std::chrono::microseconds GridUsecs;

 inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
 {
@@ -55,29 +62,39 @@ class GridStopWatch {
 private:
   bool running;
   GridTimePoint start;
-  GridTime accumulator;
+  GridUsecs accumulator;
 public:
   GridStopWatch () {
     Reset();
   }
   void Start(void) {
     assert(running == false);
+#ifdef TIMERS_ON
     start = GridClock::now();
+#endif
     running = true;
   }
   void Stop(void) {
     assert(running == true);
-    accumulator+= std::chrono::duration_cast<GridTime>(GridClock::now()-start);
+#ifdef TIMERS_ON
+    accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
+#endif
     running = false;
   };
   void Reset(void){
     running = false;
+#ifdef TIMERS_ON
     start = GridClock::now();
-    accumulator = std::chrono::duration_cast<GridTime>(start-start);
+#endif
+    accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
   }
   GridTime Elapsed(void) {
     assert(running == false);
-    return accumulator;
+    return std::chrono::duration_cast<GridTime>( accumulator );
+  }
+  uint64_t useconds(void){
+    assert(running == false);
+    return (uint64_t) accumulator.count();
   }
 };
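With the accumulator now held in microseconds, Elapsed() still reports milliseconds while the new useconds() exposes the raw microsecond count. A usage fragment for the stopwatch as declared above; do_work() is a hypothetical placeholder and TIMERS_ON is assumed to be defined, otherwise the elapsed time stays zero:

GridStopWatch timer;
timer.Start();
do_work();                                    // hypothetical timed section
timer.Stop();
std::cout << GridLogMessage << "work took " << timer.Elapsed()     // GridTime (ms)
          << " = " << timer.useconds() << " us" << std::endl;
timer.Reset();                                // ready for the next measurement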
@ -147,6 +147,56 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
Orthogonalise();
|
Orthogonalise();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
|
||||||
|
{
|
||||||
|
// Run a Lanczos with sloppy convergence
|
||||||
|
const int Nstop = nn;
|
||||||
|
const int Nk = nn+20;
|
||||||
|
const int Np = nn+20;
|
||||||
|
const int Nm = Nk+Np;
|
||||||
|
const int MaxIt= 10000;
|
||||||
|
RealD resid = 1.0e-3;
|
||||||
|
|
||||||
|
Chebyshev<FineField> Cheb(0.5,64.0,21);
|
||||||
|
ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
|
||||||
|
// IRL.lock = 1;
|
||||||
|
|
||||||
|
FineField noise(FineGrid); gaussian(RNG,noise);
|
||||||
|
FineField tmp(FineGrid);
|
||||||
|
std::vector<RealD> eval(Nm);
|
||||||
|
std::vector<FineField> evec(Nm,FineGrid);
|
||||||
|
|
||||||
|
int Nconv;
|
||||||
|
IRL.calc(eval,evec,
|
||||||
|
noise,
|
||||||
|
Nconv);
|
||||||
|
|
||||||
|
// pull back nn vectors
|
||||||
|
for(int b=0;b<nn;b++){
|
||||||
|
|
||||||
|
subspace[b] = evec[b];
|
||||||
|
|
||||||
|
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
|
||||||
|
|
||||||
|
hermop.Op(subspace[b],tmp);
|
||||||
|
std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
|
||||||
|
|
||||||
|
noise = tmp - sqrt(eval[b])*subspace[b] ;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
|
||||||
|
|
||||||
|
noise = tmp + eval[b]*subspace[b] ;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
Orthogonalise();
|
||||||
|
for(int b=0;b<nn;b++){
|
||||||
|
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
|
||||||
|
|
||||||
RealD scale;
|
RealD scale;
|
||||||
@ -200,7 +250,7 @@ namespace Grid {
|
|||||||
////////////////////
|
////////////////////
|
||||||
Geometry geom;
|
Geometry geom;
|
||||||
GridBase * _grid;
|
GridBase * _grid;
|
||||||
CartesianStencil<siteVector,siteVector,SimpleCompressor<siteVector> > Stencil;
|
CartesianStencil<siteVector,siteVector> Stencil;
|
||||||
|
|
||||||
std::vector<CoarseMatrix> A;
|
std::vector<CoarseMatrix> A;
|
||||||
|
|
||||||
|
@@ -222,6 +222,7 @@ namespace Grid {
   SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
   virtual RealD Mpc (const Field &in, Field &out) {
     Field tmp(in._grid);
+    // std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;

     _Mat.Meooe(in,tmp);
     _Mat.MooeeInv(tmp,out);
@@ -251,10 +252,10 @@ namespace Grid {
   virtual RealD Mpc (const Field &in, Field &out) {
     Field tmp(in._grid);

-    _Mat.Meooe(in,tmp);
-    _Mat.MooeeInv(tmp,out);
-    _Mat.Meooe(out,tmp);
-    _Mat.MooeeInv(tmp,out);
+    _Mat.Meooe(in,out);
+    _Mat.MooeeInv(out,tmp);
+    _Mat.Meooe(tmp,out);
+    _Mat.MooeeInv(out,tmp);

     return axpy_norm(out,-1.0,tmp,in);
   }
@@ -270,6 +271,35 @@ namespace Grid {
   }
 };

+template<class Matrix,class Field>
+class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
+ protected:
+  Matrix &_Mat;
+ public:
+  SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
+
+  virtual RealD Mpc (const Field &in, Field &out) {
+    Field tmp(in._grid);
+
+    _Mat.MooeeInv(in,out);
+    _Mat.Meooe(out,tmp);
+    _Mat.MooeeInv(tmp,out);
+    _Mat.Meooe(out,tmp);
+
+    return axpy_norm(out,-1.0,tmp,in);
+  }
+  virtual RealD MpcDag (const Field &in, Field &out){
+    Field tmp(in._grid);
+
+    _Mat.MeooeDag(in,out);
+    _Mat.MooeeInvDag(out,tmp);
+    _Mat.MeooeDag(tmp,out);
+    _Mat.MooeeInvDag(out,tmp);
+
+    return axpy_norm(out,-1.0,tmp,in);
+  }
+};
+
 /////////////////////////////////////////////////////////////
 // Base classes for functions of operators
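Reading the new SchurDiagTwoOperator::Mpc call sequence line by line, the operator it applies appears to be the following, where the checkerboard labels are my reading of the Meooe/MooeeInv blocks rather than something spelled out in the diff:

% Mpc: apply the diagonal inverse, hop, diagonal inverse, hop, then subtract from the input.
\hat M_{\rm pc}\,\psi_e \;=\; \psi_e \;-\; M_{eo}\,M_{oo}^{-1}\,M_{oe}\,M_{ee}^{-1}\,\psi_e ,
\qquad
\hat M_{\rm pc}^{\dagger} \;=\; \bigl(\hat M_{\rm pc}\bigr)^{\dagger}
\quad\text{(realised by MpcDag with the daggered blocks applied in reverse order).}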
@@ -58,12 +58,13 @@ namespace Grid {
   Field Mtmp(in._grid);
   AtoN = in;
   out = AtoN*Coeffs[0];
-  // std::cout <<"Poly in " <<norm2(in)<<std::endl;
-  // std::cout <<"0 " <<norm2(out)<<std::endl;
+  // std::cout <<"Poly in " <<norm2(in)<<" size "<< Coeffs.size()<<std::endl;
+  // std::cout <<"Coeffs[0]= "<<Coeffs[0]<< " 0 " <<norm2(out)<<std::endl;
   for(int n=1;n<Coeffs.size();n++){
     Mtmp = AtoN;
     Linop.HermOp(Mtmp,AtoN);
     out=out+AtoN*Coeffs[n];
+    // std::cout <<"Coeffs "<<n<<"= "<< Coeffs[n]<< " 0 " <<std::endl;
     // std::cout << n<<" " <<norm2(out)<<std::endl;
   }
 };
@@ -82,7 +83,8 @@ namespace Grid {

 public:
   void csv(std::ostream &out){
-    for (RealD x=lo; x<hi; x+=(hi-lo)/1000) {
+    RealD diff = hi-lo;
+    for (RealD x=lo-0.2*diff; x<hi+0.2*diff; x+=(hi-lo)/1000) {
       RealD f = approx(x);
       out<< x<<" "<<f<<std::endl;
     }
@@ -99,10 +101,24 @@ namespace Grid {

   Chebyshev(){};
   Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
+  Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
+
   ////////////////////////////////////////////////////////////////////////////////////////////////////
   // c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
   ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // CJ: the one we need for Lanczos
+  void Init(RealD _lo,RealD _hi,int _order)
+  {
+    lo=_lo;
+    hi=_hi;
+    order=_order;
+
+    if(order < 2) exit(-1);
+    Coeffs.resize(order);
+    Coeffs.assign(0.,order);
+    Coeffs[order-1] = 1.;
+  };
+
   void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
   {
     lo=_lo;
@@ -182,6 +198,8 @@ namespace Grid {
   void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {

     GridBase *grid=in._grid;
+    //std::cout << "Chevyshef(): in._grid="<<in._grid<<std::endl;
+    //<<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;

     int vol=grid->gSites();

|
|||||||
#define INCLUDED_ALG_REMEZ_H
|
#define INCLUDED_ALG_REMEZ_H
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
#include <Config.h>
|
||||||
|
|
||||||
//#include <algorithms/approx/bigfloat.h>
|
#ifdef HAVE_GMP_H
|
||||||
|
#include <algorithms/approx/bigfloat.h>
|
||||||
|
#else
|
||||||
#include <algorithms/approx/bigfloat_double.h>
|
#include <algorithms/approx/bigfloat_double.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
|
#define JMAX 10000 //Maximum number of iterations of Newton's approximation
|
||||||
#define SUM_MAX 10 // Maximum number of terms in exponential
|
#define SUM_MAX 10 // Maximum number of terms in exponential
|
||||||
|
@@ -84,7 +84,7 @@ public:
     return;
   }

-  std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" rsq"<<rsq<<std::endl;
+  std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl;

   GridStopWatch LinalgTimer;
   GridStopWatch MatrixTimer;
@@ -101,8 +101,8 @@ public:
   MatrixTimer.Stop();

   LinalgTimer.Start();
-  RealD qqck = norm2(mmp);
-  ComplexD dck = innerProduct(p,mmp);
+  // RealD qqck = norm2(mmp);
+  // ComplexD dck = innerProduct(p,mmp);

   a = c/d;
   b_pred = a*(a*qq-d)/c;
@@ -133,8 +133,8 @@ public:
   std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k
     <<" computed residual "<<sqrt(cp/ssq)
     <<" true residual " <<true_residual
-    <<" target "<<Tolerance;
-  std::cout<<" Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
+    <<" target "<<Tolerance<<std::endl;
+  std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed();
   std::cout<<std::endl;

   assert(true_residual/Tolerance < 1000.0);
@@ -274,7 +274,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
 }
 // ugly hack
 std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-assert(0);
+// assert(0);
 }

 };
@@ -38,32 +38,34 @@ template<class Field>
 class SortEigen {
  private:

+//hacking for testing for now
+ private:
   static bool less_lmd(RealD left,RealD right){
-    return fabs(left) < fabs(right);
+    return left > right;
   }
-  static bool less_pair(std::pair<RealD,Field>& left,
-                        std::pair<RealD,Field>& right){
-    return fabs(left.first) < fabs(right.first);
+  static bool less_pair(std::pair<RealD,Field const*>& left,
+                        std::pair<RealD,Field const*>& right){
+    return left.first > (right.first);
   }

  public:

   void push(DenseVector<RealD>& lmd,
             DenseVector<Field>& evec,int N) {
+    DenseVector<Field> cpy(lmd.size(),evec[0]._grid);
+    for(int i=0;i<lmd.size();i++) cpy[i] = evec[i];

-    DenseVector<std::pair<RealD, Field> > emod;
-    typename DenseVector<std::pair<RealD, Field> >::iterator it;
-
-    for(int i=0;i<lmd.size();++i){
-      emod.push_back(std::pair<RealD,Field>(lmd[i],evec[i]));
-    }
+    DenseVector<std::pair<RealD, Field const*> > emod(lmd.size());
+    for(int i=0;i<lmd.size();++i)
+      emod[i] = std::pair<RealD,Field const*>(lmd[i],&cpy[i]);

     partial_sort(emod.begin(),emod.begin()+N,emod.end(),less_pair);

-    it=emod.begin();
+    typename DenseVector<std::pair<RealD, Field const*> >::iterator it = emod.begin();
     for(int i=0;i<N;++i){
       lmd[i]=it->first;
-      evec[i]=it->second;
+      evec[i]=*(it->second);
       ++it;
     }
   }
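The sort now orders by value descending instead of by magnitude, and it sorts (eigenvalue, pointer) pairs into a one-off copy so the heavy Field objects are moved only during the final writeback. A small standalone illustration of that pointer-pair partial_sort pattern, with std::string standing in for the heavy Field type:

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// Sort (value, payload) by value, descending, touching only pointers until the
// final writeback; mirrors the pattern used in SortEigen::push above.
void push(std::vector<double> &lmd, std::vector<std::string> &evec, int N) {
  std::vector<std::string> cpy(evec);                      // payloads copied once
  std::vector<std::pair<double, const std::string *>> emod(lmd.size());
  for (size_t i = 0; i < lmd.size(); ++i) emod[i] = { lmd[i], &cpy[i] };

  std::partial_sort(emod.begin(), emod.begin() + N, emod.end(),
                    [](const auto &a, const auto &b) { return a.first > b.first; });

  for (int i = 0; i < N; ++i) {                            // write back the top N
    lmd[i]  = emod[i].first;
    evec[i] = *emod[i].second;
  }
}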
@ -29,6 +29,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_IRL_H
 #define GRID_IRL_H

+#include <string.h> //memset
+#ifdef USE_LAPACK
+#include <lapacke.h>
+#endif
 #include <algorithms/iterative/DenseMatrix.h>
 #include <algorithms/iterative/EigenSort.h>

@ -49,6 +53,7 @@ public:
 int Niter;
 int converged;

+int Nstop;   // Number of evecs checked for convergence
 int Nk;      // Number of converged sought
 int Np;      // Np -- Number of spare vecs in kryloc space
 int Nm;      // Nm -- total number of vectors

@ -57,6 +62,8 @@ public:

 SortEigen<Field> _sort;

+// GridCartesian &_fgrid;

 LinearOperatorBase<Field> &_Linop;

 OperatorFunction<Field> &_poly;

@ -67,7 +74,27 @@ public:
 void init(void){};
 void Abort(int ff, DenseVector<RealD> &evals, DenseVector<DenseVector<RealD> > &evecs);

-ImplicitlyRestartedLanczos(LinearOperatorBase<Field> &Linop, // op
+ImplicitlyRestartedLanczos(
+       LinearOperatorBase<Field> &Linop, // op
+       OperatorFunction<Field> & poly,   // polynmial
+       int _Nstop, // sought vecs
+       int _Nk,    // sought vecs
+       int _Nm,    // spare vecs
+       RealD _eresid, // resid in lmdue deficit
+       int _Niter) :  // Max iterations
+  _Linop(Linop),
+  _poly(poly),
+  Nstop(_Nstop),
+  Nk(_Nk),
+  Nm(_Nm),
+  eresid(_eresid),
+  Niter(_Niter)
+{
+  Np = Nm-Nk; assert(Np>0);
+};

+ImplicitlyRestartedLanczos(
+       LinearOperatorBase<Field> &Linop, // op
        OperatorFunction<Field> & poly,   // polynmial
        int _Nk,    // sought vecs
        int _Nm,    // spare vecs
@ -75,6 +102,7 @@ public:
        int _Niter) :  // Max iterations
   _Linop(Linop),
   _poly(poly),
+  Nstop(_Nk),
   Nk(_Nk),
   Nm(_Nm),
   eresid(_eresid),
@ -142,6 +170,7 @@ public:
 RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
 // 7. vk+1 := wk/βk+1

+// std::cout << "alpha = " << zalph << " beta "<<beta<<std::endl;
 const RealD tiny = 1.0e-20;
 if ( beta < tiny ) {
 std::cout << " beta is tiny "<<beta<<std::endl;
@ -219,15 +248,122 @@ public:
 }
 }

+#ifdef USE_LAPACK
+void diagonalize_lapack(DenseVector<RealD>& lmd,
+                        DenseVector<RealD>& lme,
+                        int N1,
+                        int N2,
+                        DenseVector<RealD>& Qt,
+                        GridBase *grid){
+  const int size = Nm;
+  // tevals.resize(size);
+  // tevecs.resize(size);
+  int NN = N1;
+  double evals_tmp[NN];
+  double evec_tmp[NN][NN];
+  memset(evec_tmp[0],0,sizeof(double)*NN*NN);
+  // double AA[NN][NN];
+  double DD[NN];
+  double EE[NN];
+  for (int i = 0; i< NN; i++)
+    for (int j = i - 1; j <= i + 1; j++)
+      if ( j < NN && j >= 0 ) {
+        if (i==j) DD[i] = lmd[i];
+        if (i==j) evals_tmp[i] = lmd[i];
+        if (j==(i-1)) EE[j] = lme[j];
+      }
+  int evals_found;
+  int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
+  int liwork = 3+NN*10 ;
+  int iwork[liwork];
+  double work[lwork];
+  int isuppz[2*NN];
+  char jobz = 'V'; // calculate evals & evecs
+  char range = 'I'; // calculate all evals
+  // char range = 'A'; // calculate all evals
+  char uplo = 'U'; // refer to upper half of original matrix
+  char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
+  int ifail[NN];
+  int info;
+  // int total = QMP_get_number_of_nodes();
+  // int node = QMP_get_node_number();
+  // GridBase *grid = evec[0]._grid;
+  int total = grid->_Nprocessors;
+  int node = grid->_processor;
+  int interval = (NN/total)+1;
+  double vl = 0.0, vu = 0.0;
+  int il = interval*node+1 , iu = interval*(node+1);
+  if (iu > NN) iu=NN;
+  double tol = 0.0;
+  if (1) {
+    memset(evals_tmp,0,sizeof(double)*NN);
+    if ( il <= NN){
+      printf("total=%d node=%d il=%d iu=%d\n",total,node,il,iu);
+      LAPACK_dstegr(&jobz, &range, &NN,
+                    (double*)DD, (double*)EE,
+                    &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
+                    &tol, // tolerance
+                    &evals_found, evals_tmp, (double*)evec_tmp, &NN,
+                    isuppz,
+                    work, &lwork, iwork, &liwork,
+                    &info);
+      for (int i = iu-1; i>= il-1; i--){
+        printf("node=%d evals_found=%d evals_tmp[%d] = %g\n",node,evals_found, i - (il-1),evals_tmp[i - (il-1)]);
+        evals_tmp[i] = evals_tmp[i - (il-1)];
+        if (il>1) evals_tmp[i-(il-1)]=0.;
+        for (int j = 0; j< NN; j++){
+          evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
+          if (il>1) evec_tmp[i-(il-1)][j]=0.;
+        }
+      }
+    }
+    {
+      // QMP_sum_double_array(evals_tmp,NN);
+      // QMP_sum_double_array((double *)evec_tmp,NN*NN);
+      grid->GlobalSumVector(evals_tmp,NN);
+      grid->GlobalSumVector((double*)evec_tmp,NN*NN);
+    }
+  }
+  // cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
+  for(int i=0;i<NN;i++){
+    for(int j=0;j<NN;j++)
+      Qt[(NN-1-i)*N2+j]=evec_tmp[i][j];
+    lmd [NN-1-i]=evals_tmp[i];
+  }
+}
+#endif

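The closing loop of diagonalize_lapack reverses the index order because dstegr reports eigenvalues in increasing order while the in-house QR path produces them in decreasing order. A minimal illustration of just that reindexing step (plain arrays, illustrative only, not Grid code):

// Illustrative only: LAPACK returns evals_tmp ascending; writing entry i into
// slot NN-1-i reproduces the descending order that the QR routine produces.
#include <iostream>

int main(void) {
  const int NN = 4;
  double evals_tmp[NN] = { -3.0, 0.5, 1.0, 2.0 }; // ascending, as from LAPACK
  double lmd[NN];
  for (int i = 0; i < NN; i++) lmd[NN-1-i] = evals_tmp[i];
  for (int i = 0; i < NN; i++) std::cout << lmd[i] << " "; // 2 1 0.5 -3 : descending
  std::cout << std::endl;
}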
 void diagonalize(DenseVector<RealD>& lmd,
                  DenseVector<RealD>& lme,
-                 int Nm2,
-                 int Nm,
-                 DenseVector<RealD>& Qt)
+                 int N2,
+                 int N1,
+                 DenseVector<RealD>& Qt,
+                 GridBase *grid)
 {
-  int Niter = 100*Nm;
+#ifdef USE_LAPACK
+  const int check_lapack=0; // just use lapack if 0, check against lapack if 1
+
+  if(!check_lapack)
+    return diagonalize_lapack(lmd,lme,N2,N1,Qt,grid);
+
+  DenseVector <RealD> lmd2(N1);
+  DenseVector <RealD> lme2(N1);
+  DenseVector<RealD> Qt2(N1*N1);
+  for(int k=0; k<N1; ++k){
+    lmd2[k] = lmd[k];
+    lme2[k] = lme[k];
+  }
+  for(int k=0; k<N1*N1; ++k)
+    Qt2[k] = Qt[k];
+
+  // diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
+#endif
+
+  int Niter = 100*N1;
   int kmin = 1;
-  int kmax = Nk;
+  int kmax = N2;
   // (this should be more sophisticated)

   for(int iter=0; iter<Niter; ++iter){
@ -239,7 +375,7 @@ public:
   // (Dsh: shift)

   // transformation
-  qr_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax);
+  qr_decomp(lmd,lme,N2,N1,Qt,Dsh,kmin,kmax);

   // Convergence criterion (redef of kmin and kamx)
   for(int j=kmax-1; j>= kmin; --j){
@ -250,6 +386,23 @@ public:
   }
   }
   Niter = iter;
+#ifdef USE_LAPACK
+  if(check_lapack){
+    const double SMALL=1e-8;
+    diagonalize_lapack(lmd2,lme2,N2,N1,Qt2,grid);
+    DenseVector <RealD> lmd3(N2);
+    for(int k=0; k<N2; ++k) lmd3[k]=lmd[k];
+    _sort.push(lmd3,N2);
+    _sort.push(lmd2,N2);
+    for(int k=0; k<N2; ++k){
+      if (fabs(lmd2[k] - lmd3[k]) >SMALL) std::cout <<"lmd(qr) lmd(lapack) "<< k << ": " << lmd2[k] <<" "<< lmd3[k] <<std::endl;
+      // if (fabs(lme2[k] - lme[k]) >SMALL) std::cout <<"lme(qr)-lme(lapack) "<< k << ": " << lme2[k] - lme[k] <<std::endl;
+    }
+    for(int k=0; k<N1*N1; ++k){
+      // if (fabs(Qt2[k] - Qt[k]) >SMALL) std::cout <<"Qt(qr)-Qt(lapack) "<< k << ": " << Qt2[k] - Qt[k] <<std::endl;
+    }
+  }
+#endif
   return;

 continued:
@ -265,6 +418,7 @@ public:
   abort();
 }

+#if 1
 static RealD normalise(Field& v)
 {
   RealD nn = norm2(v);
@ -326,6 +480,7 @@ until convergence
 {

 GridBase *grid = evec[0]._grid;
+assert(grid == src._grid);

 std::cout << " -- Nk = " << Nk << " Np = "<< Np << std::endl;
 std::cout << " -- Nm = " << Nm << std::endl;
@ -356,11 +511,21 @@ until convergence
 // (uniform vector) Why not src??
 // evec[0] = 1.0;
 evec[0] = src;
+std:: cout <<"norm2(src)= " << norm2(src)<<std::endl;
+// << src._grid << std::endl;
 normalise(evec[0]);
+std:: cout <<"norm2(evec[0])= " << norm2(evec[0]) <<std::endl;
+// << evec[0]._grid << std::endl;

 // Initial Nk steps
 for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
+// std:: cout <<"norm2(evec[1])= " << norm2(evec[1]) << std::endl;
+// std:: cout <<"norm2(evec[2])= " << norm2(evec[2]) << std::endl;
 RitzMatrix(evec,Nk);
+for(int k=0; k<Nk; ++k){
+  // std:: cout <<"eval " << k << " " <<eval[k] << std::endl;
+  // std:: cout <<"lme " << k << " " << lme[k] << std::endl;
+}

 // Restarting loop begins
 for(int iter = 0; iter<Niter; ++iter){
@ -382,20 +547,24 @@ until convergence
 lme2[k] = lme[k+k1-1];
 }
 setUnit_Qt(Nm,Qt);
-diagonalize(eval2,lme2,Nm,Nm,Qt);
+diagonalize(eval2,lme2,Nm,Nm,Qt,grid);

 // sorting
 _sort.push(eval2,Nm);

 // Implicitly shifted QR transformations
 setUnit_Qt(Nm,Qt);
-for(int ip=k2; ip<Nm; ++ip)
+for(int ip=k2; ip<Nm; ++ip){
+  std::cout << "qr_decomp "<< ip << " "<< eval2[ip] << std::endl;
 qr_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);

+}

 for(int i=0; i<(Nk+1); ++i) B[i] = 0.0;

 for(int j=k1-1; j<k2+1; ++j){
 for(int k=0; k<Nm; ++k){
+B[j].checkerboard = evec[k].checkerboard;
 B[j] += Qt[k+Nm*j] * evec[k];
 }
 }
@ -418,21 +587,25 @@ until convergence
 lme2[k] = lme[k];
 }
 setUnit_Qt(Nm,Qt);
-diagonalize(eval2,lme2,Nk,Nm,Qt);
+diagonalize(eval2,lme2,Nk,Nm,Qt,grid);

 for(int k = 0; k<Nk; ++k) B[k]=0.0;

 for(int j = 0; j<Nk; ++j){
 for(int k = 0; k<Nk; ++k){
+B[j].checkerboard = evec[k].checkerboard;
 B[j] += Qt[k+j*Nm] * evec[k];
 }
+// std::cout << "norm(B["<<j<<"])="<<norm2(B[j])<<std::endl;
 }
+// _sort.push(eval2,B,Nk);

 Nconv = 0;
 // std::cout << std::setiosflags(std::ios_base::scientific);
 for(int i=0; i<Nk; ++i){

-_poly(_Linop,B[i],v);
+// _poly(_Linop,B[i],v);
+_Linop.HermOp(B[i],v);

 RealD vnum = real(innerProduct(B[i],v)); // HermOp.
 RealD vden = norm2(B[i]);
@ -440,11 +613,13 @@ until convergence
 v -= eval2[i]*B[i];
 RealD vv = norm2(v);

+std::cout.precision(13);
 std::cout << "[" << std::setw(3)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
 std::cout << "eval = "<<std::setw(25)<< std::setiosflags(std::ios_base::left)<< eval2[i];
 std::cout <<" |H B[i] - eval[i]B[i]|^2 "<< std::setw(25)<< std::setiosflags(std::ios_base::right)<< vv<< std::endl;

-if(vv<eresid*eresid){
+// change the criteria as evals are supposed to be sorted, all evals smaller(larger) than Nstop should have converged
+if((vv<eresid*eresid) && (i == Nconv) ){
 Iconv[Nconv] = i;
 ++Nconv;
 }
@ -455,7 +630,7 @@ until convergence

 std::cout<<" #modes converged: "<<Nconv<<std::endl;

-if( Nconv>=Nk ){
+if( Nconv>=Nstop ){
 goto converged;
 }
 } // end of iter loop
@ -465,12 +640,11 @@ until convergence

 converged:
 // Sorting
-eval.clear();
-evec.clear();
+eval.resize(Nconv);
+evec.resize(Nconv,grid);
 for(int i=0; i<Nconv; ++i){
-eval.push_back(eval2[Iconv[i]]);
-evec.push_back(B[Iconv[i]]);
+eval[i] = eval2[Iconv[i]];
+evec[i] = B[Iconv[i]];
 }
 _sort.push(eval,evec,Nconv);

@ -1025,6 +1199,7 @@ static void Lock(DenseMatrix<T> &H, ///Hess mtx

 }
 }
+#endif

 };

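The revised acceptance test only counts mode i as converged when every lower mode has already converged, so Nconv measures a contiguous leading block of converged modes rather than a scattered set. A small standalone sketch of that rule (illustrative values only, not Grid code):

// Sketch: Nconv is the length of the leading contiguous block whose residual
// |H B[i] - eval[i] B[i]|^2 is below eresid^2, mirroring the new (i == Nconv) test.
#include <iostream>
#include <vector>

int main(void) {
  double eresid = 1.0e-3;
  std::vector<double> vv = {1e-8, 1e-9, 1e-2, 1e-10}; // residual norms per mode
  int Nconv = 0;
  for (int i = 0; i < (int)vv.size(); ++i) {
    if ( (vv[i] < eresid*eresid) && (i == Nconv) ) ++Nconv;
  }
  std::cout << "Nconv = " << Nconv << std::endl; // 2 : mode 3 is ignored because mode 2 failed
}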
@ -47,6 +47,10 @@ namespace Grid {
 int mmax;
 int nstep;
 int steps;
+GridStopWatch PrecTimer;
+GridStopWatch MatTimer;
+GridStopWatch LinalgTimer;

 LinearFunction<Field> &Preconditioner;

 PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
@ -68,14 +72,24 @@ namespace Grid {

 Field r(src._grid);

+PrecTimer.Reset();
+MatTimer.Reset();
+LinalgTimer.Reset();

+GridStopWatch SolverTimer;
+SolverTimer.Start();

 steps=0;
 for(int k=0;k<MaxIterations;k++){

 cp=GCRnStep(Linop,src,psi,rsq);

-if ( verbose ) std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
+std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;

 if(cp<rsq) {

+SolverTimer.Stop();

 Linop.HermOp(psi,r);
 axpy(r,-1.0,src,r);
 RealD tr = norm2(r);
@ -83,6 +97,11 @@ namespace Grid {
 << " computed residual "<<sqrt(cp/ssq)
 << " true residual " <<sqrt(tr/ssq)
 << " target " <<Tolerance <<std::endl;

+std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total  "<< SolverTimer.Elapsed() <<std::endl;
+std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
+std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
+std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
 return;
 }

@ -90,6 +109,7 @@ namespace Grid {
 std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
 assert(0);
 }

 RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){

 RealD cp;
@ -116,24 +136,25 @@ namespace Grid {
 // initial guess x0 is taken as nonzero.
 // r0=src-A x0 = src
 //////////////////////////////////
+MatTimer.Start();
 Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
+MatTimer.Stop();
 r=src-Az;

 /////////////////////
 // p = Prec(r)
 /////////////////////
+PrecTimer.Start();
 Preconditioner(r,z);
+PrecTimer.Stop();

-std::cout<<GridLogMessage<< " Preconditioner in " << norm2(r)<<std::endl;
-std::cout<<GridLogMessage<< " Preconditioner out " << norm2(z)<<std::endl;
+MatTimer.Start();

 Linop.HermOp(z,tmp);
+MatTimer.Stop();

-std::cout<<GridLogMessage<< " Preconditioner Aout " << norm2(tmp)<<std::endl;
 ttmp=tmp;
 tmp=tmp-r;

-std::cout<<GridLogMessage<< " Preconditioner resid " << std::sqrt(norm2(tmp)/norm2(r))<<std::endl;
 /*
 std::cout<<GridLogMessage<<r<<std::endl;
 std::cout<<GridLogMessage<<z<<std::endl;
@ -141,7 +162,9 @@ namespace Grid {
 std::cout<<GridLogMessage<<tmp<<std::endl;
 */

+MatTimer.Start();
 Linop.HermOpAndNorm(z,Az,zAz,zAAz);
+MatTimer.Stop();

 //p[0],q[0],qq[0]
 p[0]= z;
@ -165,16 +188,20 @@ namespace Grid {

 cp = axpy_norm(r,-a,q[peri_k],r);

-std::cout<<GridLogMessage<< " VPGCR_step resid" <<sqrt(cp/rsq)<<std::endl;
 if((k==nstep-1)||(cp<rsq)){
 return cp;
 }

+std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;

+PrecTimer.Start();
 Preconditioner(r,z);// solve Az = r
+PrecTimer.Stop();

+MatTimer.Start();
 Linop.HermOpAndNorm(z,Az,zAz,zAAz);


 Linop.HermOp(z,tmp);
+MatTimer.Stop();
 tmp=tmp-r;
 std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;

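The timers above follow a simple reset/start/stop/elapsed cycle wrapped around the preconditioner and matrix applications. A self-contained stand-in using std::chrono (GridStopWatch itself is only assumed here to expose the same four calls used in the patch; this is not the Grid implementation):

// Stand-in sketch only: mirrors the Reset/Start/Stop/Elapsed pattern used with
// GridStopWatch; std::chrono is used so the example compiles on its own.
#include <chrono>
#include <iostream>

struct StopWatch {
  std::chrono::steady_clock::duration accum{};
  std::chrono::steady_clock::time_point t0;
  void Reset(void) { accum = std::chrono::steady_clock::duration::zero(); }
  void Start(void) { t0 = std::chrono::steady_clock::now(); }
  void Stop(void)  { accum += std::chrono::steady_clock::now() - t0; }
  double ElapsedMs(void) { return std::chrono::duration<double,std::milli>(accum).count(); }
};

int main(void) {
  StopWatch PrecTimer, MatTimer;
  PrecTimer.Reset(); MatTimer.Reset();
  for (int k = 0; k < 3; k++) {
    MatTimer.Start();  /* Linop.HermOpAndNorm(...) would run here */  MatTimer.Stop();
    PrecTimer.Start(); /* Preconditioner(r,z) would run here */       PrecTimer.Stop();
  }
  std::cout << "Matrix " << MatTimer.ElapsedMs() << " ms, Precon " << PrecTimer.ElapsedMs() << " ms" << std::endl;
}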
@ -102,6 +102,8 @@ namespace Grid {

 pickCheckerboard(Even,src_e,in);
 pickCheckerboard(Odd ,src_o,in);
+pickCheckerboard(Even,sol_e,out);
+pickCheckerboard(Odd ,sol_o,out);

 /////////////////////////////////////////////////////
 // src_o = Mdag * (source_o - Moe MeeInv source_e)
@ -115,27 +115,11 @@ public:
 for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
 return idx;
 }
-static inline void CoorFromIndex (std::vector<int>& coor,int index,std::vector<int> &dims){
-  int nd= dims.size();
-  coor.resize(nd);
-  for(int d=0;d<nd;d++){
-    coor[d] = index % dims[d];
-    index = index / dims[d];
-  }
-}
 inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
-  CoorFromIndex(coor,Oindex,_rdimensions);
+  Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
-}
-static inline void IndexFromCoor (std::vector<int>& coor,int &index,std::vector<int> &dims){
-  int nd=dims.size();
-  int stride=1;
-  index=0;
-  for(int d=0;d<nd;d++){
-    index = index+stride*coor[d];
-    stride=stride*dims[d];
-  }
 }


 //////////////////////////////////////////////////////////
 // SIMD lane addressing
 //////////////////////////////////////////////////////////
@ -147,13 +131,32 @@ public:
 }
 inline void iCoorFromIindex(std::vector<int> &coor,int lane)
 {
-  CoorFromIndex(coor,lane,_simd_layout);
+  Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
 }
 inline int PermuteDim(int dimension){
 return _simd_layout[dimension]>1;
 }
 inline int PermuteType(int dimension){
 int permute_type=0;
+//
+// FIXME:
+//
+// Best way to encode this would be to present a mask
+// for which simd dimensions are rotated, and the rotation
+// size. If there is only one simd dimension rotated, this is just
+// a permute.
+//
+// Cases: PermuteType == 1,2,4,8
+// Distance should be either 0,1,2..
+//
+if ( _simd_layout[dimension] > 2 ) {
+  for(int d=0;d<_ndimension;d++){
+    if ( d != dimension ) assert ( (_simd_layout[d]==1) );
+  }
+  permute_type = RotateBit; // How to specify distance; this is not just direction.
+  return permute_type;
+}

 for(int d=_ndimension-1;d>dimension;d--){
 if (_simd_layout[d]>1 ) permute_type++;
 }
@ -163,12 +166,12 @@ public:
 // Array sizing queries
 ////////////////////////////////////////////////////////////////

-inline int iSites(void) { return _isites; };
-inline int Nsimd(void) { return _isites; };// Synonymous with iSites
-inline int oSites(void) { return _osites; };
-inline int lSites(void) { return _isites*_osites; };
-inline int gSites(void) { return _isites*_osites*_Nprocessors; };
-inline int Nd (void) { return _ndimension;};
+inline int iSites(void) const { return _isites; };
+inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
+inline int oSites(void) const { return _osites; };
+inline int lSites(void) const { return _isites*_osites; };
+inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
+inline int Nd (void) const { return _ndimension;};

 inline const std::vector<int> &FullDimensions(void) { return _fdimensions;};
 inline const std::vector<int> &GlobalDimensions(void) { return _gdimensions;};
@ -179,7 +182,10 @@ public:
 // Global addressing
 ////////////////////////////////////////////////////////////////
 void GlobalIndexToGlobalCoor(int gidx,std::vector<int> &gcoor){
-  CoorFromIndex(gcoor,gidx,_gdimensions);
+  Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
+}
+void LocalIndexToLocalCoor(int lidx,std::vector<int> &lcoor){
+  Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
 }
 void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
 gidx=0;
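The removed static helpers and the Lexicographic:: calls that replace them implement the usual lexicographic index/coordinate mapping: the index is a mixed-radix number with the dimension sizes as radices. A standalone sketch of the two inverse maps, copied in spirit from the deleted code:

// Sketch of the mapping the deleted CoorFromIndex/IndexFromCoor implemented and
// that the Lexicographic:: helpers are assumed to provide.
#include <cassert>
#include <vector>

static void CoorFromIndex(std::vector<int> &coor, int index, const std::vector<int> &dims) {
  coor.resize(dims.size());
  for (size_t d = 0; d < dims.size(); d++) { coor[d] = index % dims[d]; index /= dims[d]; }
}
static void IndexFromCoor(const std::vector<int> &coor, int &index, const std::vector<int> &dims) {
  int stride = 1; index = 0;
  for (size_t d = 0; d < dims.size(); d++) { index += stride*coor[d]; stride *= dims[d]; }
}

int main(void) {
  std::vector<int> dims = {4,4,4,8}, coor;
  CoorFromIndex(coor, 115, dims);         // 115 = 3 + 4*(0 + 4*(3 + 4*1)) -> coor = {3,0,3,1}
  int back; IndexFromCoor(coor, back, dims);
  assert(back == 115);                    // the two maps are inverses
}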
@ -170,9 +170,15 @@ public:
 // Use a reduced simd grid
 _simd_layout[d] = simd_layout[d];
 _rdimensions[d]= _ldimensions[d]/_simd_layout[d];
+assert(_rdimensions[d]>0);

 // all elements of a simd vector must have same checkerboard.
-if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0);
+// If Ls vectorised, this must still be the case; e.g. dwf rb5d
+if ( _simd_layout[d]>1 ) {
+  if ( d != _checker_dim ) {
+    assert( (_rdimensions[d]&0x1) == 0 );
+  }
+}

 _osites *= _rdimensions[d];
 _isites *= _simd_layout[d];
@ -34,6 +34,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_COMMS_MPI
|
#ifdef GRID_COMMS_MPI
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GRID_COMMS_SHMEM
|
||||||
|
#include <mpp/shmem.h>
|
||||||
|
#endif
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
class CartesianCommunicator {
|
class CartesianCommunicator {
|
||||||
public:
|
public:
|
||||||
@ -53,6 +56,8 @@ class CartesianCommunicator {
|
|||||||
typedef int CommsRequest_t;
|
typedef int CommsRequest_t;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static void Init(int *argc, char ***argv);
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
CartesianCommunicator(const std::vector<int> &pdimensions_in);
|
CartesianCommunicator(const std::vector<int> &pdimensions_in);
|
||||||
|
|
||||||
@ -81,6 +86,7 @@ class CartesianCommunicator {
|
|||||||
void GlobalSumVector(RealD *,int N);
|
void GlobalSumVector(RealD *,int N);
|
||||||
|
|
||||||
void GlobalSum(uint32_t &);
|
void GlobalSum(uint32_t &);
|
||||||
|
void GlobalSum(uint64_t &);
|
||||||
|
|
||||||
void GlobalSum(ComplexF &c)
|
void GlobalSum(ComplexF &c)
|
||||||
{
|
{
|
||||||
@ -115,11 +121,10 @@ class CartesianCommunicator {
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes);
|
int bytes);
|
||||||
|
|
||||||
void RecvFrom(void *recv,
|
void SendRecvPacket(void *xmit,
|
||||||
int recv_from_rank,
|
void *recv,
|
||||||
int bytes);
|
|
||||||
void SendTo(void *xmit,
|
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
|
int recv_from_rank,
|
||||||
int bytes);
|
int bytes);
|
||||||
|
|
||||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
@ -31,6 +31,19 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
// Should error check all MPI calls.
|
// Should error check all MPI calls.
|
||||||
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||||
|
int flag;
|
||||||
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||||
|
if ( !flag ) {
|
||||||
|
MPI_Init(argc,argv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int Rank(void) {
|
||||||
|
int pe;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD,&pe);
|
||||||
|
return pe;
|
||||||
|
}
|
||||||
|
|
||||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||||
{
|
{
|
||||||
@ -59,6 +72,10 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
|
|||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
void CartesianCommunicator::GlobalSum(float &f){
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
@ -108,21 +125,22 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||||
SendToRecvFromComplete(reqs);
|
SendToRecvFromComplete(reqs);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::RecvFrom(void *recv,
|
|
||||||
int from,
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int sender,
|
||||||
|
int receiver,
|
||||||
int bytes)
|
int bytes)
|
||||||
{
|
{
|
||||||
MPI_Status stat;
|
MPI_Status stat;
|
||||||
int ierr=MPI_Recv(recv, bytes, MPI_CHAR,from,from,communicator,&stat);
|
assert(sender != receiver);
|
||||||
assert(ierr==0);
|
int tag = sender;
|
||||||
|
if ( _processor == sender ) {
|
||||||
|
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
||||||
|
}
|
||||||
|
if ( _processor == receiver ) {
|
||||||
|
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::SendTo(void *xmit,
|
|
||||||
int dest,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
int rank = _processor; // used for tag; must know who it comes from
|
|
||||||
int ierr = MPI_Send(xmit, bytes, MPI_CHAR,dest,_processor,communicator);
|
|
||||||
assert(ierr==0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Basic Halo comms primitive
|
// Basic Halo comms primitive
|
||||||
|
@ -28,6 +28,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include "Grid.h"
|
#include "Grid.h"
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
|
void CartesianCommunicator::Init(int *argc, char *** arv)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
int Rank(void ){ return 0; };
|
||||||
|
|
||||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||||
{
|
{
|
||||||
_processors = processors;
|
_processors = processors;
|
||||||
@ -47,16 +53,13 @@ void CartesianCommunicator::GlobalSum(float &){}
|
|||||||
void CartesianCommunicator::GlobalSumVector(float *,int N){}
|
void CartesianCommunicator::GlobalSumVector(float *,int N){}
|
||||||
void CartesianCommunicator::GlobalSum(double &){}
|
void CartesianCommunicator::GlobalSum(double &){}
|
||||||
void CartesianCommunicator::GlobalSum(uint32_t &){}
|
void CartesianCommunicator::GlobalSum(uint32_t &){}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint64_t &){}
|
||||||
void CartesianCommunicator::GlobalSumVector(double *,int N){}
|
void CartesianCommunicator::GlobalSumVector(double *,int N){}
|
||||||
|
|
||||||
void CartesianCommunicator::RecvFrom(void *recv,
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
int recv_from_rank,
|
void *recv,
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::SendTo(void *xmit,
|
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
|
int recv_from_rank,
|
||||||
int bytes)
|
int bytes)
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
|
334
lib/communicator/Communicator_shmem.cc
Normal file
334
lib/communicator/Communicator_shmem.cc
Normal file
@ -0,0 +1,334 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/communicator/Communicator_shmem.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include "Grid.h"
|
||||||
|
#include <mpp/shmem.h>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
// Should error check all MPI calls.
|
||||||
|
#define SHMEM_VET(addr)
|
||||||
|
|
||||||
|
#define SHMEM_VET_DEBUG(addr) { \
|
||||||
|
if ( ! shmem_addr_accessible(addr,_processor) ) {\
|
||||||
|
std::fprintf(stderr,"%d Inaccessible shmem address %lx %s %s\n",_processor,addr,__FUNCTION__,#addr); \
|
||||||
|
BACKTRACEFILE(); \
|
||||||
|
}\
|
||||||
|
}
|
||||||
|
int Rank(void) {
|
||||||
|
return shmem_my_pe();
|
||||||
|
}
|
||||||
|
typedef struct HandShake_t {
|
||||||
|
uint64_t seq_local;
|
||||||
|
uint64_t seq_remote;
|
||||||
|
} HandShake;
|
||||||
|
|
||||||
|
static Vector< HandShake > XConnections;
|
||||||
|
static Vector< HandShake > RConnections;
|
||||||
|
|
||||||
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||||
|
shmem_init();
|
||||||
|
XConnections.resize(shmem_n_pes());
|
||||||
|
RConnections.resize(shmem_n_pes());
|
||||||
|
for(int pe =0 ; pe<shmem_n_pes();pe++){
|
||||||
|
XConnections[pe].seq_local = 0;
|
||||||
|
XConnections[pe].seq_remote= 0;
|
||||||
|
RConnections[pe].seq_local = 0;
|
||||||
|
RConnections[pe].seq_remote= 0;
|
||||||
|
}
|
||||||
|
shmem_barrier_all();
|
||||||
|
}
|
||||||
|
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
|
||||||
|
{
|
||||||
|
_ndimension = processors.size();
|
||||||
|
std::vector<int> periodic(_ndimension,1);
|
||||||
|
|
||||||
|
_Nprocessors=1;
|
||||||
|
_processors = processors;
|
||||||
|
_processor_coor.resize(_ndimension);
|
||||||
|
|
||||||
|
_processor = shmem_my_pe();
|
||||||
|
|
||||||
|
Lexicographic::CoorFromIndex(_processor_coor,_processor,_processors);
|
||||||
|
|
||||||
|
for(int i=0;i<_ndimension;i++){
|
||||||
|
_Nprocessors*=_processors[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
int Size = shmem_n_pes();
|
||||||
|
|
||||||
|
|
||||||
|
assert(Size==_Nprocessors);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||||
|
static long long source ;
|
||||||
|
static long long dest ;
|
||||||
|
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
// int nreduce=1;
|
||||||
|
// int pestart=0;
|
||||||
|
// int logStride=0;
|
||||||
|
|
||||||
|
source = u;
|
||||||
|
dest = 0;
|
||||||
|
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
shmem_barrier_all(); // necessary?
|
||||||
|
u = dest;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||||
|
static long long source ;
|
||||||
|
static long long dest ;
|
||||||
|
static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
// int nreduce=1;
|
||||||
|
// int pestart=0;
|
||||||
|
// int logStride=0;
|
||||||
|
|
||||||
|
source = u;
|
||||||
|
dest = 0;
|
||||||
|
shmem_longlong_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
shmem_barrier_all(); // necessary?
|
||||||
|
u = dest;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
|
static float source ;
|
||||||
|
static float dest ;
|
||||||
|
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
source = f;
|
||||||
|
dest =0.0;
|
||||||
|
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
f = dest;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||||
|
{
|
||||||
|
static float source ;
|
||||||
|
static float dest = 0 ;
|
||||||
|
static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
if ( shmem_addr_accessible(f,_processor) ){
|
||||||
|
shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
dest =0.0;
|
||||||
|
source = f[i];
|
||||||
|
shmem_float_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
f[i] = dest;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(double &d)
|
||||||
|
{
|
||||||
|
static double source;
|
||||||
|
static double dest ;
|
||||||
|
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
source = d;
|
||||||
|
dest = 0;
|
||||||
|
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
d = dest;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||||
|
{
|
||||||
|
static double source ;
|
||||||
|
static double dest ;
|
||||||
|
static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
|
||||||
|
if ( shmem_addr_accessible(d,_processor) ){
|
||||||
|
shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<N;i++){
|
||||||
|
source = d[i];
|
||||||
|
dest =0.0;
|
||||||
|
shmem_double_sum_to_all(&dest,&source,1,0,0,_Nprocessors,llwrk,psync);
|
||||||
|
d[i] = dest;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
|
||||||
|
{
|
||||||
|
std::vector<int> coor = _processor_coor;
|
||||||
|
|
||||||
|
assert(std::abs(shift) <_processors[dim]);
|
||||||
|
|
||||||
|
coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim];
|
||||||
|
Lexicographic::IndexFromCoor(coor,source,_processors);
|
||||||
|
|
||||||
|
coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim];
|
||||||
|
Lexicographic::IndexFromCoor(coor,dest,_processors);
|
||||||
|
|
||||||
|
}
|
||||||
|
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
int rank;
|
||||||
|
Lexicographic::IndexFromCoor(coor,rank,_processors);
|
||||||
|
return rank;
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
|
||||||
|
{
|
||||||
|
Lexicographic::CoorFromIndex(coor,rank,_processors);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
SHMEM_VET(xmit);
|
||||||
|
SHMEM_VET(recv);
|
||||||
|
std::vector<CommsRequest_t> reqs(0);
|
||||||
|
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
||||||
|
SendToRecvFromComplete(reqs);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
||||||
|
void *recv,
|
||||||
|
int sender,
|
||||||
|
int receiver,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
static uint64_t seq;
|
||||||
|
|
||||||
|
assert(recv!=xmit);
|
||||||
|
volatile HandShake *RecvSeq = (volatile HandShake *) & RConnections[sender];
|
||||||
|
volatile HandShake *SendSeq = (volatile HandShake *) & XConnections[receiver];
|
||||||
|
|
||||||
|
if ( _processor == sender ) {
|
||||||
|
|
||||||
|
printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
|
||||||
|
// Check he has posted a receive
|
||||||
|
while(SendSeq->seq_remote == SendSeq->seq_local);
|
||||||
|
|
||||||
|
printf("Sender receive %d posted\n",sender,receiver);
|
||||||
|
|
||||||
|
// Advance our send count
|
||||||
|
seq = ++(SendSeq->seq_local);
|
||||||
|
|
||||||
|
// Send this packet
|
||||||
|
SHMEM_VET(recv);
|
||||||
|
shmem_putmem(recv,xmit,bytes,receiver);
|
||||||
|
shmem_fence();
|
||||||
|
|
||||||
|
printf("Sender sent payload %d\n",seq);
|
||||||
|
//Notify him we're done
|
||||||
|
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
|
||||||
|
shmem_fence();
|
||||||
|
printf("Sender ringing door bell %d\n",seq);
|
||||||
|
}
|
||||||
|
if ( _processor == receiver ) {
|
||||||
|
|
||||||
|
printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
|
||||||
|
// Post a receive
|
||||||
|
seq = ++(RecvSeq->seq_local);
|
||||||
|
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
|
||||||
|
|
||||||
|
printf("Receiver Opening letter box %d\n",seq);
|
||||||
|
|
||||||
|
|
||||||
|
// Now wait until he has advanced our reception counter
|
||||||
|
while(RecvSeq->seq_remote != RecvSeq->seq_local);
|
||||||
|
|
||||||
|
printf("Receiver Got the mail %d\n",seq);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Basic Halo comms primitive
|
||||||
|
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
|
void *xmit,
|
||||||
|
int dest,
|
||||||
|
void *recv,
|
||||||
|
int from,
|
||||||
|
int bytes)
|
||||||
|
{
|
||||||
|
SHMEM_VET(xmit);
|
||||||
|
SHMEM_VET(recv);
|
||||||
|
// shmem_putmem_nb(recv,xmit,bytes,dest,NULL);
|
||||||
|
shmem_putmem(recv,xmit,bytes,dest);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||||
|
{
|
||||||
|
// shmem_quiet(); // I'm done
|
||||||
|
shmem_barrier_all();// He's done too
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::Barrier(void)
|
||||||
|
{
|
||||||
|
shmem_barrier_all();
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
static uint32_t word;
|
||||||
|
uint32_t *array = (uint32_t *) data;
|
||||||
|
assert( (bytes % 4)==0);
|
||||||
|
int words = bytes/4;
|
||||||
|
|
||||||
|
if ( shmem_addr_accessible(data,_processor) ){
|
||||||
|
shmem_broadcast32(data,data,words,root,0,0,shmem_n_pes(),psync);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
word = array[w];
|
||||||
|
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
|
||||||
|
if ( shmem_my_pe() != root ) {
|
||||||
|
array[w] = word;
|
||||||
|
}
|
||||||
|
shmem_barrier_all();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||||
|
{
|
||||||
|
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
|
||||||
|
static uint32_t word;
|
||||||
|
uint32_t *array = (uint32_t *) data;
|
||||||
|
assert( (bytes % 4)==0);
|
||||||
|
int words = bytes/4;
|
||||||
|
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
word = array[w];
|
||||||
|
shmem_broadcast32((void *)&word,(void *)&word,1,root,0,0,shmem_n_pes(),psync);
|
||||||
|
if ( shmem_my_pe() != root ) {
|
||||||
|
array[w]= word;
|
||||||
|
}
|
||||||
|
shmem_barrier_all();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -35,7 +35,7 @@ class SimpleCompressor {
|
|||||||
public:
|
public:
|
||||||
void Point(int) {};
|
void Point(int) {};
|
||||||
|
|
||||||
vobj operator() (const vobj &arg,int dimension,int plane,int osite,GridBase *grid) {
|
vobj operator() (const vobj &arg) {
|
||||||
return arg;
|
return arg;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -56,24 +56,24 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
int stride=rhs._grid->_slice_stride[dimension];
|
||||||
if ( cbmask == 0x3 ) {
|
if ( cbmask == 0x3 ) {
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*stride;
|
||||||
int bo = n*rhs._grid->_slice_block[dimension];
|
int bo = n*e2;
|
||||||
buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
buffer[off+bo+b]=compress(rhs._odata[so+o+b]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
int bo=0;
|
int bo=0;
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs._grid->_slice_stride[dimension];
|
int o = n*stride;
|
||||||
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||||
if ( ocb &cbmask ) {
|
if ( ocb &cbmask ) {
|
||||||
buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
buffer[off+bo++]=compress(rhs._odata[so+o+b]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -97,16 +97,16 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename cobj::scalar_
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
int n1=rhs._grid->_slice_stride[dimension];
|
||||||
|
int n2=rhs._grid->_slice_block[dimension];
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o=n*rhs._grid->_slice_stride[dimension];
|
int o = n*n1;
|
||||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
int offset = b+n*n2;
|
||||||
|
cobj temp =compress(rhs._odata[so+o+b]);
|
||||||
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
|
||||||
extract<cobj>(temp,pointers,offset);
|
extract<cobj>(temp,pointers,offset);
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -121,7 +121,7 @@ PARALLEL_NESTED_LOOP2
|
|||||||
int offset = b+n*rhs._grid->_slice_block[dimension];
|
int offset = b+n*rhs._grid->_slice_block[dimension];
|
||||||
|
|
||||||
if ( ocb & cbmask ) {
|
if ( ocb & cbmask ) {
|
||||||
cobj temp =compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid);
|
cobj temp =compress(rhs._odata[so+o+b]);
|
||||||
extract<cobj>(temp,pointers,offset);
|
extract<cobj>(temp,pointers,offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -243,13 +243,13 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
|
||||||
int e2=rhs._grid->_slice_block[dimension];
|
int e2=rhs._grid->_slice_block[dimension];
|
||||||
|
int stride = rhs._grid->_slice_stride[dimension];
|
||||||
if(cbmask == 0x3 ){
|
if(cbmask == 0x3 ){
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
int o =n*stride+b;
|
||||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
|
||||||
}
|
}
|
||||||
@ -259,7 +259,7 @@ PARALLEL_NESTED_LOOP2
|
|||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o =n*rhs._grid->_slice_stride[dimension]+b;
|
int o =n*stride+b;
|
||||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
//lhs._odata[lo+o]=rhs._odata[ro+o];
|
||||||
@ -285,11 +285,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
|
|
||||||
int e1=rhs._grid->_slice_nblock[dimension];
|
int e1=rhs._grid->_slice_nblock[dimension];
|
||||||
int e2=rhs._grid->_slice_block [dimension];
|
int e2=rhs._grid->_slice_block [dimension];
|
||||||
|
int stride = rhs._grid->_slice_stride[dimension];
|
||||||
PARALLEL_NESTED_LOOP2
|
PARALLEL_NESTED_LOOP2
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
|
|
||||||
int o =n*rhs._grid->_slice_stride[dimension];
|
int o =n*stride;
|
||||||
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
|
||||||
if ( ocb&cbmask ) {
|
if ( ocb&cbmask ) {
|
||||||
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
|
||||||
@ -323,6 +324,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
|
|||||||
int rd = grid->_rdimensions[dimension];
|
int rd = grid->_rdimensions[dimension];
|
||||||
int ld = grid->_ldimensions[dimension];
|
int ld = grid->_ldimensions[dimension];
|
||||||
int gd = grid->_gdimensions[dimension];
|
int gd = grid->_gdimensions[dimension];
|
||||||
|
int ly = grid->_simd_layout[dimension];
|
||||||
|
|
||||||
// Map to always positive shift modulo global full dimension.
|
// Map to always positive shift modulo global full dimension.
|
||||||
shift = (shift+fd)%fd;
|
shift = (shift+fd)%fd;
|
||||||
@ -331,6 +333,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
|
|||||||
// the permute type
|
// the permute type
|
||||||
int permute_dim =grid->PermuteDim(dimension);
|
int permute_dim =grid->PermuteDim(dimension);
|
||||||
int permute_type=grid->PermuteType(dimension);
|
int permute_type=grid->PermuteType(dimension);
|
||||||
|
int permute_type_dist;
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
@ -342,15 +345,31 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
|
|||||||
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
|
// FIXME : This must change where we have a
|
||||||
|
// Rotate slice.
|
||||||
|
|
||||||
|
// Document how this works ; why didn't I do this when I first wrote it...
|
||||||
|
// wrap is whether sshift > rd.
|
||||||
|
// num is sshift mod rd.
|
||||||
|
//
|
||||||
int permute_slice=0;
|
int permute_slice=0;
|
||||||
if(permute_dim){
|
if(permute_dim){
|
||||||
int wrap = sshift/rd;
|
int wrap = sshift/rd;
|
||||||
int num = sshift%rd;
|
int num = sshift%rd;
|
||||||
|
|
||||||
if ( x< rd-num ) permute_slice=wrap;
|
if ( x< rd-num ) permute_slice=wrap;
|
||||||
else permute_slice = 1-wrap;
|
else permute_slice = (wrap+1)%ly;
|
||||||
|
|
||||||
|
if ( (ly>2) && (permute_slice) ) {
|
||||||
|
assert(permute_type & RotateBit);
|
||||||
|
permute_type_dist = permute_type|permute_slice;
|
||||||
|
} else {
|
||||||
|
permute_type_dist = permute_type;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
|
}
|
||||||
|
|
||||||
|
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
|
||||||
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||||
|
|
||||||
|
|
||||||
|
@ -191,8 +191,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
int words = sizeof(vobj)/sizeof(vector_type);
|
int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
std::vector<Vector<scalar_object> > send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
|
||||||
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
std::vector<Vector<scalar_object> > recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) );
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
std::vector<scalar_object *> pointers(Nsimd); //
|
std::vector<scalar_object *> pointers(Nsimd); //
|
||||||
|
@@ -55,7 +55,13 @@ extern int GridCshiftPermuteMap[4][16];
 // Basic expressions used in Expression Template
 ////////////////////////////////////////////////

-class LatticeBase {};
+class LatticeBase
+{
+public:
+virtual ~LatticeBase(void) = default;
+GridBase *_grid;
+};

 class LatticeExpressionBase {};

 template<class T> using Vector = std::vector<T,alignedAllocator<T> >; // Aligned allocator??
@@ -88,8 +94,6 @@ template<class vobj>
 class Lattice : public LatticeBase
 {
 public:

-GridBase *_grid;
 int checkerboard;
 Vector<vobj> _odata;
@@ -177,8 +181,8 @@ PARALLEL_FOR_LOOP
 }
 //GridFromExpression is tricky to do
 template<class Op,class T1>
-Lattice(const LatticeUnaryExpression<Op,T1> & expr): _grid(nullptr){
+Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
+_grid = nullptr;
 GridFromExpression(_grid,expr);
 assert(_grid!=nullptr);

@@ -199,7 +203,8 @@ PARALLEL_FOR_LOOP
 }
 };
 template<class Op,class T1, class T2>
-Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr): _grid(nullptr){
+Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
+_grid = nullptr;
 GridFromExpression(_grid,expr);
 assert(_grid!=nullptr);

@@ -220,7 +225,8 @@ PARALLEL_FOR_LOOP
 }
 };
 template<class Op,class T1, class T2, class T3>
-Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr): _grid(nullptr){
+Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
+_grid = nullptr;
 GridFromExpression(_grid,expr);
 assert(_grid!=nullptr);

@@ -240,7 +246,8 @@ PARALLEL_FOR_LOOP
 // Constructor requires "grid" passed.
 // what about a default grid?
 //////////////////////////////////////////////////////////////////
-Lattice(GridBase *grid) : _grid(grid), _odata(_grid->oSites()) {
+Lattice(GridBase *grid) : _odata(grid->oSites()) {
+_grid = grid;
 // _odata.reserve(_grid->oSites());
 // _odata.resize(_grid->oSites());
 // std::cout << "Constructing lattice object with Grid pointer "<<_grid<<std::endl;
@@ -248,6 +255,8 @@ PARALLEL_FOR_LOOP
 checkerboard=0;
 }

+virtual ~Lattice(void) = default;

 template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
 PARALLEL_FOR_LOOP
 for(int ss=0;ss<_grid->oSites();ss++){
@@ -152,7 +152,7 @@ PARALLEL_FOR_LOOP
 // Peek a scalar object from the SIMD array
 //////////////////////////////////////////////////////////
 template<class vobj,class sobj>
-void peekLocalSite(sobj &s,Lattice<vobj> &l,std::vector<int> &site){
+void peekLocalSite(sobj &s,const Lattice<vobj> &l,std::vector<int> &site){

 GridBase *grid=l._grid;

@@ -152,7 +152,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 assert(grid!=NULL);

 // FIXME
-std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;
+// std::cout<<GridLogMessage<<"WARNING ! SliceSum is unthreaded "<<grid->SumArraySize()<<" threads "<<std::endl;

 const int Nd = grid->_ndimension;
 const int Nsimd = grid->Nsimd();
@@ -178,7 +178,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 // sum over reduced dimension planes, breaking out orthog dir

 for(int ss=0;ss<grid->oSites();ss++){
-GridBase::CoorFromIndex(coor,ss,grid->_rdimensions);
+Lexicographic::CoorFromIndex(coor,ss,grid->_rdimensions);
 int r = coor[orthogdim];
 lvSum[r]=lvSum[r]+Data._odata[ss];
 }
@@ -75,7 +75,7 @@ namespace Grid {

 std::seed_seq src;

-fixedSeed(std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};
+fixedSeed(const std::vector<int> &seeds) : src(seeds.begin(),seeds.end()) {};

 result_type operator () (void){

@@ -122,6 +122,7 @@ namespace Grid {
 std::vector<RngEngine> _generators;
 std::vector<std::uniform_real_distribution<RealD>> _uniform;
 std::vector<std::normal_distribution<RealD>> _gaussian;
+std::vector<std::discrete_distribution<int32_t>> _bernoulli;

 void GetState(std::vector<RngStateType> & saved,int gen) {
 saved.resize(RngStateCount);
@@ -161,6 +162,7 @@ namespace Grid {
 _generators.resize(1);
 _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
 _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
+_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
 _seeded=0;
 }

@@ -242,7 +244,7 @@ namespace Grid {
 std::random_device rd;
 Seed(rd);
 }
-void SeedFixedIntegers(std::vector<int> &seeds){
+void SeedFixedIntegers(const std::vector<int> &seeds){
 fixedSeed src(seeds);
 Seed(src);
 }
@@ -266,6 +268,7 @@ namespace Grid {
 _generators.resize(_vol);
 _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
 _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
+_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
 _seeded=0;
 }

@@ -354,7 +357,7 @@ PARALLEL_FOR_LOOP
 std::random_device rd;
 Seed(rd);
 }
-void SeedFixedIntegers(std::vector<int> &seeds){
+void SeedFixedIntegers(const std::vector<int> &seeds){
 fixedSeed src(seeds);
 Seed(src);
 }
@@ -369,13 +372,21 @@ PARALLEL_FOR_LOOP
 rng.fill(l,rng._gaussian);
 }

+template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){
+rng.fill(l,rng._bernoulli);
+}

 template <class sobj> inline void random(GridSerialRNG &rng,sobj &l){
 rng.fill(l,rng._uniform);
 }

 template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l){
 rng.fill(l,rng._gaussian);
 }

+template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){
+rng.fill(l,rng._bernoulli);
+}

 }
 #endif
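Note: the new _bernoulli member is a std::discrete_distribution initialised with the two weights {1,1}, so each draw returns 0 or 1 with equal probability. A minimal standalone sketch of that behaviour; the engine and seed are illustrative only, not the library's RngEngine:

// Coin-flip behaviour of std::discrete_distribution<int32_t>{1,1}.
#include <cstdio>
#include <random>

int main(void) {
  std::mt19937 engine(42);                              // any engine; fixed seed for reproducibility
  std::discrete_distribution<int32_t> bernoulli{1, 1};  // equal weights -> values 0 or 1
  int counts[2] = {0, 0};
  for (int i = 0; i < 10000; i++) counts[bernoulli(engine)]++;
  std::printf("zeros=%d ones=%d\n", counts[0], counts[1]);
  return 0;
}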
@@ -115,9 +115,9 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
 int sc;
 std::vector<int> coor_c(_ndimension);
 std::vector<int> coor_f(_ndimension);
-GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

 for(int i=0;i<nbasis;i++) {

@@ -160,9 +160,9 @@ PARALLEL_FOR_LOOP
 std::vector<int> coor_c(_ndimension);
 std::vector<int> coor_f(_ndimension);

-GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

 // z = A x + y
 fineZ._odata[sf]=coarseA._odata[sc]*fineX._odata[sf]+fineY._odata[sf];
@@ -225,9 +225,9 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 std::vector<int> coor_c(_ndimension);
 std::vector<int> coor_f(_ndimension);

-GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

 coarseData._odata[sc]=coarseData._odata[sc]+fineData._odata[sf];

@@ -311,9 +311,9 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 std::vector<int> coor_c(_ndimension);
 std::vector<int> coor_f(_ndimension);

-GridBase::CoorFromIndex(coor_f,sf,fine->_rdimensions);
+Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
 for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
-GridBase::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
+Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);

 for(int i=0;i<nbasis;i++) {
 if(i==0) fineData._odata[sf]=coarseData._odata[sc](i) * Basis[i]._odata[sf];
@@ -325,6 +325,126 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,

 }

+// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
+// Simd layouts need not match since we use peek/poke Local
+template<class vobj,class vvobj>
+void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
+{
+typedef typename vobj::scalar_object sobj;
+typedef typename vvobj::scalar_object ssobj;
+
+sobj s;
+ssobj ss;
+
+GridBase *ig = in._grid;
+GridBase *og = out._grid;
+
+int ni = ig->_ndimension;
+int no = og->_ndimension;
+
+assert(ni == no);
+
+for(int d=0;d<no;d++){
+assert(ig->_processors[d] == og->_processors[d]);
+assert(ig->_ldimensions[d] == og->_ldimensions[d]);
+}
+
+PARALLEL_FOR_LOOP
+for(int idx=0;idx<ig->lSites();idx++){
+std::vector<int> lcoor(ni);
+ig->LocalIndexToLocalCoor(idx,lcoor);
+peekLocalSite(s,in,lcoor);
+ss=s;
+pokeLocalSite(ss,out,lcoor);
+}
+}
+
+
+template<class vobj>
+void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
+{
+typedef typename vobj::scalar_object sobj;
+sobj s;
+
+GridBase *lg = lowDim._grid;
+GridBase *hg = higherDim._grid;
+int nl = lg->_ndimension;
+int nh = hg->_ndimension;
+
+assert(nl+1 == nh);
+assert(orthog<nh);
+assert(orthog>=0);
+assert(hg->_processors[orthog]==1);
+
+int dl; dl = 0;
+for(int d=0;d<nh;d++){
+if ( d != orthog) {
+assert(lg->_processors[dl] == hg->_processors[d]);
+assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+dl++;
+}
+}
+
+// the above should guarantee that the operations are local
+PARALLEL_FOR_LOOP
+for(int idx=0;idx<lg->lSites();idx++){
+std::vector<int> lcoor(nl);
+std::vector<int> hcoor(nh);
+lg->LocalIndexToLocalCoor(idx,lcoor);
+dl=0;
+hcoor[orthog] = slice;
+for(int d=0;d<nh;d++){
+if ( d!=orthog ) {
+hcoor[d]=lcoor[dl++];
+}
+}
+peekLocalSite(s,lowDim,lcoor);
+pokeLocalSite(s,higherDim,hcoor);
+}
+}
+
+template<class vobj>
+void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, int orthog)
+{
+typedef typename vobj::scalar_object sobj;
+sobj s;
+
+GridBase *lg = lowDim._grid;
+GridBase *hg = higherDim._grid;
+int nl = lg->_ndimension;
+int nh = hg->_ndimension;
+
+assert(nl+1 == nh);
+assert(orthog<nh);
+assert(orthog>=0);
+assert(hg->_processors[orthog]==1);
+
+int dl; dl = 0;
+for(int d=0;d<nh;d++){
+if ( d != orthog) {
+assert(lg->_processors[dl] == hg->_processors[d]);
+assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+dl++;
+}
+}
+// the above should guarantee that the operations are local
+PARALLEL_FOR_LOOP
+for(int idx=0;idx<lg->lSites();idx++){
+std::vector<int> lcoor(nl);
+std::vector<int> hcoor(nh);
+lg->LocalIndexToLocalCoor(idx,lcoor);
+dl=0;
+hcoor[orthog] = slice;
+for(int d=0;d<nh;d++){
+if ( d!=orthog ) {
+hcoor[d]=lcoor[dl++];
+}
+}
+peekLocalSite(s,higherDim,hcoor);
+pokeLocalSite(s,lowDim,lcoor);
+}
+
+}
+
 template<class vobj>
 void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
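Note: both InsertSlice and ExtractSlice rely on the same coordinate mapping, embedding the lower-dimensional local coordinate into the higher-dimensional one with the orthogonal direction pinned to the slice index. A standalone C++ sketch of just that mapping; the values of nh, orthog, slice and lcoor are invented for illustration:

// Embedding an (nh-1)-dimensional coordinate into an nh-dimensional one.
#include <cstdio>
#include <vector>

int main(void) {
  const int nh = 4, orthog = 3, slice = 2;   // assumed dimensions and slice choice
  std::vector<int> lcoor = {1, 2, 3};        // nl = nh-1 local coordinate
  std::vector<int> hcoor(nh);
  int dl = 0;
  hcoor[orthog] = slice;                     // pin the orthogonal direction
  for (int d = 0; d < nh; d++) {
    if (d != orthog) hcoor[d] = lcoor[dl++]; // copy the remaining directions in order
  }
  std::printf("hcoor = {%d,%d,%d,%d}\n", hcoor[0], hcoor[1], hcoor[2], hcoor[3]);
  return 0;
}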
@@ -146,7 +146,7 @@ class BinaryIO {
 csum = 0;
 std::vector<int> lcoor;
 for(int l=0;l<grid->lSites();l++){
-grid->CoorFromIndex(lcoor,l,grid->_ldimensions);
+Lexicographic::CoorFromIndex(lcoor,l,grid->_ldimensions);
 peekLocalSite(siteObj,lat,lcoor);
 munge(siteObj,fileObj,csum);
 }
@@ -168,6 +168,7 @@ class BinaryIO {
 GridBase *grid = Umu._grid;

 std::cout<< GridLogMessage<< "Serial read I/O "<< file<< std::endl;
+GridStopWatch timer; timer.Start();

 int ieee32big = (format == std::string("IEEE32BIG"));
 int ieee32 = (format == std::string("IEEE32"));
@@ -182,6 +183,7 @@ class BinaryIO {

 Umu = zero;
 uint32_t csum=0;
+uint64_t bytes=0;
 fobj file_object;
 sobj munged;

@@ -194,7 +196,7 @@ class BinaryIO {

 if ( grid->IsBoss() ) {
 fin.read((char *)&file_object,sizeof(file_object));
+bytes += sizeof(file_object);
 if(ieee32big) be32toh_v((void *)&file_object,sizeof(file_object));
 if(ieee32) le32toh_v((void *)&file_object,sizeof(file_object));
 if(ieee64big) be64toh_v((void *)&file_object,sizeof(file_object));
@@ -205,6 +207,10 @@ class BinaryIO {
 // The boss who read the file has their value poked
 pokeSite(munged,Umu,site);
 }}}}
+timer.Stop();
+std::cout<<GridLogPerformance<<"readObjectSerial: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+<< (double)bytes/ (double)timer.useconds() <<" MB/s " <<std::endl;

 return csum;
 }

@@ -224,13 +230,14 @@ class BinaryIO {
 // Serialise through node zero
 //////////////////////////////////////////////////
 std::cout<< GridLogMessage<< "Serial write I/O "<< file<<std::endl;
+GridStopWatch timer; timer.Start();

 std::ofstream fout;
 if ( grid->IsBoss() ) {
 fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 fout.seekp(offset);
 }
+uint64_t bytes=0;
 uint32_t csum=0;
 fobj file_object;
 sobj unmunged;
@@ -253,9 +260,14 @@ class BinaryIO {
 if(ieee64big) htobe64_v((void *)&file_object,sizeof(file_object));
 if(ieee64) htole64_v((void *)&file_object,sizeof(file_object));

+// NB could gather an xstrip as an optimisation.
 fout.write((char *)&file_object,sizeof(file_object));
+bytes+=sizeof(file_object);
 }
 }}}}
+timer.Stop();
+std::cout<<GridLogPerformance<<"writeObjectSerial: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;

 return csum;
 }
@@ -265,6 +277,7 @@ class BinaryIO {
 typedef typename GridSerialRNG::RngStateType RngStateType;
 const int RngStateCount = GridSerialRNG::RngStateCount;


 GridBase *grid = parallel._grid;
 int gsites = grid->_gsites;

@@ -310,7 +323,7 @@ class BinaryIO {
 Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 fout.write((char *)&saved[0],bytes);
 }
+grid->Broadcast(0,(void *)&csum,sizeof(csum));
 return csum;
 }
 static inline uint32_t readRNGSerial(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file,int offset)
@@ -360,6 +373,8 @@ class BinaryIO {
 Uint32Checksum((uint32_t *)&saved[0],bytes,csum);
 }

+grid->Broadcast(0,(void *)&csum,sizeof(csum));

 return csum;
 }
@@ -426,6 +441,9 @@ class BinaryIO {
 std::cout << std::endl;
 }

+GridStopWatch timer; timer.Start();
+uint64_t bytes=0;

 int myrank = grid->ThisRank();
 int iorank = grid->RankFromProcessorCoor(ioproc);

@@ -439,9 +457,9 @@ class BinaryIO {
 // available (how short sighted is that?)
 //////////////////////////////////////////////////////////
 Umu = zero;
-uint32_t csum=0;
+static uint32_t csum=0;
 fobj fileObj;
-sobj siteObj;
+static sobj siteObj; // Static to place in symmetric region for SHMEM

 // need to implement these loops in Nd independent way with a lexico conversion
 for(int tlex=0;tlex<slice_vol;tlex++){
@@ -451,7 +469,7 @@ class BinaryIO {
 std::vector<int> lsite(nd);
 std::vector<int> iosite(nd);

-grid->CoorFromIndex(tsite,tlex,range);
+Lexicographic::CoorFromIndex(tsite,tlex,range);

 for(int d=0;d<nd;d++){
 lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
@@ -472,6 +490,7 @@ class BinaryIO {

 fin.seekg(offset+g_idx*sizeof(fileObj));
 fin.read((char *)&fileObj,sizeof(fileObj));
+bytes+=sizeof(fileObj);

 if(ieee32big) be32toh_v((void *)&fileObj,sizeof(fileObj));
 if(ieee32) le32toh_v((void *)&fileObj,sizeof(fileObj));
@@ -480,22 +499,28 @@ class BinaryIO {

 munge(fileObj,siteObj,csum);

-if ( rank != myrank ) {
-grid->SendTo((void *)&siteObj,rank,sizeof(siteObj));
-} else {
-pokeLocalSite(siteObj,Umu,lsite);
 }

-} else {
-if ( myrank == rank ) {
-grid->RecvFrom((void *)&siteObj,iorank,sizeof(siteObj));
-pokeLocalSite(siteObj,Umu,lsite);
+// Possibly do transport through pt2pt
+if ( rank != iorank ) {
+if ( (myrank == rank) || (myrank==iorank) ) {
+grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,iorank,rank,sizeof(siteObj));
 }
 }
+// Poke at destination
+if ( myrank == rank ) {
+pokeLocalSite(siteObj,Umu,lsite);
+}
 grid->Barrier(); // necessary?
 }

 grid->GlobalSum(csum);
+grid->GlobalSum(bytes);
+grid->Barrier();

+timer.Stop();
+std::cout<<GridLogPerformance<<"readObjectParallel: read "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;

 return csum;
 }
@@ -530,7 +555,7 @@ class BinaryIO {

 for(int d=0;d<grid->_ndimension;d++) {

-if ( d==0 ) parallel[d] = 0;
+if ( d!= grid->_ndimension-1 ) parallel[d] = 0;

 if (parallel[d]) {
 range[d] = grid->_ldimensions[d];
@@ -559,6 +584,9 @@ class BinaryIO {
 std::cout << std::endl;
 }

+GridStopWatch timer; timer.Start();
+uint64_t bytes=0;

 int myrank = grid->ThisRank();
 int iorank = grid->RankFromProcessorCoor(ioproc);

@@ -577,9 +605,9 @@ class BinaryIO {

 uint32_t csum=0;
 fobj fileObj;
-sobj siteObj;
+static sobj siteObj; // static for SHMEM target; otherwise dynamic allocate with AlignedAllocator


+// should aggregate a whole chunk and then write.
 // need to implement these loops in Nd independent way with a lexico conversion
 for(int tlex=0;tlex<slice_vol;tlex++){

@@ -588,7 +616,7 @@ class BinaryIO {
 std::vector<int> lsite(nd);
 std::vector<int> iosite(nd);

-grid->CoorFromIndex(tsite,tlex,range);
+Lexicographic::CoorFromIndex(tsite,tlex,range);

 for(int d=0;d<nd;d++){
 lsite[d] = tsite[d]%grid->_ldimensions[d]; // local site
@@ -606,13 +634,21 @@ class BinaryIO {
 ////////////////////////////////
 // iorank writes from the seek
 ////////////////////////////////
-if (myrank == iorank) {

-if ( rank != myrank ) {
-grid->RecvFrom((void *)&siteObj,rank,sizeof(siteObj));
-} else {
+// Owner of data peeks it
 peekLocalSite(siteObj,Umu,lsite);

+// Pair of nodes may need to do pt2pt send
+if ( rank != iorank ) { // comms is necessary
+if ( (myrank == rank) || (myrank==iorank) ) { // and we have to do it
+// Send to IOrank
+grid->SendRecvPacket((void *)&siteObj,(void *)&siteObj,rank,iorank,sizeof(siteObj));
 }
+}

+grid->Barrier(); // necessary?

+if (myrank == iorank) {

 munge(siteObj,fileObj,csum);

@@ -623,17 +659,16 @@ class BinaryIO {

 fout.seekp(offset+g_idx*sizeof(fileObj));
 fout.write((char *)&fileObj,sizeof(fileObj));
+bytes+=sizeof(fileObj);
-} else {
-if ( myrank == rank ) {
-peekLocalSite(siteObj,Umu,lsite);
-grid->SendTo((void *)&siteObj,iorank,sizeof(siteObj));
 }
 }
-grid->Barrier(); // necessary// or every 16 packets to rate throttle??
-}

 grid->GlobalSum(csum);
+grid->GlobalSum(bytes);

+timer.Stop();
+std::cout<<GridLogPerformance<<"writeObjectParallel: wrote "<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+<< (double)bytes/timer.useconds() <<" MB/s " <<std::endl;

 return csum;
 }
@@ -213,37 +213,38 @@ class NerscIO : public BinaryIO {
 static inline void truncate(std::string file){
 std::ofstream fout(file,std::ios::out);
 }

+#define dump_nersc_header(field, s)\
+s << "BEGIN_HEADER" << std::endl;\
+s << "HDR_VERSION = " << field.hdr_version << std::endl;\
+s << "DATATYPE = " << field.data_type << std::endl;\
+s << "STORAGE_FORMAT = " << field.storage_format << std::endl;\
+for(int i=0;i<4;i++){\
+s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;\
+}\
+s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;\
+s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl;\
+for(int i=0;i<4;i++){\
+s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;\
+}\
+\
+s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;\
+s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl;\
+s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl;\
+s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl;\
+s << "CREATOR = " << field.creator << std::endl;\
+s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;\
+s << "CREATION_DATE = " << field.creation_date << std::endl;\
+s << "ARCHIVE_DATE = " << field.archive_date << std::endl;\
+s << "FLOATING_POINT = " << field.floating_point << std::endl;\
+s << "END_HEADER" << std::endl;

 static inline unsigned int writeHeader(NerscField &field,std::string file)
 {
 std::ofstream fout(file,std::ios::out|std::ios::in);

 fout.seekp(0,std::ios::beg);
-fout << "BEGIN_HEADER" << std::endl;
-fout << "HDR_VERSION = " << field.hdr_version << std::endl;
-fout << "DATATYPE = " << field.data_type << std::endl;
-fout << "STORAGE_FORMAT = " << field.storage_format << std::endl;
-
-for(int i=0;i<4;i++){
-fout << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ;
-}
-// just to keep the space and write it later
-fout << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl;
-fout << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl;
-for(int i=0;i<4;i++){
-fout << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl;
-}
-
-fout << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl;
-
-fout << "ENSEMBLE_ID = " << field.ensemble_id << std::endl;
-fout << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl;
-fout << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl;
-fout << "CREATOR = " << field.creator << std::endl;
-fout << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl;
-fout << "CREATION_DATE = " << field.creation_date << std::endl;
-fout << "ARCHIVE_DATE = " << field.archive_date << std::endl;
-fout << "FLOATING_POINT = " << field.floating_point << std::endl;
-fout << "END_HEADER" << std::endl;
+dump_nersc_header(field, fout);
 field.data_start = fout.tellp();
 return field.data_start;
 }
@@ -353,7 +354,7 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
 csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
 (Umu,file,Nersc3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format);
 }
-} else if ( header.data_type == std::string("4D_SU3_GAUGE_3X3") ) {
+} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
 if ( ieee32 || ieee32big ) {
 //csum=BinaryIO::readObjectSerial<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 csum=BinaryIO::readObjectParallel<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
@@ -372,6 +373,7 @@ static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,

 assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
 assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );

 assert(csum == header.checksum );

 std::cout<<GridLogMessage <<"Read NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
@@ -419,6 +421,7 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu
 std::string file1 = file+"para";
 int offset1 = writeHeader(header,file1);
 int csum1=BinaryIO::writeObjectParallel<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);
+//int csum1=BinaryIO::writeObjectSerial<vobj,fobj2D>(Umu,file1,munge,offset,header.floating_point);


 std::cout << GridLogMessage << " TESTING PARALLEL WRITE offsets " << offset1 << " "<< offset << std::endl;
@@ -429,11 +432,12 @@ static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu

 } else {
 header.floating_point = std::string("IEEE64BIG");
-header.data_type = std::string("4D_SU3_GAUGE_3X3");
+header.data_type = std::string("4D_SU3_GAUGE_3x3");
 NerscSimpleUnmunger<fobj3D,sobj> munge;
 BinaryIO::Uint32Checksum<vobj,fobj3D>(Umu, munge,header.checksum);
 offset = writeHeader(header,file);
-csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
+// csum=BinaryIO::writeObjectSerial<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
+csum=BinaryIO::writeObjectParallel<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point);
 }

 std::cout<<GridLogMessage <<"Written NERSC Configuration "<<file<< " checksum "<<std::hex<<csum<< std::dec<<" plaq "<< header.plaquette <<std::endl;
@@ -507,6 +511,8 @@ static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel
 // munger is a function of <floating point, Real, data_type>
 uint32_t csum=BinaryIO::readRNGSerial(serial,parallel,file,offset);

+std::cerr<<" Csum "<< csum << " "<< header.checksum <<std::endl;

 assert(csum == header.checksum );

 std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
@@ -90,7 +90,7 @@ namespace QCD {
 template<typename vtype> using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
 template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;

-template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
+template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;

 // Spin matrix
@@ -383,7 +383,6 @@ namespace QCD {
 //////////////////////////////////////////////
 // Poke scalars
 //////////////////////////////////////////////

 template<class vobj> void pokeSpin(vobj &lhs,const decltype(peekIndex<SpinIndex>(lhs,0)) & rhs,int i)
 {
 pokeIndex<SpinIndex>(lhs,rhs,i);
@@ -407,6 +406,40 @@ namespace QCD {
 pokeIndex<LorentzIndex>(lhs,rhs,i);
 }

+//////////////////////////////////////////////
+// Fermion <-> propagator assignements
+//////////////////////////////////////////////
+template <class Prop, class Ferm>
+void FermToProp(Prop &p, const Ferm &f, const int s, const int c)
+{
+for(int j = 0; j < Ns; ++j)
+{
+auto pjs = peekSpin(p, j, s);
+auto fj = peekSpin(f, j);
+
+for(int i = 0; i < Nc; ++i)
+{
+pokeColour(pjs, peekColour(fj, i), i, c);
+}
+pokeSpin(p, pjs, j, s);
+}
+}
+
+template <class Prop, class Ferm>
+void PropToFerm(Ferm &f, const Prop &p, const int s, const int c)
+{
+for(int j = 0; j < Ns; ++j)
+{
+auto pjs = peekSpin(p, j, s);
+auto fj = peekSpin(f, j);
+
+for(int i = 0; i < Nc; ++i)
+{
+pokeColour(fj, peekColour(pjs, i, c), i);
+}
+pokeSpin(f, fj, j);
+}
+}
+
 //////////////////////////////////////////////
 // transpose array and scalar
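Note: FermToProp above copies a fermion (a spin-colour vector at each site) into the (s,c) column of a propagator, and PropToFerm is its inverse. A standalone C++ sketch of the index bookkeeping only; the flat arrays and their layout are stand-ins for the tensor types used in the code, not the library's storage:

// Index layout sketch: a propagator is a (Ns*Nc) x (Ns*Nc) matrix per site;
// FermToProp fills the column labelled by (s,c) from a (Ns*Nc) fermion vector.
#include <cstdio>
#include <vector>

int main(void) {
  const int Ns = 4, Nc = 3;                         // spin and colour ranges, as in the loops above
  std::vector<double> ferm(Ns * Nc, 1.0);           // stand-in fermion at one site
  std::vector<double> prop(Ns * Nc * Ns * Nc, 0.0); // stand-in propagator at one site
  const int s = 1, c = 2;                           // column being filled (illustrative)
  for (int j = 0; j < Ns; j++)
    for (int i = 0; i < Nc; i++)
      prop[((j * Nc + i) * Ns + s) * Nc + c] = ferm[j * Nc + i];
  std::printf("filled column (s=%d,c=%d) with %zu entries\n", s, c, ferm.size());
  return 0;
}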
@@ -113,6 +113,8 @@ typedef SymanzikGaugeAction<ConjugateGimplD> ConjugateSymanzikGaugeAction
 template class A<GparityWilsonImplF>; \
 template class A<GparityWilsonImplD>;

+#define GparityFermOpTemplateInstantiate(A)

 ////////////////////////////////////////////
 // Fermion operators / actions
 ////////////////////////////////////////////
@@ -208,6 +210,14 @@ typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
 typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
 typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;

+typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
+typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
+typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
+typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
+typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
+typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;

 }}
 ///////////////////////////////////////////////////////////////////////////////
 // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
@@ -527,6 +527,7 @@ namespace QCD {
 }

 FermOpTemplateInstantiate(CayleyFermion5D);
+GparityFermOpTemplateInstantiate(CayleyFermion5D);

 }}
@@ -130,7 +130,7 @@ namespace Grid {

 typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
 typedef WilsonImplParams ImplParams;
-typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;

 ImplParams Params;

@@ -142,6 +142,10 @@ namespace Grid {
 mult(&phi(),&U(mu),&chi());
 }

+template<class ref>
+inline void loadLinkElement(Simd & reg,ref &memory){
+reg = memory;
+}
 inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 {
 conformable(Uds._grid,GaugeGrid);
@@ -181,6 +185,100 @@ PARALLEL_FOR_LOOP

 };

+///////
+// Single flavour four spinors with colour index, 5d redblack
+///////
+template<class S,int Nrepresentation=Nc>
+class DomainWallRedBlack5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
+public:
+
+typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl;
+
+INHERIT_GIMPL_TYPES(Gimpl);
+
+template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
+template<typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
+template<typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >;
+template<typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd >;
+template<typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
+
+typedef iImplSpinor <Simd> SiteSpinor;
+typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
+typedef Lattice<SiteSpinor> FermionField;
+
+// Make the doubled gauge field a *scalar*
+typedef iImplDoubledGaugeField<typename Simd::scalar_type> SiteDoubledGaugeField; // This is a scalar
+typedef iImplGaugeField<typename Simd::scalar_type> SiteScalarGaugeField; // scalar
+typedef iImplGaugeLink <typename Simd::scalar_type> SiteScalarGaugeLink; // scalar
+
+typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
+
+typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
+typedef WilsonImplParams ImplParams;
+typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;
+
+ImplParams Params;
+
+DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {};
+
+bool overlapCommsCompute(void) { return false; };
+
+template<class ref>
+inline void loadLinkElement(Simd & reg,ref &memory){
+vsplat(reg,memory);
+}
+inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
+{
+SiteGaugeLink UU;
+for(int i=0;i<Nrepresentation;i++){
+for(int j=0;j<Nrepresentation;j++){
+vsplat(UU()()(i,j),U(mu)()(i,j));
+}
+}
+mult(&phi(),&UU(),&chi());
+}
+
+inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
+{
+SiteScalarGaugeField ScalarUmu;
+SiteDoubledGaugeField ScalarUds;
+
+GaugeLinkField U (Umu._grid);
+GaugeField Uadj(Umu._grid);
+for(int mu=0;mu<Nd;mu++){
+U = PeekIndex<LorentzIndex>(Umu,mu);
+U = adj(Cshift(U,mu,-1));
+PokeIndex<LorentzIndex>(Uadj,U,mu);
+}
+
+for(int lidx=0;lidx<GaugeGrid->lSites();lidx++){
+std::vector<int> lcoor;
+GaugeGrid->LocalIndexToLocalCoor(lidx,lcoor);
+
+peekLocalSite(ScalarUmu,Umu,lcoor);
+for(int mu=0;mu<4;mu++) ScalarUds(mu) = ScalarUmu(mu);
+
+peekLocalSite(ScalarUmu,Uadj,lcoor);
+for(int mu=0;mu<4;mu++) ScalarUds(mu+4) = ScalarUmu(mu);
+
+pokeLocalSite(ScalarUds,Uds,lcoor);
+}
+
+}
+
+inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
+assert(0);
+}
+
+inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
+assert(0);
+}
+
+};
+
 ////////////////////////////////////////////////////////////////////////////////////////
 // Flavour doubled spinors; is Gparity the only? what about C*?
 ////////////////////////////////////////////////////////////////////////////////////////
@@ -205,7 +303,7 @@ PARALLEL_FOR_LOOP
 typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;

 typedef WilsonCompressor<SiteHalfSpinor,SiteSpinor> Compressor;
-typedef CartesianStencil<SiteSpinor,SiteHalfSpinor,Compressor> StencilImpl;
+typedef WilsonStencil<SiteSpinor,SiteHalfSpinor> StencilImpl;

 typedef GparityWilsonImplParams ImplParams;

@@ -379,6 +477,10 @@ PARALLEL_FOR_LOOP
 typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float
 typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double

+typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec
+typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float
+typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double

 typedef GparityWilsonImpl<vComplex ,Nc> GparityWilsonImplR; // Real.. whichever prec
 typedef GparityWilsonImpl<vComplexF,Nc> GparityWilsonImplF; // Float
 typedef GparityWilsonImpl<vComplexD,Nc> GparityWilsonImplD; // Double
@@ -48,14 +48,16 @@ namespace Grid {
 GridCartesian &FourDimGrid,
 GridRedBlackCartesian &FourDimRedBlackGrid,
 RealD _mass,RealD _M5,
-RealD scale) :
+// RealD scale):
+RealD scale,const ImplParams &p= ImplParams()) :

 // b+c=scale, b-c = 1 <=> 2b = scale+1; 2c = scale-1
 MobiusFermion<Impl>(_Umu,
 FiveDimGrid,
 FiveDimRedBlackGrid,
 FourDimGrid,
-FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
+FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0),p)
+// FourDimRedBlackGrid,_mass,_M5,0.5*(scale+1.0),0.5*(scale-1.0))
 {
 }
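Note: the comment in the hunk above records the arithmetic behind the Mobius parameters: from b+c = scale and b-c = 1 one gets b = (scale+1)/2 and c = (scale-1)/2, which is exactly what the 0.5*(scale+1.0) and 0.5*(scale-1.0) arguments pass down. A standalone C++ check with an invented scale value:

// Quick numerical check of the b,c parameterisation of the scaled Shamir kernel.
#include <cstdio>

int main(void) {
  const double scale = 2.0;             // illustrative scale factor
  const double b = 0.5 * (scale + 1.0); // b+c = scale
  const double c = 0.5 * (scale - 1.0); // b-c = 1
  std::printf("b=%g c=%g b+c=%g b-c=%g\n", b, c, b + c, b - c);
  return 0;
}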
@ -48,12 +48,7 @@ namespace QCD {
|
|||||||
mu=p;
|
mu=p;
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual SiteHalfSpinor operator () (const SiteSpinor &in,int dim,int plane,int osite,GridBase *grid) {
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
return spinproject(in);
|
|
||||||
}
|
|
||||||
|
|
||||||
SiteHalfSpinor spinproject(const SiteSpinor &in)
|
|
||||||
{
|
|
||||||
SiteHalfSpinor ret;
|
SiteHalfSpinor ret;
|
||||||
int mudag=mu;
|
int mudag=mu;
|
||||||
if (!dag) {
|
if (!dag) {
|
||||||
@ -92,6 +87,173 @@ namespace QCD {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/////////////////////////
|
||||||
|
// optimised versions
|
||||||
|
/////////////////////////
|
||||||
|
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonXpCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjXp(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonYpCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjYp(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonZpCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjZp(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonTpCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjTp(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonXmCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjXm(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonYmCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjYm(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonZmCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjZm(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template<class SiteHalfSpinor,class SiteSpinor>
|
||||||
|
class WilsonTmCompressor {
|
||||||
|
public:
|
||||||
|
inline SiteHalfSpinor operator () (const SiteSpinor &in) {
|
||||||
|
SiteHalfSpinor ret;
|
||||||
|
spProjTm(ret,in);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Fast comms buffer manipulation which should inline right through (avoid direction-
// dependent logic that prevents inlining)
template<class vobj,class cobj>
class WilsonStencil : public CartesianStencil<vobj,cobj> {
public:

  WilsonStencil(GridBase *grid,
                int npoints,
                int checkerboard,
                const std::vector<int> &directions,
                const std::vector<int> &distances)
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
  { };

  template < class compressor>
  std::thread HaloExchangeOptBegin(const Lattice<vobj> &source,compressor &compress) {
    this->Mergers.resize(0);
    this->Packets.resize(0);
    this->HaloGatherOpt(source,compress);
    return std::thread([&] { this->Communicate(); });
  }

  template < class compressor>
  void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
  {
    auto thr = this->HaloExchangeOptBegin(source,compress);
    this->HaloExchangeOptComplete(thr);
  }

  void HaloExchangeOptComplete(std::thread &thr)
  {
    this->CommsMerge(); // spins
    this->jointime-=usecond();
    thr.join();
    this->jointime+=usecond();
  }

  template < class compressor>
  void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
  {
    // conformable(source._grid,_grid);
    assert(source._grid==this->_grid);
    this->halogtime-=usecond();

    assert (this->comm_buf.size() == this->_unified_buffer_size );
    this->u_comm_offset=0;

    // The dagger hop gathers in the natural direction order; the non-dagger
    // hop pairs each projector with the opposite gather direction.
    int dag = compress.dag;
    static std::vector<int> dirs(Nd*2);
    for(int mu=0;mu<Nd;mu++){
      if ( dag ) {
        dirs[mu]   =mu;
        dirs[mu+4] =mu+Nd;
      } else {
        dirs[mu]   =mu+Nd;
        dirs[mu+Nd]=mu;
      }
    }

    WilsonXpCompressor<cobj,vobj> XpCompress;
    this->HaloGatherDir(source,XpCompress,dirs[0]);

    WilsonYpCompressor<cobj,vobj> YpCompress;
    this->HaloGatherDir(source,YpCompress,dirs[1]);

    WilsonZpCompressor<cobj,vobj> ZpCompress;
    this->HaloGatherDir(source,ZpCompress,dirs[2]);

    WilsonTpCompressor<cobj,vobj> TpCompress;
    this->HaloGatherDir(source,TpCompress,dirs[3]);

    WilsonXmCompressor<cobj,vobj> XmCompress;
    this->HaloGatherDir(source,XmCompress,dirs[4]);

    WilsonYmCompressor<cobj,vobj> YmCompress;
    this->HaloGatherDir(source,YmCompress,dirs[5]);

    WilsonZmCompressor<cobj,vobj> ZmCompress;
    this->HaloGatherDir(source,ZmCompress,dirs[6]);

    WilsonTmCompressor<cobj,vobj> TmCompress;
    this->HaloGatherDir(source,TmCompress,dirs[7]);

    assert(this->u_comm_offset==this->_unified_buffer_size);
    this->halogtime+=usecond();
  }

};
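The Begin/Complete split above is what buys communication/computation overlap: the gather runs up front, the MPI traffic runs on its own thread, and the caller joins only when the halo is actually needed. A minimal standalone sketch of the same pattern with std::thread and placeholder phases (none of these functions are Grid API; they only mark where the real gather, comms and compute would go):

#include <thread>
#include <vector>

// Hypothetical stand-ins for the gather, comms and compute phases.
void gather_faces(std::vector<double> &sendbuf) { /* pack surface sites      */ }
void communicate (std::vector<double> &sendbuf) { /* send/receive and wait   */ }
void compute_interior()                         { /* stencil on bulk sites   */ }
void compute_exterior()                         { /* stencil on surface data */ }

int main(void) {
  std::vector<double> sendbuf(1024, 0.0);

  gather_faces(sendbuf);                             // like HaloGatherOpt
  std::thread comms([&] { communicate(sendbuf); });  // like HaloExchangeOptBegin
  compute_interior();                                // overlapped local work
  comms.join();                                      // like HaloExchangeOptComplete
  compute_exterior();                                // needs the received halo
  return 0;
}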
}} // namespace close
#endif
@@ -64,7 +64,9 @@ namespace QCD {
template<class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
  Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
  GaugeField HUmu(_Umu._grid);
  HUmu = _Umu*(-0.5);
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
}
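Worth spelling out why the factor appears here: ImportGauge now pre-scales the doubled links by -0.5, and the matching result*(-0.5) factors are dropped from the site kernels later in this diff, so the scaling is applied once per link instead of once per output site. In the usual convention (the projector signs depend on the gamma basis, hence the hedged signs) the Wilson hopping term this factor belongs to reads

  D_{\rm hop}\,\psi(x) = -\tfrac{1}{2}\sum_{\mu}\Big[(1\mp\gamma_\mu)\,U_\mu(x)\,\psi(x+\hat\mu) + (1\pm\gamma_\mu)\,U^\dagger_\mu(x-\hat\mu)\,\psi(x-\hat\mu)\Big],

so absorbing the -1/2 into the stored field, U_mu -> -U_mu/2, lets each kernel stream its spin-reconstructed sum directly into the output.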
@@ -286,121 +288,27 @@ PARALLEL_FOR_LOOP
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U,
                                       const FermionField &in, FermionField &out,int dag)
{
  if ( Impl::overlapCommsCompute () ) {
    DhopInternalCommsOverlapCompute(st,U,in,out,dag);
  } else {
    DhopInternalCommsThenCompute(st,U,in,out,dag);
  }
}
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
                                                       const FermionField &in, FermionField &out,int dag) {

  assert((dag==DaggerNo) ||(dag==DaggerYes));

  Compressor compressor(dag);
  st.HaloExchange(in,compressor);

  if ( dag == DaggerYes ) {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
        Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out);
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,1,1,in,out);
      }
    }
  } else {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
      }
    }
  }
};
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
                                                          const FermionField &in, FermionField &out,int dag) {

  assert((dag==DaggerNo) ||(dag==DaggerYes));

  Compressor compressor(dag);

  auto handle = st.HaloExchangeBegin(in,compressor);

  bool local    = true;
  bool nonlocal = false;
  if ( dag == DaggerYes ) {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    }
  } else {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    }
  }

  st.HaloExchangeComplete(handle);

  local    = false;
  nonlocal = true;
  if ( dag == DaggerYes ) {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    }
  } else {
    if( HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int sss=0;sss<in._grid->oSites();sss++){
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal);
      }
    }
  }

};

FermOpTemplateInstantiate(WilsonFermion);
GparityFermOpTemplateInstantiate(WilsonFermion);

}}
@@ -114,12 +114,6 @@ namespace Grid {
  void DhopInternal(StencilImpl & st,DoubledGaugeField & U,
                    const FermionField &in, FermionField &out,int dag) ;

  void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
                                    const FermionField &in, FermionField &out,int dag) ;
  void DhopInternalCommsOverlapCompute(StencilImpl & st,DoubledGaugeField & U,
                                       const FermionField &in, FermionField &out,int dag) ;

  // Constructor
  WilsonFermion(GaugeField &_Umu,
                GridCartesian &Fgrid,
@@ -38,8 +38,6 @@ namespace QCD {
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;

// 5d lattice for DWF.
template<class Impl>
@@ -67,10 +65,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FourDimGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FourDimRedBlackGrid._ndimension==4);
  assert(FiveDimRedBlackGrid._checker_dim==1);

  // Dimension zero of the five-d is the Ls direction
@@ -99,16 +95,74 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,

  // Allocate the required comms buffer
  ImportGauge(_Umu);
  alltime=0;
  commtime=0;
  jointime=0;
  dslashtime=0;
  dslash1time=0;
}

template<class Impl>
WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
                                       GridCartesian         &FiveDimGrid,
                                       GridRedBlackCartesian &FiveDimRedBlackGrid,
                                       GridCartesian         &FourDimGrid,
                                       RealD _M5,const ImplParams &p) :
  Kernels(p),
  _FiveDimGrid        (&FiveDimGrid),
  _FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
  _FourDimGrid        (&FourDimGrid),
  Stencil    (_FiveDimGrid,npoint,Even,directions,displacements),
  StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
  StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
  M5(_M5),
  Umu(_FourDimGrid),
  UmuEven(_FourDimGrid),
  UmuOdd (_FourDimGrid),
  Lebesgue(_FourDimGrid),
  LebesgueEvenOdd(_FourDimGrid)
{
  int nsimd = Simd::Nsimd();

  // some assertions
  assert(FiveDimGrid._ndimension==5);
  assert(FiveDimRedBlackGrid._ndimension==5);
  assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
  assert(FourDimGrid._ndimension==4);

  // Dimension zero of the five-d is the Ls direction
  Ls=FiveDimGrid._fdimensions[0];
  assert(FiveDimGrid._processors[0]  ==1);
  assert(FiveDimGrid._simd_layout[0] ==nsimd);

  assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
  assert(FiveDimRedBlackGrid._processors[0] ==1);
  assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);

  // Other dimensions must match the decomposition of the four-D fields
  for(int d=0;d<4;d++){
    assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
    assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);

    assert(FourDimGrid._simd_layout[d]==1);
    assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);

    assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
    assert(FiveDimGrid._processors[d+1]  ==FourDimGrid._processors[d]);
    assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
  }

  {
    GaugeField HUmu(_Umu._grid);
    HUmu = _Umu*(-0.5);
    Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
    UmuEven=Umu;// Really want a reference.
    UmuOdd =Umu;
  }
}
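A rough way to picture the layout these assertions encode: the fifth dimension is kept on a single rank and spread entirely across SIMD lanes, so the per-SIMD-word extent in s shrinks by nsimd. A toy sketch, assuming the reduced extent is full extent / (ranks x SIMD lanes), which is what the LLs = in._grid->_rdimensions[0] line in DhopInternal below appears to rely on; the numbers are made up:

#include <cassert>
#include <cstdio>

int main(void) {
  const int Ls = 16, nsimd = 8, processors = 1;   // assumed example values
  assert(Ls % (processors * nsimd) == 0);
  int rdim = Ls / (processors * nsimd);           // s-sites per SIMD word per rank
  std::printf("reduced Ls extent = %d\n", rdim);  // prints 2
  return 0;
}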
template<class Impl>
void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
{
  Impl::DoubleStore(GaugeGrid(),Umu,_Umu);
  GaugeField HUmu(_Umu._grid);
  HUmu = _Umu*(-0.5);
  Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
  pickCheckerboard(Even,UmuEven,Umu);
  pickCheckerboard(Odd ,UmuOdd,Umu);
}
@@ -232,30 +286,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
}

template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
  std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
  std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
  std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
  std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
  std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
  std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
  std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
  std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
  std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
  std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
                                        const FermionField &A,
@@ -277,280 +307,32 @@ template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                         DoubledGaugeField & U,
                                         const FermionField &in, FermionField &out,int dag)
{
  if ( Impl::overlapCommsCompute () ) {
    DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
  } else {
    DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
  }
}

template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
                                                         DoubledGaugeField & U,
                                                         const FermionField &in, FermionField &out,int dag)
{
  // assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();
  Compressor compressor(dag);

  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
  int LLs = in._grid->_rdimensions[0];

  int threads = GridThread::GetThreads();
  st.HaloExchange(in,compressor);
  int HT = GridThread::GetHyperThreads();
  int cores = GridThread::GetCores();
  int nwork = U._grid->oSites();

  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
  st.HaloExchangeComplete(handle);
  commtime +=usecond();

  jointime -=usecond();
  jointime +=usecond();

  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
  // Designed to create
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
#pragma omp parallel for schedule(static)
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        {
          int sd;
          for(sd=0;sd<Ls;sd++){
            int sU=ss;
            int sF = sd+Ls*sU;
            int sF=LLs*sU;
            Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
            Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
          }
        }
      }
    }
  } else {
    if( this->AsmOptDslash ) {
      // for(int i=0;i<1;i++){
      // for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
      // PerformanceCounter Counter(i);
      // Counter.Start();
#pragma omp parallel for
      for(int t=0;t<threads;t++){
        int hyperthread = t%HT;
        int core = t/HT;
        int sswork, swork,soff,ssoff, sU,sF;
        GridThread::GetWork(nwork,core,sswork,ssoff,cores);
        GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
        for(int ss=0;ss<sswork;ss++){
          for(int s=soff;s<soff+swork;s++){
            sU=ss+ ssoff;
            if ( LebesgueOrder::UseLebesgueOrder ) {
              sU = lo.Reorder(sU);
            }
            sF = s+Ls*sU;
            Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out,(uint64_t *)0);// &buf[0]
          }
        }
      }
      // Counter.Stop();
      // Counter.Report();
      // }
    } else if( this->HandOptDslash ) {
      /*
#pragma omp parallel for schedule(static)
      for(int t=0;t<threads;t++){
        int hyperthread = t%HT;
        int core = t/HT;
        int sswork, swork,soff,ssoff, sU,sF;
        GridThread::GetWork(nwork,core,sswork,ssoff,cores);
        GridThread::GetWork(Ls , hyperthread, swork, soff,HT);
        for(int ss=0;ss<sswork;ss++){
          sU=ss+ ssoff;
          for(int s=soff;s<soff+swork;s++){
            sF = s+Ls*sU;
            Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
          }
        }
      }
      */
#pragma omp parallel for schedule(static)
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
        int sF=LLs*sU;
          int sF = s+Ls*sU;
        Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,LLs,1,in,out);
          Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out);
        }
      }
    }
  }
  dslashtime +=usecond();
  alltime+=usecond();
}

template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, LebesgueOrder &lo,
                                                            DoubledGaugeField & U,
                                                            const FermionField &in, FermionField &out,int dag)
{
  // assert((dag==DaggerNo) ||(dag==DaggerYes));
  alltime-=usecond();

  int calls;
  int updates;
  Compressor compressor(dag);

  // Assume balanced KMP_AFFINITY; this is forced in GridThread.h
  int threads = GridThread::GetThreads();
  int HT = GridThread::GetHyperThreads();
  int cores = GridThread::GetCores();
  int nwork = U._grid->oSites();

  commtime -=usecond();
  auto handle = st.HaloExchangeBegin(in,compressor);
  commtime +=usecond();

  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  // Not loop ordering and data layout.
  // Designed to create
  // - per thread reuse in L1 cache for U
  // - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
  bool local = true;
  bool nonlocal = false;
  dslashtime -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        {
          int sd;
          for(sd=0;sd<Ls;sd++){
            int sU=ss;
            int sF = sd+Ls*sU;
            Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
          }
        }
      }
    }
  } else {
    if( this->HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    }
  }
  dslashtime +=usecond();

  jointime -=usecond();
  st.HaloExchangeComplete(handle);
  jointime +=usecond();

  local = false;
  nonlocal = true;
  dslash1time -=usecond();
  if ( dag == DaggerYes ) {
    if( this->HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        {
          int sd;
          for(sd=0;sd<Ls;sd++){
            int sU=ss;
            int sF = sd+Ls*sU;
            Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
          }
        }
      }
    }
  } else {
    if( this->HandOptDslash ) {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    } else {
      PARALLEL_FOR_LOOP
      for(int ss=0;ss<U._grid->oSites();ss++){
        int sU=ss;
        for(int s=0;s<Ls;s++){
          int sF = s+Ls*sU;
          Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out,local,nonlocal);
        }
      }
    }
  }
  dslash1time +=usecond();
  alltime+=usecond();

}
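The rewritten loops hand a whole Ls block to one kernel call, which works because the fifth dimension is innermost in the fermion layout: the 4d outer site sU owns the contiguous index range [Ls*sU, Ls*sU+Ls). A standalone illustration of that indexing with plain ints (toy sizes and names, not Grid fields):

#include <cstdio>
#include <vector>

int main(void) {
  const int Ls = 4, fourD_sites = 3;                 // assumed toy sizes
  std::vector<int> fermion(Ls * fourD_sites);

  // Fifth dimension innermost: sF = s + Ls*sU, so one 4d site sU owns a
  // contiguous block and a kernel can be handed (sF0 = Ls*sU, Ls) directly.
  for (int sU = 0; sU < fourD_sites; sU++)
    for (int s = 0; s < Ls; s++)
      fermion[s + Ls * sU] = 100 * sU + s;

  int sU = 1, sF0 = Ls * sU;                         // like sF = LLs*sU above
  for (int s = 0; s < Ls; s++)
    std::printf("sU=%d s=%d -> sF=%d value=%d\n", sU, s, sF0 + s, fermion[sF0 + s]);
  return 0;
}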
template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
@@ -593,6 +375,9 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
}

FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D);
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;

}}

@@ -1,3 +1,4 @@

/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

@@ -48,8 +49,6 @@ namespace Grid {
class WilsonFermion5DStatic {
public:
  // S-direction is INNERMOST and takes no part in the parity.
  static int AsmOptDslash;  // these are a temporary hack
  static int HandOptDslash; // these are a temporary hack
  static const std::vector<int> directions;
  static const std::vector<int> displacements;
  const int npoint = 8;
@@ -61,11 +60,7 @@ namespace Grid {
public:
  INHERIT_IMPL_TYPES(Impl);
  typedef WilsonKernels<Impl> Kernels;
  double alltime;
  double jointime;
  double commtime;
  double dslashtime;
  double dslash1time;
  ///////////////////////////////////////////////////////////////
  // Implement the abstract base
  ///////////////////////////////////////////////////////////////
@@ -86,6 +81,7 @@ namespace Grid {
  virtual void MeooeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void MooeeDag    (const FermionField &in, FermionField &out){assert(0);};
  virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
  virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

  // These can be overridden by fancy 5d chiral action
  virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
@@ -120,19 +116,6 @@ namespace Grid {
                    FermionField &out,
                    int dag);

  void DhopInternalCommsThenCompute(StencilImpl & st,
                                    LebesgueOrder &lo,
                                    DoubledGaugeField &U,
                                    const FermionField &in,
                                    FermionField &out,
                                    int dag);
  void DhopInternalCommsOverlapCompute(StencilImpl & st,
                                       LebesgueOrder &lo,
                                       DoubledGaugeField &U,
                                       const FermionField &in,
                                       FermionField &out,
                                       int dag);

  // Constructors
  WilsonFermion5D(GaugeField &_Umu,
                  GridCartesian &FiveDimGrid,
@@ -141,14 +124,21 @@ namespace Grid {
                  GridRedBlackCartesian &FourDimRedBlackGrid,
                  double _M5,const ImplParams &p= ImplParams());

  // Constructors
  WilsonFermion5D(int simd,
                  GaugeField &_Umu,
                  GridCartesian &FiveDimGrid,
                  GridRedBlackCartesian &FiveDimRedBlackGrid,
                  GridCartesian &FourDimGrid,
                  double _M5,const ImplParams &p= ImplParams());

  // DoubleStore
  void ImportGauge(const GaugeField &_Umu);

  void Report(void);
  ///////////////////////////////////////////////////////////////
  // Data members require to support the functionality
  ///////////////////////////////////////////////////////////////
protected:
public:

  // Add these to the support from Wilson
  GridBase *_FourDimGrid;
@@ -31,440 +31,410 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
namespace QCD {

int WilsonKernelsStatic::HandOpt;
int WilsonKernelsStatic::AsmOpt;

template<class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p): Base(p) {};

// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
                                           std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                           int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{
  if ( AsmOpt ) {

    WilsonKernels<Impl>::DiracOptAsmDhopSite(st,U,buf,sF,sU,Ls,Ns,in,out);

  } else {

    for(int site=0;site<Ns;site++) {
      for(int s=0;s<Ls;s++) {
        if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSite(st,U,buf,sF,sU,in,out);
        else         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,U,buf,sF,sU,in,out);
        sF++;
      }
      sU++;
    }

  }
}

template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
                                              std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                              int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
                                              int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out)
{
  // No asm implementation yet.
  // if ( AsmOpt ) WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,U,buf,sF,sU,in,out);
  // else
  for(int site=0;site<Ns;site++) {
    for(int s=0;s<Ls;s++) {
      if (HandOpt) WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,U,buf,sF,sU,in,out);
      else         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,U,buf,sF,sU,in,out);
      sF++;
    }
    sU++;
  }
}
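DiracOptDhopSite is now a thin dispatcher: two static flags pick an assembly, hand-unrolled, or generic per-site routine, and the Ls/Ns arguments let one call sweep a block of sites. A compact sketch of the same flag-plus-block-loop pattern, with made-up routine names standing in for the asm/hand/generic kernels:

#include <cstdio>

struct KernelsStatic { static int AsmOpt; static int HandOpt; };
int KernelsStatic::AsmOpt  = 0;
int KernelsStatic::HandOpt = 1;

static void asm_block(int sF)    { std::printf("asm block from %d\n", sF); }
static void hand_site(int sF)    { std::printf("hand site %d\n", sF); }
static void generic_site(int sF) { std::printf("generic site %d\n", sF); }

// Dispatch over Ns outer sites, each carrying Ls inner (fifth-dimension) sites.
void dhop_site(int sF, int sU, int Ls, int Ns) {
  if (KernelsStatic::AsmOpt) {
    asm_block(sF);                       // asm variant handles the whole block
  } else {
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
        if (KernelsStatic::HandOpt) hand_site(sF); else generic_site(sF);
        sF++;
      }
      sU++;                              // next 4d site, mirrors the original loop
    }
  }
}

int main(void) { dhop_site(0, 0, 2, 2); return 0; }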
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////

template<class Impl>
void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
                                                     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                                     int sF,int sU,const FermionField &in, FermionField &out)
{
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
  SiteHalfSpinor Uchi;
  SiteSpinor result;
  StencilEntry *SE;
  int ptype;

  int num = 0;

  result=zero;

  ///////////////////////////
  // Xp
  ///////////////////////////
  SE=st.GetEntry(ptype,Xp,sF);

  if (local && SE->_is_local ) {
  if (SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
    chi=buf[SE->_offset];
  spReconXp(result,Uchi);
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
    accumReconXp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Yp
  ///////////////////////////
  SE=st.GetEntry(ptype,Yp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
  accumReconYp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Zp
  ///////////////////////////
  SE=st.GetEntry(ptype,Zp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
  accumReconZp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Tp
  ///////////////////////////
  SE=st.GetEntry(ptype,Tp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
  accumReconTp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Xm
  ///////////////////////////
  SE=st.GetEntry(ptype,Xm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
  accumReconXm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Ym
  ///////////////////////////
  SE=st.GetEntry(ptype,Ym,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
  accumReconYm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Zm
  ///////////////////////////
  SE=st.GetEntry(ptype,Zm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
  accumReconZm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Tm
  ///////////////////////////
  SE=st.GetEntry(ptype,Tm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
  accumReconTm(result,Uchi);
    num++;
  }

  if ( local ) {
  vstream(out._odata[sF],result);
    vstream(out._odata[sF],result*(-0.5));
  } else if ( num ) {
    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
  }
};
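The single-pass generic kernel above hinges on one pointer choice per direction: chi_p either points at the freshly projected local half-spinor or straight into the comms buffer the halo exchange filled, and everything downstream reads only through chi_p. A stripped-down sketch of that idiom, with doubles standing in for half-spinors and invented function names:

#include <cstdio>

struct Entry { bool is_local; int offset; };

// Either "project" from the local field or reuse the value the halo exchange
// already deposited in buf; downstream code only ever reads through chi_p.
double consume(const Entry &SE, const double *field, const double *buf) {
  double chi;
  const double *chi_p;
  if (SE.is_local) {
    chi   = 0.5 * field[SE.offset];   // stand-in for spProj of a local site
    chi_p = &chi;
  } else {
    chi_p = &buf[SE.offset];          // halo data arrives already projected
  }
  return 2.0 * (*chi_p);              // stand-in for multLink + accumRecon
}

int main(void) {
  double field[2] = {1.0, 3.0}, buf[2] = {7.0, 9.0};
  Entry local{true, 1}, remote{false, 0};
  std::printf("%f %f\n", consume(local, field, buf), consume(remote, field, buf));
  return 0;
}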
// Need controls to do interior, exterior, or both
template<class Impl>
void WilsonKernels<Impl>::DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
                                                  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
                                                  int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
                                                  int sF,int sU,const FermionField &in, FermionField &out)
{
  SiteHalfSpinor tmp;
  SiteHalfSpinor chi;
  SiteHalfSpinor *chi_p;
  SiteHalfSpinor Uchi;
  SiteSpinor result;
  StencilEntry *SE;
  int ptype;

  int num = 0;

  result=zero;

  ///////////////////////////
  // Xp
  ///////////////////////////
  SE=st.GetEntry(ptype,Xm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xm,SE,st);
    chi=buf[SE->_offset];
  spReconXp(result,Uchi);
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Xm,SE,st);
    accumReconXp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Yp
  ///////////////////////////
  SE=st.GetEntry(ptype,Ym,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Ym,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Ym,SE,st);
  accumReconYp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Zp
  ///////////////////////////
  SE=st.GetEntry(ptype,Zm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Zm,SE,st);
  accumReconZp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Tp
  ///////////////////////////
  SE=st.GetEntry(ptype,Tm,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTp(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTp(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tm,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Tm,SE,st);
  accumReconTp(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Xm
  ///////////////////////////
  SE=st.GetEntry(ptype,Xp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjXm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjXm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Xp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Xp,SE,st);
  accumReconXm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Ym
  ///////////////////////////
  SE=st.GetEntry(ptype,Yp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjYm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjYm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Yp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Yp,SE,st);
  accumReconYm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Zm
  ///////////////////////////
  SE=st.GetEntry(ptype,Zp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjZm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjZm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Zp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Zp,SE,st);
  accumReconZm(result,Uchi);
    num++;
  }

  ///////////////////////////
  // Tm
  ///////////////////////////
  SE=st.GetEntry(ptype,Tp,sF);

  if (local && SE->_is_local ) {
  if ( SE->_is_local ) {
    chi_p = &chi;
    if ( SE->_permute ) {
      spProjTm(tmp,in._odata[SE->_offset]);
      permute(chi,tmp,ptype);
    } else {
      spProjTm(chi,in._odata[SE->_offset]);
    }
  } else {
    chi_p=&buf[SE->_offset];
  }

  if ( nonlocal && (!SE->_is_local) ) {
  Impl::multLink(Uchi,U._odata[sU],*chi_p,Tp,SE,st);
    chi=buf[SE->_offset];
  }

  if ( (local && SE->_is_local) || ( nonlocal && (!SE->_is_local)) ) {
    Impl::multLink(Uchi,U._odata[sU],chi,Tp,SE,st);
  accumReconTm(result,Uchi);
    num++;
  }

  if ( local ) {
  vstream(out._odata[sF],result);
    vstream(out._odata[sF],result*(-0.5));
  } else if ( num ) {
    vstream(out._odata[sF],out._odata[sF]+result*(-0.5));
  }
};
template<class Impl>
|
template<class Impl>
|
||||||
@ -593,19 +563,13 @@ void WilsonKernels<Impl>::DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
|||||||
spReconTm(result,Uchi);
|
spReconTm(result,Uchi);
|
||||||
}
|
}
|
||||||
|
|
||||||
vstream(out._odata[sF],result*(-0.5));
|
vstream(out._odata[sF],result);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if ( ! defined(AVX512) ) && ( ! defined(IMCI) )
|
|
||||||
template<class Impl>
|
|
||||||
void WilsonKernels<Impl>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local, bool nonlocal)
|
|
||||||
{
|
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
FermOpTemplateInstantiate(WilsonKernels);
|
FermOpTemplateInstantiate(WilsonKernels);
|
||||||
|
|
||||||
|
template class WilsonKernels<DomainWallRedBlack5dImplF>;
|
||||||
|
template class WilsonKernels<DomainWallRedBlack5dImplD>;
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
@ -38,37 +38,56 @@ namespace Grid {
|
|||||||
// Helper routines that implement Wilson stencil for a single site.
|
// Helper routines that implement Wilson stencil for a single site.
|
||||||
// Common to both the WilsonFermion and WilsonFermion5D
|
// Common to both the WilsonFermion and WilsonFermion5D
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
class WilsonKernelsStatic {
|
||||||
|
public:
|
||||||
|
// S-direction is INNERMOST and takes no part in the parity.
|
||||||
|
static int AsmOpt; // these are a temporary hack
|
||||||
|
static int HandOpt; // these are a temporary hack
|
||||||
|
};
|
||||||
|
|
||||||
template<class Impl> class WilsonKernels : public FermionOperator<Impl> {
|
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef FermionOperator<Impl> Base;
|
typedef FermionOperator<Impl> Base;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
int sF, int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in,FermionField &out,bool local= true, bool nonlocal=true);
|
int sF,int sU,int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptDhopDir(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
|
int sF,int sU,const FermionField &in, FermionField &out,int dirdisp,int gamma);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Specialised variants
|
||||||
|
void DiracOptGenericDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
void DiracOptGenericDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
int sF,int sU,int Ls, int Ns, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
|
||||||
|
|
||||||
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int sF,int sU,const FermionField &in, FermionField &out);
|
||||||
|
public:
|
||||||
|
|
||||||
WilsonKernels(const ImplParams &p= ImplParams());
|
WilsonKernels(const ImplParams &p= ImplParams());
|
||||||
|
|
||||||
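The new WilsonKernelsStatic base exposes AsmOpt and HandOpt as static flags (marked above as a temporary hack) so a run can steer the Dhop site call towards the generic, assembler or hand-unrolled variants declared in this class. The actual wiring of that choice lives in the kernel sources and is not part of this hunk; the snippet below is only a toy sketch of the shape such a dispatch takes, and a reminder that static data members still need an out-of-class definition somewhere:

#include <cstdio>

// Illustrative only: stand-in names, not Grid's real dispatch.
struct KernelsStatic {
  static int AsmOpt;
  static int HandOpt;
};
int KernelsStatic::AsmOpt  = 0;   // definitions required in some .cc file
int KernelsStatic::HandOpt = 0;

static void genericSite() { std::puts("generic kernel"); }
static void asmSite()     { std::puts("assembler kernel"); }
static void handSite()    { std::puts("hand-unrolled kernel"); }

static void dhopSite() {
  if      (KernelsStatic::AsmOpt)  asmSite();
  else if (KernelsStatic::HandOpt) handSite();
  else                             genericSite();
}

int main() {
  dhopSite();                     // generic by default
  KernelsStatic::HandOpt = 1;
  dhopSite();                     // hand-unrolled variant selected at runtime
  return 0;
}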
|
@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
|
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
|
||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
@ -26,320 +28,88 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
#include <Grid.h>
|
#include <Grid.h>
|
||||||
#if defined(AVX512) || defined (IMCI)
|
|
||||||
|
|
||||||
#include <simd/Avx512Asm.h>
|
|
||||||
|
|
||||||
#undef VLOAD
|
|
||||||
#undef VSTORE
|
|
||||||
#undef VMUL
|
|
||||||
#undef VMADD
|
|
||||||
#undef ZEND
|
|
||||||
#undef ZLOAD
|
|
||||||
#undef ZMUL
|
|
||||||
#undef ZMADD
|
|
||||||
#undef VZERO
|
|
||||||
#undef VTIMESI
|
|
||||||
#undef VTIMESMINUSI
|
|
||||||
|
|
||||||
#define VZERO(A) VZEROf(A)
|
|
||||||
#define VMOV(A,B) VMOVf(A,B)
|
|
||||||
#define VLOAD(OFF,PTR,DEST) VLOADf(OFF,PTR,DEST)
|
|
||||||
#define VSTORE(OFF,PTR,SRC) VSTOREf(OFF,PTR,SRC)
|
|
||||||
|
|
||||||
#define VADD(A,B,C) VADDf(A,B,C)
|
|
||||||
#define VSUB(A,B,C) VSUBf(A,B,C)
|
|
||||||
#define VMUL(Uri,Uir,Chi,UChi,Z) VMULf(Uri,Uir,Chi,UChi,Z)
|
|
||||||
#define VMADD(Uri,Uir,Chi,UChi,Z) VMADDf(Uri,Uir,Chi,UChi,Z)
|
|
||||||
|
|
||||||
#define VTIMESI(A,B,C) VTIMESIf(A,B,C)
|
|
||||||
#define VTIMESMINUSI(A,B,C) VTIMESMINUSIf(A,B,C)
|
|
||||||
#define VACCTIMESI(A,B,C) VACCTIMESIf(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI(A,B,C) VACCTIMESMINUSIf(A,B,C)
|
|
||||||
|
|
||||||
#define VTIMESI0(A,B,C) VTIMESI0f(A,B,C)
|
|
||||||
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
|
|
||||||
#define VACCTIMESI0(A,B,C) VACCTIMESI0f(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
|
|
||||||
|
|
||||||
#define VTIMESI1(A,B,C) VTIMESI1f(A,B,C)
|
|
||||||
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
|
|
||||||
#define VACCTIMESI1(A,B,C) VACCTIMESI1f(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
|
|
||||||
|
|
||||||
#define VTIMESI2(A,B,C) VTIMESI2f(A,B,C)
|
|
||||||
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)
|
|
||||||
#define VACCTIMESI2(A,B,C) VACCTIMESI2f(A,B,C)
|
|
||||||
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)
|
|
||||||
|
|
||||||
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
|
|
||||||
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)
|
|
||||||
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
|
|
||||||
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)
|
|
||||||
|
|
||||||
#define VPERM0(A,B) VPERM0f(A,B)
|
|
||||||
#define VPERM1(A,B) VPERM1f(A,B)
|
|
||||||
#define VPERM2(A,B) VPERM2f(A,B)
|
|
||||||
#define VPERM3(A,B) VPERM3f(A,B)
|
|
||||||
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
|
|
||||||
|
|
||||||
#define ZEND1(A,B,C) ZEND1f(A,B,C)
|
|
||||||
#define ZEND2(A,B,C) ZEND2f(A,B,C)
|
|
||||||
#define ZLOAD(A,B,C,D) ZLOADf(A,B,C,D)
|
|
||||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
|
||||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
|
||||||
|
|
||||||
#define ZMUL(A,B,C,D,E) ZMULf(A,B,C,D,E)
|
|
||||||
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
|
|
||||||
|
|
||||||
#define VADDMEM(O,A,B,C) VADDMEMf(O,A,B,C)
|
|
||||||
#define VSUBMEM(O,A,B,C) VSUBMEMf(O,A,B,C)
|
|
||||||
|
|
||||||
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
|
||||||
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
|
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// Default to no assembler implementation
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out,uint64_t *timers)
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
uint64_t now;
|
assert(0);
|
||||||
uint64_t first ;
|
}
|
||||||
int offset,local,perm, ptype;
|
|
||||||
const SiteHalfSpinor *pbuf = & buf[0];
|
#if defined(AVX512)
|
||||||
const SiteSpinor *plocal = & in._odata[0];
|
|
||||||
void *pf;
|
|
||||||
int osites = in._grid->oSites();
|
|
||||||
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
///////////////////////////////////////////////////////////
|
||||||
|
// If we are AVX512 specialise the single precision routine
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
|
||||||
//#define STAMP(i) timers[i] = __rdtsc() ;
|
#include <simd/Intel512wilson.h>
|
||||||
#define STAMP(i) //timers[i] = __rdtsc() ;
|
#include <simd/Intel512single.h>
|
||||||
|
|
||||||
MASK_REGS;
|
static Vector<vComplexF> signs;
|
||||||
|
|
||||||
first = __rdtsc();
|
int setupSigns(void ){
|
||||||
|
Vector<vComplexF> bother(2);
|
||||||
|
signs = bother;
|
||||||
|
vrsign(signs[0]);
|
||||||
|
visign(signs[1]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
static int signInit = setupSigns();
|
||||||
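The AVX512 branch keeps a two-entry static Vector<vComplexF> of sign masks, filled once by setupSigns() via the namespace-scope signInit initialiser, so that the +i and -i multiplies in the spin projectors can be done as a shuffle plus a sign-bit XOR (see the "times i => shuffle and xor the real part sign bit" comments in WilsonKernelsAsmBody.h below). A toy, self-contained sketch of that one-time initialisation idiom, with std::vector<float> standing in for Grid's Vector<vComplexF>:

#include <cstdio>
#include <vector>

// Namespace-scope static filled once, before main(), by its initialiser.
// (A function-local static would sidestep initialisation-order questions;
// this sketch just mirrors the shape of the code in the diff.)
static std::vector<float> signs;

static int setupSigns() {
  signs = {+1.0f, -1.0f};          // stand-ins for the vrsign/visign masks
  return 1;
}
static int signInit = setupSigns();

int main() {
  std::printf("signs[0]=%g signs[1]=%g (signInit=%d)\n",
              signs[0], signs[1], signInit);
  return 0;
}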
|
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||||
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||||
|
|
||||||
#if 0
|
template<>
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
|
#undef VMOVIDUP
|
||||||
|
#undef VMOVRDUP
|
||||||
|
#undef MAYBEPERM
|
||||||
|
#undef MULT_2SPIN
|
||||||
|
#define MAYBEPERM(A,B)
|
||||||
|
#define VMOVIDUP(A,B,C) VBCASTIDUPf(A,B,C)
|
||||||
|
#define VMOVRDUP(A,B,C) VBCASTRDUPf(A,B,C)
|
||||||
|
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
|
||||||
|
template<>
|
||||||
|
void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out)
|
||||||
|
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
|
||||||
|
|
||||||
LOAD64(%r9,pf);
|
|
||||||
__asm__(
|
|
||||||
VPREFETCH(0,%r9)
|
|
||||||
VPREFETCH(1,%r9)
|
|
||||||
VPREFETCH(2,%r9)
|
|
||||||
VPREFETCH(3,%r9)
|
|
||||||
VPREFETCH(4,%r9)
|
|
||||||
VPREFETCH(5,%r9)
|
|
||||||
VPREFETCH(6,%r9)
|
|
||||||
VPREFETCH(7,%r9)
|
|
||||||
VPREFETCH(8,%r9)
|
|
||||||
VPREFETCH(9,%r9)
|
|
||||||
VPREFETCH(10,%r9)
|
|
||||||
VPREFETCH(11,%r9) );
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Xm
|
template void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
offset = SE->_offset;
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
local = SE->_is_local;
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Ym,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
XM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFXM(Xm,pf);
|
|
||||||
}
|
|
||||||
XM_RECON;
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Zm,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
YM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFYM(Ym,pf);
|
|
||||||
}
|
|
||||||
YM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Tm,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
ZM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFZM(Zm,pf);
|
|
||||||
}
|
|
||||||
ZM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
SE=st.GetEntry(ptype,Tp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
TM_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFTM(Tm,pf);
|
|
||||||
}
|
|
||||||
TM_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Zp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
TP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR0; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFTP(Tp,pf);
|
|
||||||
}
|
|
||||||
TP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Zp
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Yp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
ZP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR1; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFZP(Zp,pf);
|
|
||||||
}
|
|
||||||
ZP_RECON_ACCUM;
|
|
||||||
|
|
||||||
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Xp,ss);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
YP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR2; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFYP(Yp,pf);
|
|
||||||
}
|
|
||||||
YP_RECON_ACCUM;
|
|
||||||
|
|
||||||
// Xp
|
|
||||||
perm = SE->_permute;
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
|
|
||||||
// PREFETCH_R(A);
|
|
||||||
|
|
||||||
// Prefetch
|
|
||||||
SE=st.GetEntry(ptype,Xm,(ss+1)%osites);
|
|
||||||
if (SE->_is_local) pf=(void *)&plocal[SE->_offset];
|
|
||||||
else pf=(void *)&pbuf[SE->_offset];
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
XP_PROJMEM(&plocal[offset]);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR3; // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(&pbuf[offset]);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_2SPIN_DIR_PFXP(Xp,pf);
|
|
||||||
}
|
|
||||||
XP_RECON_ACCUM;
|
|
||||||
|
|
||||||
debug:
|
|
||||||
SAVE_RESULT(&out._odata[ss]);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
template class WilsonKernels<WilsonImplF>;
|
|
||||||
template class WilsonKernels<WilsonImplD>;
|
|
||||||
|
|
||||||
|
template void WilsonKernels<WilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptAsmDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,int Ls,int Ns,const FermionField &in, FermionField &out);
|
||||||
}}
|
}}
|
||||||
#endif
|
|
||||||
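The file closes with explicit instantiations (template void WilsonKernels<...>::DiracOptAsmDhopSite(...)) so that the assembler bodies are compiled once, in this translation unit, for each implementation type, while other files only need the declaration. A toy, self-contained example of the explicit-instantiation idiom with stand-in names:

#include <cstdio>

// The definition lives here; naming the specialisations below forces their
// object code to be emitted in this translation unit so other files can link.
template <class Impl>
struct Kernels {
  void dhopSite(int s);
};

template <class Impl>
void Kernels<Impl>::dhopSite(int s) { std::printf("site %d\n", s); }

// Explicit instantiation of a single member for two implementation types.
template void Kernels<float>::dhopSite(int);
template void Kernels<double>::dhopSite(int);

int main() {
  Kernels<double> k;
  k.dhopSite(7);
  return 0;
}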
|
164
lib/qcd/action/fermion/WilsonKernelsAsmBody.h
Normal file
@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
int locala,perma, ptypea;
|
||||||
|
int localb,permb, ptypeb;
|
||||||
|
uint64_t basea, baseb;
|
||||||
|
uint64_t basex;
|
||||||
|
const uint64_t plocal =(uint64_t) & in._odata[0];
|
||||||
|
|
||||||
|
// vComplexF isigns[2] = { signs[0], signs[1] };
|
||||||
|
vComplexF *isigns = &signs[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xp
|
||||||
|
////////////////////////////////
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
basex = basea;
|
||||||
|
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXP(Xp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns);
|
||||||
|
XM_RECON;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Yp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYP(Yp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zp
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZP(Zp,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tp
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTP(Tp,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Xm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR3,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFXM(Xm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
XP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Ym
|
||||||
|
////////////////////////////////
|
||||||
|
basea = st.GetInfo(ptypea,locala,perma,Xp,ent,plocal); ent++;
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR2,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFYM(Ym,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Zm
|
||||||
|
////////////////////////////////
|
||||||
|
baseb = st.GetInfo(ptypeb,localb,permb,Yp,ent,plocal); ent++;
|
||||||
|
if ( locala ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_PROJMEM(basea);
|
||||||
|
MAYBEPERM(PERMUTE_DIR1,perma);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(basea);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFZM(Zm,baseb);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
////////////////////////////////
|
||||||
|
// Tm
|
||||||
|
////////////////////////////////
|
||||||
|
basea = (uint64_t)&out._odata[ss];
|
||||||
|
if ( localb ) {
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_PROJMEM(baseb);
|
||||||
|
MAYBEPERM(PERMUTE_DIR0,permb);
|
||||||
|
} else {
|
||||||
|
LOAD_CHI(baseb);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN_DIR_PFTM(Tm,basea);
|
||||||
|
}
|
||||||
|
LOAD64(%r10,isigns); // times i => shuffle and xor the real part sign bit
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
PREFETCH_CHIMU(basex);
|
||||||
|
SAVE_RESULT(&out._odata[ss]);
|
||||||
|
|
||||||
|
|
||||||
|
ss++;
|
||||||
|
}
|
||||||
|
sU++;
|
||||||
|
}
|
||||||
|
}
|
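WilsonKernelsAsmBody.h above is meant to be #included as the body of each assembler specialisation. It sweeps Ns four-dimensional sites of Ls fifth-dimension slices each, stepping the fermion index ss on every slice and the gauge index sU once per 4d site, and it alternates the stencil lookups between basea and baseb so the base address and prefetch for the next direction can be issued while the current direction is being multiplied. A toy, self-contained sketch of just the index bookkeeping, with stand-in names:

#include <cstdio>

// Mirrors the loop shape of WilsonKernelsAsmBody.h: Ns outer sites, Ls inner
// slices, fermion index per slice, gauge index per site. Toy code only.
static void dhopSiteBlock(int ss, int sU, int Ls, int Ns) {
  for (int site = 0; site < Ns; site++) {
    for (int s = 0; s < Ls; s++) {
      std::printf("fermion site %d uses gauge site %d\n", ss, sU);
      ss++;                        // 5d (fermion) index: one step per slice
    }
    sU++;                          // 4d (gauge) index: one step per site
  }
}

int main() {
  dhopSiteBlock(/*ss=*/0, /*sU=*/0, /*Ls=*/4, /*Ns=*/2);
  return 0;
}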
@ -54,14 +54,15 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
Chi_11 = ref()(1)(1);\
|
Chi_11 = ref()(1)(1);\
|
||||||
Chi_12 = ref()(1)(2);
|
Chi_12 = ref()(1)(2);
|
||||||
|
|
||||||
|
// To splat or not to splat depends on the implementation
|
||||||
#define MULT_2SPIN(A)\
|
#define MULT_2SPIN(A)\
|
||||||
auto & ref(U._odata[sU](A)); \
|
auto & ref(U._odata[sU](A)); \
|
||||||
U_00 = ref()(0,0);\
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
U_10 = ref()(1,0);\
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
U_20 = ref()(2,0);\
|
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||||
U_01 = ref()(0,1);\
|
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||||
U_11 = ref()(1,1); \
|
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||||
U_21 = ref()(2,1);\
|
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||||
UChi_00 = U_00*Chi_00;\
|
UChi_00 = U_00*Chi_00;\
|
||||||
UChi_10 = U_00*Chi_10;\
|
UChi_10 = U_00*Chi_10;\
|
||||||
UChi_01 = U_10*Chi_00;\
|
UChi_01 = U_10*Chi_00;\
|
||||||
@ -74,9 +75,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
UChi_11+= U_11*Chi_11;\
|
UChi_11+= U_11*Chi_11;\
|
||||||
UChi_02+= U_21*Chi_01;\
|
UChi_02+= U_21*Chi_01;\
|
||||||
UChi_12+= U_21*Chi_11;\
|
UChi_12+= U_21*Chi_11;\
|
||||||
U_00 = ref()(0,2);\
|
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||||
U_10 = ref()(1,2);\
|
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||||
U_20 = ref()(2,2);\
|
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||||
UChi_00+= U_00*Chi_02;\
|
UChi_00+= U_00*Chi_02;\
|
||||||
UChi_10+= U_00*Chi_12;\
|
UChi_10+= U_00*Chi_12;\
|
||||||
UChi_01+= U_10*Chi_02;\
|
UChi_01+= U_10*Chi_02;\
|
||||||
@ -84,6 +85,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
UChi_02+= U_20*Chi_02;\
|
UChi_02+= U_20*Chi_02;\
|
||||||
UChi_12+= U_20*Chi_12;
|
UChi_12+= U_20*Chi_12;
|
||||||
|
|
||||||
|
|
||||||
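In this hunk the MULT_2SPIN macro stops assigning gauge-link elements directly and instead routes them through Impl::loadLinkElement, so that, per the "To splat or not to splat" comment, an implementation can either copy an already vectorised link element or broadcast ("splat") a scalar one across the SIMD lanes, which is presumably what the DomainWallRedBlack5d layout wants. A toy, self-contained illustration of that indirection, with std::array standing in for a SIMD register and the types being stand-ins rather than Grid's:

#include <array>
#include <cstdio>

using SimdVec = std::array<float, 4>;

// One implementation copies a vectorised link element as-is ...
struct VectorLinkImpl {
  static void loadLinkElement(SimdVec &reg, const SimdVec &src) { reg = src; }
};

// ... another broadcasts a scalar link value across all lanes.
struct ScalarLinkImpl {
  static void loadLinkElement(SimdVec &reg, float src) { reg.fill(src); }
};

int main() {
  SimdVec u{};
  VectorLinkImpl::loadLinkElement(u, SimdVec{1.f, 2.f, 3.f, 4.f}); // copy
  ScalarLinkImpl::loadLinkElement(u, 0.5f);                        // splat
  std::printf("u[0]=%g u[3]=%g\n", u[0], u[3]);
  return 0;
}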
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_00,Chi_00);\
|
permute##dir(Chi_00,Chi_00);\
|
||||||
permute##dir(Chi_01,Chi_01);\
|
permute##dir(Chi_01,Chi_01);\
|
||||||
@ -309,546 +311,10 @@ namespace Grid {
|
|||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
|
||||||
{
|
|
||||||
// std::cout << "Hand op Dhop "<<std::endl;
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
|
||||||
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
|
||||||
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
|
||||||
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
|
||||||
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
|
||||||
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
|
||||||
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
|
||||||
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
|
||||||
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd Chi_00; // two spinor; 6 regs
|
|
||||||
REGISTER Simd Chi_01;
|
|
||||||
REGISTER Simd Chi_02;
|
|
||||||
|
|
||||||
REGISTER Simd Chi_10;
|
|
||||||
REGISTER Simd Chi_11;
|
|
||||||
REGISTER Simd Chi_12; // 14 left
|
|
||||||
|
|
||||||
REGISTER Simd UChi_00; // two spinor; 6 regs
|
|
||||||
REGISTER Simd UChi_01;
|
|
||||||
REGISTER Simd UChi_02;
|
|
||||||
|
|
||||||
REGISTER Simd UChi_10;
|
|
||||||
REGISTER Simd UChi_11;
|
|
||||||
REGISTER Simd UChi_12; // 8 left
|
|
||||||
|
|
||||||
REGISTER Simd U_00; // two rows of U matrix
|
|
||||||
REGISTER Simd U_10;
|
|
||||||
REGISTER Simd U_20;
|
|
||||||
REGISTER Simd U_01;
|
|
||||||
REGISTER Simd U_11;
|
|
||||||
REGISTER Simd U_21; // 2 reg left.
|
|
||||||
|
|
||||||
#define Chimu_00 Chi_00
|
|
||||||
#define Chimu_01 Chi_01
|
|
||||||
#define Chimu_02 Chi_02
|
|
||||||
#define Chimu_10 Chi_10
|
|
||||||
#define Chimu_11 Chi_11
|
|
||||||
#define Chimu_12 Chi_12
|
|
||||||
#define Chimu_20 UChi_00
|
|
||||||
#define Chimu_21 UChi_01
|
|
||||||
#define Chimu_22 UChi_02
|
|
||||||
#define Chimu_30 UChi_10
|
|
||||||
#define Chimu_31 UChi_11
|
|
||||||
#define Chimu_32 UChi_12
|
|
||||||
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
int offset, ptype;
|
|
||||||
int num = 0;
|
|
||||||
|
|
||||||
// Xp
|
|
||||||
SE=st.GetEntry(ptype,Xp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
XP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Xp);
|
|
||||||
XP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Yp
|
|
||||||
SE=st.GetEntry(ptype,Yp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
YP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Yp);
|
|
||||||
YP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Zp
|
|
||||||
SE=st.GetEntry(ptype,Zp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
ZP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Zp);
|
|
||||||
ZP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
SE=st.GetEntry(ptype,Tp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
TP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Tp);
|
|
||||||
TP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Xm
|
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
XM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Xm);
|
|
||||||
XM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
SE=st.GetEntry(ptype,Ym,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
YM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Ym);
|
|
||||||
YM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
SE=st.GetEntry(ptype,Zm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
ZM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Zm);
|
|
||||||
ZM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
SE=st.GetEntry(ptype,Tm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
TM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Tm);
|
|
||||||
TM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
SiteSpinor & ref (out._odata[ss]);
|
|
||||||
if ( Local ) {
|
|
||||||
vstream(ref()(0)(0),result_00*(-0.5));
|
|
||||||
vstream(ref()(0)(1),result_01*(-0.5));
|
|
||||||
vstream(ref()(0)(2),result_02*(-0.5));
|
|
||||||
vstream(ref()(1)(0),result_10*(-0.5));
|
|
||||||
vstream(ref()(1)(1),result_11*(-0.5));
|
|
||||||
vstream(ref()(1)(2),result_12*(-0.5));
|
|
||||||
vstream(ref()(2)(0),result_20*(-0.5));
|
|
||||||
vstream(ref()(2)(1),result_21*(-0.5));
|
|
||||||
vstream(ref()(2)(2),result_22*(-0.5));
|
|
||||||
vstream(ref()(3)(0),result_30*(-0.5));
|
|
||||||
vstream(ref()(3)(1),result_31*(-0.5));
|
|
||||||
vstream(ref()(3)(2),result_32*(-0.5));
|
|
||||||
return 1;
|
|
||||||
} else if ( num ) {
|
|
||||||
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
|
||||||
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
|
||||||
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
|
|
||||||
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
|
|
||||||
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
|
|
||||||
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
|
|
||||||
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
|
|
||||||
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
|
|
||||||
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
|
|
||||||
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
|
||||||
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
|
||||||
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
|
||||||
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
|
||||||
{
|
|
||||||
// std::cout << "Hand op Dhop "<<std::endl;
|
|
||||||
typedef typename Simd::scalar_type S;
|
|
||||||
typedef typename Simd::vector_type V;
|
|
||||||
|
|
||||||
REGISTER Simd result_00 ; zeroit(result_00); // 12 regs on knc
|
|
||||||
REGISTER Simd result_01 ; zeroit(result_01); // 12 regs on knc
|
|
||||||
REGISTER Simd result_02 ; zeroit(result_02); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_10 ; zeroit(result_10); // 12 regs on knc
|
|
||||||
REGISTER Simd result_11 ; zeroit(result_11); // 12 regs on knc
|
|
||||||
REGISTER Simd result_12 ; zeroit(result_12); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_20 ; zeroit(result_20); // 12 regs on knc
|
|
||||||
REGISTER Simd result_21 ; zeroit(result_21); // 12 regs on knc
|
|
||||||
REGISTER Simd result_22 ; zeroit(result_22); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd result_30 ; zeroit(result_30); // 12 regs on knc
|
|
||||||
REGISTER Simd result_31 ; zeroit(result_31); // 12 regs on knc
|
|
||||||
REGISTER Simd result_32 ; zeroit(result_32); // 12 regs on knc
|
|
||||||
|
|
||||||
REGISTER Simd Chi_00; // two spinor; 6 regs
|
|
||||||
REGISTER Simd Chi_01;
|
|
||||||
REGISTER Simd Chi_02;
|
|
||||||
|
|
||||||
REGISTER Simd Chi_10;
|
|
||||||
REGISTER Simd Chi_11;
|
|
||||||
REGISTER Simd Chi_12; // 14 left
|
|
||||||
|
|
||||||
REGISTER Simd UChi_00; // two spinor; 6 regs
|
|
||||||
REGISTER Simd UChi_01;
|
|
||||||
REGISTER Simd UChi_02;
|
|
||||||
|
|
||||||
REGISTER Simd UChi_10;
|
|
||||||
REGISTER Simd UChi_11;
|
|
||||||
REGISTER Simd UChi_12; // 8 left
|
|
||||||
|
|
||||||
REGISTER Simd U_00; // two rows of U matrix
|
|
||||||
REGISTER Simd U_10;
|
|
||||||
REGISTER Simd U_20;
|
|
||||||
REGISTER Simd U_01;
|
|
||||||
REGISTER Simd U_11;
|
|
||||||
REGISTER Simd U_21; // 2 reg left.
|
|
||||||
|
|
||||||
#define Chimu_00 Chi_00
|
|
||||||
#define Chimu_01 Chi_01
|
|
||||||
#define Chimu_02 Chi_02
|
|
||||||
#define Chimu_10 Chi_10
|
|
||||||
#define Chimu_11 Chi_11
|
|
||||||
#define Chimu_12 Chi_12
|
|
||||||
#define Chimu_20 UChi_00
|
|
||||||
#define Chimu_21 UChi_01
|
|
||||||
#define Chimu_22 UChi_02
|
|
||||||
#define Chimu_30 UChi_10
|
|
||||||
#define Chimu_31 UChi_11
|
|
||||||
#define Chimu_32 UChi_12
|
|
||||||
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
int offset, ptype;
|
|
||||||
int num = 0;
|
|
||||||
|
|
||||||
// Xp
|
|
||||||
SE=st.GetEntry(ptype,Xp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
XM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Xp);
|
|
||||||
XM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Yp
|
|
||||||
SE=st.GetEntry(ptype,Yp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
YM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Yp);
|
|
||||||
YM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Zp
|
|
||||||
SE=st.GetEntry(ptype,Zp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
ZM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Zp);
|
|
||||||
ZM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
SE=st.GetEntry(ptype,Tp,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
TM_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Tp);
|
|
||||||
TM_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Xm
|
|
||||||
SE=st.GetEntry(ptype,Xm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
XP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Xm);
|
|
||||||
XP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
SE=st.GetEntry(ptype,Ym,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
YP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Ym);
|
|
||||||
YP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
SE=st.GetEntry(ptype,Zm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
ZP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Zm);
|
|
||||||
ZP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
SE=st.GetEntry(ptype,Tm,ss);
|
|
||||||
offset = SE->_offset;
|
|
||||||
|
|
||||||
if (Local && SE->_is_local ) {
|
|
||||||
LOAD_CHIMU;
|
|
||||||
TP_PROJ;
|
|
||||||
if ( SE->_permute ) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if ( Nonlocal && (!SE->_is_local) ) {
|
|
||||||
LOAD_CHI;
|
|
||||||
}
|
|
||||||
if ( (Local && SE->_is_local) || ( Nonlocal && (!SE->_is_local)) ) {
|
|
||||||
MULT_2SPIN(Tm);
|
|
||||||
TP_RECON_ACCUM;
|
|
||||||
num++;
|
|
||||||
}
|
|
||||||
|
|
||||||
SiteSpinor & ref (out._odata[ss]);
|
|
||||||
if ( Local ) {
|
|
||||||
vstream(ref()(0)(0),result_00*(-0.5));
|
|
||||||
vstream(ref()(0)(1),result_01*(-0.5));
|
|
||||||
vstream(ref()(0)(2),result_02*(-0.5));
|
|
||||||
vstream(ref()(1)(0),result_10*(-0.5));
|
|
||||||
vstream(ref()(1)(1),result_11*(-0.5));
|
|
||||||
vstream(ref()(1)(2),result_12*(-0.5));
|
|
||||||
vstream(ref()(2)(0),result_20*(-0.5));
|
|
||||||
vstream(ref()(2)(1),result_21*(-0.5));
|
|
||||||
vstream(ref()(2)(2),result_22*(-0.5));
|
|
||||||
vstream(ref()(3)(0),result_30*(-0.5));
|
|
||||||
vstream(ref()(3)(1),result_31*(-0.5));
|
|
||||||
vstream(ref()(3)(2),result_32*(-0.5));
|
|
||||||
return 1;
|
|
||||||
} else if ( num ) {
|
|
||||||
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
|
||||||
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
|
||||||
vstream(ref()(0)(2),ref()(0)(2)+result_02*(-0.5));
|
|
||||||
vstream(ref()(1)(0),ref()(1)(0)+result_10*(-0.5));
|
|
||||||
vstream(ref()(1)(1),ref()(1)(1)+result_11*(-0.5));
|
|
||||||
vstream(ref()(1)(2),ref()(1)(2)+result_12*(-0.5));
|
|
||||||
vstream(ref()(2)(0),ref()(2)(0)+result_20*(-0.5));
|
|
||||||
vstream(ref()(2)(1),ref()(2)(1)+result_21*(-0.5));
|
|
||||||
vstream(ref()(2)(2),ref()(2)(2)+result_22*(-0.5));
|
|
||||||
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
|
||||||
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
|
||||||
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
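The two functions deleted above are the bool Local/Nonlocal flavour of the hand-unrolled kernels: with Local set they overwrote the site with the contributions from locally available neighbours, with Nonlocal set they accumulated the off-node contributions on top, and the int return reported whether anything was written, presumably so a driver could overlap the halo exchange with the local pass. A toy, self-contained sketch of that two-pass call pattern, with stand-in arguments rather than Grid's stencil and field types:

#include <cstdio>

// Pass 1 (Local): overwrite with the on-node sum. Pass 2 (Nonlocal): add the
// halo sum once it has arrived. The body is a stand-in; the real kernels test
// SE->_is_local per direction and apply the -0.5 factor seen in the deleted
// code.
static int handDhopSite(bool Local, bool Nonlocal, double localSum,
                        double haloSum, double *out) {
  int wrote = 0;
  if (Local)    { *out  = -0.5 * localSum; wrote = 1; }
  if (Nonlocal) { *out += -0.5 * haloSum;  wrote = 1; }
  return wrote;
}

int main() {
  double site = 0.0;
  handDhopSite(true,  false, 6.0, 0.0, &site);  // local pass while comms fly
  // ... wait for the halo exchange to complete ...
  handDhopSite(false, true,  0.0, 2.0, &site);  // fold in the off-node terms
  std::printf("site = %g\n", site);             // -0.5*6 - 0.5*2 = -4
  return 0;
}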
|
|
||||||
/*
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
@ -1073,89 +539,346 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
|
|||||||
|
|
||||||
{
|
{
|
||||||
SiteSpinor & ref (out._odata[ss]);
|
SiteSpinor & ref (out._odata[ss]);
|
||||||
vstream(ref()(0)(0),result_00*(-0.5));
|
vstream(ref()(0)(0),result_00);
|
||||||
vstream(ref()(0)(1),result_01*(-0.5));
|
vstream(ref()(0)(1),result_01);
|
||||||
vstream(ref()(0)(2),result_02*(-0.5));
|
vstream(ref()(0)(2),result_02);
|
||||||
vstream(ref()(1)(0),result_10*(-0.5));
|
vstream(ref()(1)(0),result_10);
|
||||||
vstream(ref()(1)(1),result_11*(-0.5));
|
vstream(ref()(1)(1),result_11);
|
||||||
vstream(ref()(1)(2),result_12*(-0.5));
|
vstream(ref()(1)(2),result_12);
|
||||||
vstream(ref()(2)(0),result_20*(-0.5));
|
vstream(ref()(2)(0),result_20);
|
||||||
vstream(ref()(2)(1),result_21*(-0.5));
|
vstream(ref()(2)(1),result_21);
|
||||||
vstream(ref()(2)(2),result_22*(-0.5));
|
vstream(ref()(2)(2),result_22);
|
||||||
vstream(ref()(3)(0),result_30*(-0.5));
|
vstream(ref()(3)(0),result_30);
|
||||||
vstream(ref()(3)(1),result_31*(-0.5));
|
vstream(ref()(3)(1),result_31);
|
||||||
vstream(ref()(3)(2),result_32*(-0.5));
|
vstream(ref()(3)(2),result_32);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
// std::cout << "Hand op Dhop "<<std::endl;
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
REGISTER Simd result_00; // 12 regs on knc
|
||||||
|
REGISTER Simd result_01;
|
||||||
|
REGISTER Simd result_02;
|
||||||
|
|
||||||
|
REGISTER Simd result_10;
|
||||||
|
REGISTER Simd result_11;
|
||||||
|
REGISTER Simd result_12;
|
||||||
|
|
||||||
|
REGISTER Simd result_20;
|
||||||
|
REGISTER Simd result_21;
|
||||||
|
REGISTER Simd result_22;
|
||||||
|
|
||||||
|
REGISTER Simd result_30;
|
||||||
|
REGISTER Simd result_31;
|
||||||
|
REGISTER Simd result_32; // 20 left
|
||||||
|
|
||||||
|
REGISTER Simd Chi_00; // two spinor; 6 regs
|
||||||
|
REGISTER Simd Chi_01;
|
||||||
|
REGISTER Simd Chi_02;
|
||||||
|
|
||||||
|
REGISTER Simd Chi_10;
|
||||||
|
REGISTER Simd Chi_11;
|
||||||
|
REGISTER Simd Chi_12; // 14 left
|
||||||
|
|
||||||
|
REGISTER Simd UChi_00; // two spinor; 6 regs
|
||||||
|
REGISTER Simd UChi_01;
|
||||||
|
REGISTER Simd UChi_02;
|
||||||
|
|
||||||
|
REGISTER Simd UChi_10;
|
||||||
|
REGISTER Simd UChi_11;
|
||||||
|
REGISTER Simd UChi_12; // 8 left
|
||||||
|
|
||||||
|
REGISTER Simd U_00; // two rows of U matrix
|
||||||
|
REGISTER Simd U_10;
|
||||||
|
REGISTER Simd U_20;
|
||||||
|
REGISTER Simd U_01;
|
||||||
|
REGISTER Simd U_11;
|
||||||
|
REGISTER Simd U_21; // 2 reg left.
|
||||||
|
|
||||||
|
#define Chimu_00 Chi_00
|
||||||
|
#define Chimu_01 Chi_01
|
||||||
|
#define Chimu_02 Chi_02
|
||||||
|
#define Chimu_10 Chi_10
|
||||||
|
#define Chimu_11 Chi_11
|
||||||
|
#define Chimu_12 Chi_12
|
||||||
|
#define Chimu_20 UChi_00
|
||||||
|
#define Chimu_21 UChi_01
|
||||||
|
#define Chimu_22 UChi_02
|
||||||
|
#define Chimu_30 UChi_10
|
||||||
|
#define Chimu_31 UChi_11
|
||||||
|
#define Chimu_32 UChi_12
|
||||||
|
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
// Xp
|
||||||
|
SE=st.GetEntry(ptype,Xp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
XP_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Xp);
|
||||||
|
}
|
||||||
|
XP_RECON;
|
||||||
|
|
||||||
|
// Yp
|
||||||
|
SE=st.GetEntry(ptype,Yp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
YP_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Yp);
|
||||||
|
}
|
||||||
|
YP_RECON_ACCUM;
|
||||||
|
|
||||||
|
|
||||||
|
// Zp
|
||||||
|
SE=st.GetEntry(ptype,Zp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
ZP_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Zp);
|
||||||
|
}
|
||||||
|
ZP_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Tp
|
||||||
|
SE=st.GetEntry(ptype,Tp,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
TP_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Tp);
|
||||||
|
}
|
||||||
|
TP_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Xm
|
||||||
|
SE=st.GetEntry(ptype,Xm,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
XM_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Xm);
|
||||||
|
}
|
||||||
|
XM_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Ym
|
||||||
|
SE=st.GetEntry(ptype,Ym,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
YM_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Ym);
|
||||||
|
}
|
||||||
|
YM_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Zm
|
||||||
|
SE=st.GetEntry(ptype,Zm,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
ZM_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Zm);
|
||||||
|
}
|
||||||
|
ZM_RECON_ACCUM;
|
||||||
|
|
||||||
|
// Tm
|
||||||
|
SE=st.GetEntry(ptype,Tm,ss);
|
||||||
|
offset = SE->_offset;
|
||||||
|
local = SE->_is_local;
|
||||||
|
perm = SE->_permute;
|
||||||
|
|
||||||
|
if ( local ) {
|
||||||
|
LOAD_CHIMU;
|
||||||
|
TM_PROJ;
|
||||||
|
if ( perm) {
|
||||||
|
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
LOAD_CHI;
|
||||||
|
}
|
||||||
|
{
|
||||||
|
MULT_2SPIN(Tm);
|
||||||
|
}
|
||||||
|
TM_RECON_ACCUM;
|
||||||
|
|
||||||
|
{
|
||||||
|
SiteSpinor & ref (out._odata[ss]);
|
||||||
|
vstream(ref()(0)(0),result_00);
|
||||||
|
vstream(ref()(0)(1),result_01);
|
||||||
|
vstream(ref()(0)(2),result_02);
|
||||||
|
vstream(ref()(1)(0),result_10);
|
||||||
|
vstream(ref()(1)(1),result_11);
|
||||||
|
vstream(ref()(1)(2),result_12);
|
||||||
|
vstream(ref()(2)(0),result_20);
|
||||||
|
vstream(ref()(2)(1),result_21);
|
||||||
|
vstream(ref()(2)(2),result_22);
|
||||||
|
vstream(ref()(3)(0),result_30);
|
||||||
|
vstream(ref()(3)(1),result_31);
|
||||||
|
vstream(ref()(3)(2),result_32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// Specialise Gparity to simple implementation
|
// Specialise Gparity to simple implementation
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
template<>
|
template<>
|
||||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // returns void, will template override for Wilson Nc=3
|
assert(0);
|
||||||
//check consistency of return types between these functions and the ones in WilsonKernels.cc
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
assert(0);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSite(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
assert(0);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int sF,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
int sF,int sU,const FermionField &in, FermionField &out)
|
||||||
{
|
{
|
||||||
DiracOptDhopSiteDag(st,U,buf,sF,sU,in,out); // will template override for Wilson Nc=3
|
assert(0);
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
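Just above, the G-parity hand kernels are reduced to assert(0) stubs where the old code forwarded to the generic DiracOptDhopSite: as the "Need Nc=3 though" comment notes, the hand-unrolled code is written for the plain Nc=3 Wilson layout, so selecting the hand path for a G-parity implementation would now abort unless the dispatch avoids it. A toy, hypothetical sketch of one way a caller could keep unsupported implementations off that path at compile time (the trait name and types are invented for illustration, not Grid's):

#include <cstdio>

struct WilsonLikeImpl  { static constexpr bool hasHandKernel = true;  };
struct GparityLikeImpl { static constexpr bool hasHandKernel = false; };

// Route only implementations that advertise a hand kernel to the fast path.
template <class Impl>
void dhopSite() {
  if (Impl::hasHandKernel) std::puts("hand-unrolled path");
  else                     std::puts("generic path");
}

int main() {
  dhopSite<WilsonLikeImpl>();
  dhopSite<GparityLikeImpl>();
  return 0;
}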
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
////////////// Wilson ; uses this implementation /////////////////////
|
||||||
|
// Need Nc=3 though //
|
||||||
|
|
||||||
|
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out,bool l,bool n);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<WilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool n);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
template int WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
template void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
int ss,int sU,const FermionField &in, FermionField &out, bool l, bool nl);
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
template void WilsonKernels<DomainWallRedBlack5dImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||||
|
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
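The G-parity specialisations above now return void and simply assert, because no hand-unrolled kernel exists yet for the two-flavour G-parity layout; callers have to use the generic DiracOptDhopSite path instead. A minimal standalone sketch of this specialise-and-assert dispatch pattern (illustrative types and names only, not Grid's real kernel signatures):

#include <cassert>
#include <iostream>

// Sketch only: mirrors the "specialise and assert" pattern above with made-up types.
template <class Impl>
struct Kernels {
  static void DhopSite()     { std::cout << "generic kernel\n"; }  // always available
  static void HandDhopSite() { DhopSite(); }                       // hand-unrolled entry point
};

struct GparityImpl {};

// No hand-unrolled kernel for this implementation: abort if it is called directly.
template <>
void Kernels<GparityImpl>::HandDhopSite() { assert(0); }

int main() {
  Kernels<int>::HandDhopSite();            // falls through to the generic kernel
  // Kernels<GparityImpl>::HandDhopSite(); // would assert
}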
@@ -42,7 +42,9 @@ template<class Gimpl> class WilsonLoops;
 #define INHERIT_GIMPL_TYPES(GImpl) \
   typedef typename GImpl::Simd Simd;\
   typedef typename GImpl::GaugeLinkField GaugeLinkField;\
-  typedef typename GImpl::GaugeField GaugeField;
+  typedef typename GImpl::GaugeField GaugeField;\
+  typedef typename GImpl::SiteGaugeField SiteGaugeField;\
+  typedef typename GImpl::SiteGaugeLink SiteGaugeLink;

 //
 template<class S,int Nrepresentation=Nc>
@@ -92,13 +92,13 @@ public:

     // Create integrator, including the smearing policy
     // Smearing policy
-    std::cout << GridLogMessage << " Creating the Stout class\n";
+    std::cout << GridLogDebug << " Creating the Stout class\n";
-    double rho = 0.1;  // smearing parameter
+    double rho = 0.1;  // smearing parameter, now hardcoded
     int Nsmear = 1;    // number of smearing levels
     Smear_Stout<Gimpl> Stout(rho);
-    std::cout << GridLogMessage << " Creating the SmearedConfiguration class\n";
+    std::cout << GridLogDebug << " Creating the SmearedConfiguration class\n";
     SmearedConfiguration<Gimpl> SmearingPolicy(UGrid, Nsmear, Stout);
-    std::cout << GridLogMessage << " done\n";
+    std::cout << GridLogDebug << " done\n";
     //////////////
     typedef MinimumNorm2<GaugeField, SmearedConfiguration<Gimpl> > IntegratorType;// change here to change the algorithm
     IntegratorParameters MDpar(20);
@@ -116,27 +116,27 @@ public:

     if ( StartType == HotStart ) {
       // Hot start
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
       HMCpar.MetropolisTest = true;
       sRNG.SeedFixedIntegers(SerSeed);
       pRNG.SeedFixedIntegers(ParSeed);
       SU3::HotConfiguration(pRNG, U);
     } else if ( StartType == ColdStart ) {
       // Cold start
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
       HMCpar.MetropolisTest = true;
       sRNG.SeedFixedIntegers(SerSeed);
       pRNG.SeedFixedIntegers(ParSeed);
       SU3::ColdConfiguration(pRNG, U);
     } else if ( StartType == TepidStart ) {
       // Tepid start
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
       HMCpar.MetropolisTest = true;
       sRNG.SeedFixedIntegers(SerSeed);
       pRNG.SeedFixedIntegers(ParSeed);
       SU3::TepidConfiguration(pRNG, U);
     } else if ( StartType == CheckpointStart ) {
-      HMCpar.NoMetropolisUntil =0;
+      HMCpar.NoMetropolisUntil =10;
       HMCpar.MetropolisTest = true;
       // CheckpointRestart
       Checkpoint.CheckpointRestore(StartTraj, U, sRNG, pRNG);
@@ -61,6 +61,31 @@ namespace Grid {
     " "
   };

+  SpinMatrix makeGammaProd(const unsigned int i)
+  {
+    SpinMatrix g;
+
+    g = 1.;
+    if (i & 0x1)
+    {
+      g = g*Gamma(Gamma::GammaMatrix::GammaX);
+    }
+    if (i & 0x2)
+    {
+      g = g*Gamma(Gamma::GammaMatrix::GammaY);
+    }
+    if (i & 0x4)
+    {
+      g = g*Gamma(Gamma::GammaMatrix::GammaZ);
+    }
+    if (i & 0x8)
+    {
+      g = g*Gamma(Gamma::GammaMatrix::GammaT);
+    }
+
+    return g;
+  }

   // void sprojMul( vHalfSpinColourVector &out,vColourMatrix &u, vSpinColourVector &in){
   //   vHalfSpinColourVector hspin;
   //   spProjXp(hspin,in);
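makeGammaProd decodes a 4-bit index into a product of gamma matrices, one bit per direction (bit 0 selects GammaX, bit 1 GammaY, bit 2 GammaZ, bit 3 GammaT), following the Chroma convention declared in the header below. A standalone sketch of the same bit decoding, building a label instead of a Grid SpinMatrix (illustrative only):

#include <iostream>
#include <string>

// Illustration of the bitmask decomposition used by makeGammaProd above.
std::string gammaProdLabel(unsigned int i) {
  std::string g = "1";
  if (i & 0x1) g += "*GammaX";
  if (i & 0x2) g += "*GammaY";
  if (i & 0x4) g += "*GammaZ";
  if (i & 0x8) g += "*GammaT";
  return g;
}

int main() {
  for (unsigned int i = 0; i < 16; ++i)
    std::cout << i << " -> " << gammaProdLabel(i) << "\n"; // e.g. 15 -> 1*GammaX*GammaY*GammaZ*GammaT
}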
@@ -83,6 +83,9 @@ namespace QCD {

   };

+  // Make gamma products (Chroma convention)
+  SpinMatrix makeGammaProd(const unsigned int i);
+
   /* Gx
    *  0 0  0  i
    *  0 0  i  0
@@ -608,14 +608,14 @@ Note that in step D setting B ~ X - A and using B in place of A in step E will g
     LatticeMatrix Umu(out._grid);
     for(int mu=0;mu<Nd;mu++){
       LieRandomize(pRNG,Umu,0.01);
-      pokeLorentz(out,Umu,mu);
+      PokeIndex<LorentzIndex>(out,Umu,mu);
     }
   }
   static void ColdConfiguration(GridParallelRNG &pRNG,LatticeGaugeField &out){
     LatticeMatrix Umu(out._grid);
     Umu=1.0;
     for(int mu=0;mu<Nd;mu++){
-      pokeLorentz(out,Umu,mu);
+      PokeIndex<LorentzIndex>(out,Umu,mu);
     }
   }

@@ -41,7 +41,11 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFourDimRedBlackGrid(const GridCartesia
 {
   return new GridRedBlackCartesian(FourDimGrid);
 }
+GridCartesian *SpaceTimeGrid::makeFourDimDWFGrid(const std::vector<int> & latt,const std::vector<int> &mpi)
+{
+  std::vector<int> simd(4,1);
+  return makeFourDimGrid(latt,simd,mpi);
+}
 GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian *FourDimGrid)
 {
   int N4=FourDimGrid->_ndimension;
@@ -58,6 +62,7 @@ GridCartesian *SpaceTimeGrid::makeFiveDimGrid(int Ls,const GridCartesian
   return new GridCartesian(latt5,simd5,mpi5);
 }
+

 GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
 {
   int N4=FourDimGrid->_ndimension;
@@ -76,4 +81,42 @@ GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimRedBlackGrid(int Ls,const GridC
   return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
 }

+GridCartesian *SpaceTimeGrid::makeFiveDimDWFGrid(int Ls,const GridCartesian *FourDimGrid)
+{
+  int N4    = FourDimGrid->_ndimension;
+  int nsimd = FourDimGrid->Nsimd();
+
+  std::vector<int> latt5(1,Ls);
+  std::vector<int> simd5(1,nsimd);
+  std::vector<int> mpi5 (1,1);
+
+  for(int d=0;d<N4;d++){
+    latt5.push_back(FourDimGrid->_fdimensions[d]);
+    simd5.push_back(1);
+    mpi5.push_back(FourDimGrid->_processors[d]);
+  }
+  return new GridCartesian(latt5,simd5,mpi5);
+}
+
+GridRedBlackCartesian *SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid)
+{
+  int N4    = FourDimGrid->_ndimension;
+  int nsimd = FourDimGrid->Nsimd();
+  int cbd   = 0;
+  std::vector<int> latt5(1,Ls);
+  std::vector<int> simd5(1,nsimd);
+  std::vector<int> mpi5 (1,1);
+  std::vector<int> cb5  (1,1);
+
+  for(int d=0;d<N4;d++){
+    latt5.push_back(FourDimGrid->_fdimensions[d]);
+    simd5.push_back(1);
+    mpi5.push_back(FourDimGrid->_processors[d]);
+    cb5.push_back(1);
+  }
+  return new GridRedBlackCartesian(latt5,simd5,mpi5,cb5,cbd);
+}
+
 }}
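The new makeFiveDimDWFGrid / makeFiveDimDWFRedBlackGrid factories prepend Ls as dimension 0 and put all SIMD lanes into that fifth dimension (simd5 = {Nsimd,1,1,1,1}), while reusing the four-dimensional MPI decomposition unchanged. A standalone sketch of how the layout vectors are assembled, using plain std::vector arithmetic and no Grid types (illustrative only):

#include <cassert>
#include <iostream>
#include <vector>

// Mirrors the layout logic of makeFiveDimDWFGrid above: Ls becomes dimension 0,
// all SIMD lanes sit in that dimension, and the 4d MPI decomposition is reused.
std::vector<std::vector<int>> fiveDimDWFLayout(int Ls, int nsimd,
                                               const std::vector<int>& latt4,
                                               const std::vector<int>& mpi4) {
  assert(latt4.size() == 4 && mpi4.size() == 4);
  std::vector<int> latt5(1, Ls), simd5(1, nsimd), mpi5(1, 1);
  for (int d = 0; d < 4; ++d) {
    latt5.push_back(latt4[d]);
    simd5.push_back(1);
    mpi5.push_back(mpi4[d]);
  }
  return {latt5, simd5, mpi5};
}

int main() {
  auto v = fiveDimDWFLayout(16, 8, {8, 8, 8, 8}, {1, 1, 1, 2});
  for (auto& x : v) { for (int e : x) std::cout << e << " "; std::cout << "\n"; }
}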
@@ -35,9 +35,14 @@ class SpaceTimeGrid {

   static GridCartesian         *makeFourDimGrid(const std::vector<int> & latt,const std::vector<int> &simd,const std::vector<int> &mpi);
   static GridRedBlackCartesian *makeFourDimRedBlackGrid (const GridCartesian *FourDimGrid);

   static GridCartesian         *makeFiveDimGrid        (int Ls,const GridCartesian *FourDimGrid);
   static GridRedBlackCartesian *makeFiveDimRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);

+  static GridCartesian         *makeFiveDimDWFGrid        (int Ls,const GridCartesian *FourDimGrid);
+  static GridRedBlackCartesian *makeFiveDimDWFRedBlackGrid(int Ls,const GridCartesian *FourDimGrid);
+  static GridCartesian         *makeFourDimDWFGrid        (const std::vector<int> & latt,const std::vector<int> &mpi);
+
 };

 }}
@@ -101,15 +101,15 @@ namespace Grid {
     // average over all x,y,z,t and over all planes of plaquette
     //////////////////////////////////////////////////
     static RealD avgPlaquette(const GaugeLorentz &Umu){
       RealD sumplaq = sumPlaquette(Umu);
       double vol = Umu._grid->gSites();
       double faces = (1.0*Nd*(Nd-1))/2.0;
       return sumplaq/vol/faces/Nc; // Nd , Nc dependent... FIXME
     }

+    //////////////////////////////////////////////////
+    // average over traced single links
+    //////////////////////////////////////////////////
     static RealD linkTrace(const GaugeLorentz &Umu){
       std::vector<GaugeMat> U(4,Umu._grid);

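avgPlaquette normalises the summed plaquette by the global volume, the number of planes Nd(Nd-1)/2 and Nc, so that a unit gauge field gives exactly 1. A quick standalone check of that normalisation (lattice size chosen arbitrarily):

#include <iostream>

// Normalisation used in avgPlaquette above: for U = 1 every plaquette traces to Nc,
// so sumplaq = vol * faces * Nc and the average comes out as exactly 1.
int main() {
  const int Nd = 4, Nc = 3;
  const double vol     = 8.0 * 8 * 8 * 8;                 // example 8^4 lattice
  const double faces   = 1.0 * Nd * (Nd - 1) / 2.0;       // 6 plaquette planes in 4d
  const double sumplaq = vol * faces * Nc;                // value for a unit gauge field
  std::cout << sumplaq / vol / faces / Nc << "\n";        // prints 1
}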
@@ -126,47 +126,6 @@ namespace Grid {
       return p.real()/vol/4.0/3.0;
     };
-    //////////////////////////////////////////////////
-    // the sum over all staples on each site
-    //////////////////////////////////////////////////
-    static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
-
-      GridBase *grid = Umu._grid;
-
-      std::vector<GaugeMat> U(4,grid);
-      for(int d=0;d<Nd;d++){
-        U[d] = PeekIndex<LorentzIndex>(Umu,d);
-      }
-      staple = zero;
-
-      for(int nu=0;nu<Nd;nu++){
-
-        if(nu != mu) {
-
-          // mu
-          // ^
-          // |__>  nu
-
-          //    __
-          //      |
-          //    __|
-          //
-          staple+=Gimpl::ShiftStaple(Gimpl::CovShiftForward (U[nu],nu,
-                                     Gimpl::CovShiftBackward(U[mu],mu,
-                                     Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
-
-          //  __
-          // |
-          // |__
-          //
-          //
-          staple+=Gimpl::ShiftStaple(Gimpl::CovShiftBackward(U[nu],nu,
-                                     Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
-        }
-      }
-    }
-
     //////////////////////////////////////////////////
     // the sum over all staples on each site in direction mu,nu
@@ -210,6 +169,51 @@ namespace Grid {
       }
     }

+    //////////////////////////////////////////////////
+    // the sum over all staples on each site
+    //////////////////////////////////////////////////
+    static void Staple(GaugeMat &staple,const GaugeLorentz &Umu,int mu){
+
+      GridBase *grid = Umu._grid;
+
+      std::vector<GaugeMat> U(Nd,grid);
+      for(int d=0;d<Nd;d++){
+        U[d] = PeekIndex<LorentzIndex>(Umu,d);
+      }
+      staple = zero;
+      GaugeMat tmp(grid);
+
+      for(int nu=0;nu<Nd;nu++){
+
+        if(nu != mu) {
+
+          // mu
+          // ^
+          // |__>  nu
+
+          //    __
+          //      |
+          //    __|
+          //
+          staple+=Gimpl::ShiftStaple(
+                    Gimpl::CovShiftForward (U[nu],nu,
+                    Gimpl::CovShiftBackward(U[mu],mu,
+                    Gimpl::CovShiftIdentityBackward(U[nu],nu))),mu);
+
+          //  __
+          // |
+          // |__
+          //
+          //
+          staple+=Gimpl::ShiftStaple(
+                    Gimpl::CovShiftBackward(U[nu],nu,
+                    Gimpl::CovShiftBackward(U[mu],mu,U[nu])),mu);
+        }
+      }
+    }
+
     //////////////////////////////////////////////////
     // the sum over all staples on each site in direction mu,nu, upper part
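The reinstated Staple accumulates, for every nu != mu, the upper and lower "C"-shaped products around the link U_mu(x), i.e. 2*(Nd-1) = 6 terms per link in four dimensions. A symbolic standalone sketch of the path content, with strings in place of SU(3) matrices (the real code builds the same paths with CovShiftForward/CovShiftBackward as above):

#include <iostream>
#include <string>
#include <vector>

// Enumerate the staple paths around the link U_mu(x) for mu = 0 in 4d.
int main() {
  const int Nd = 4, mu = 0;
  std::vector<std::string> terms;
  for (int nu = 0; nu < Nd; ++nu) {
    if (nu == mu) continue;
    // upper staple: U_nu(x+mu) U_mu^dag(x+nu) U_nu^dag(x)
    terms.push_back("U_" + std::to_string(nu) + "(x+mu) U_" + std::to_string(mu) +
                    "^dag(x+nu) U_" + std::to_string(nu) + "^dag(x)");
    // lower staple: U_nu^dag(x+mu-nu) U_mu^dag(x-nu) U_nu(x-nu)
    terms.push_back("U_" + std::to_string(nu) + "^dag(x+mu-nu) U_" + std::to_string(mu) +
                    "^dag(x-nu) U_" + std::to_string(nu) + "(x-nu)");
  }
  std::cout << terms.size() << " staple terms for mu=0:\n";   // 6 terms
  for (auto& t : terms) std::cout << "  " << t << "\n";
}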
@@ -247,7 +251,6 @@ namespace Grid {



-
     //////////////////////////////////////////////////////
     // Similar to above for rectangle is required
     //////////////////////////////////////////////////////
@@ -276,11 +279,12 @@ namespace Grid {
       }
     }
   }
+
     //////////////////////////////////////////////////
     // sum over all x,y,z,t and over all planes of plaquette
     //////////////////////////////////////////////////
     static RealD sumRectangle(const GaugeLorentz &Umu){
-      std::vector<GaugeMat> U(4,Umu._grid);
+      std::vector<GaugeMat> U(Nd,Umu._grid);

       for(int mu=0;mu<Nd;mu++){
         U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
@@ -406,7 +410,7 @@ namespace Grid {
     static void RectStapleUnoptimised(GaugeMat &Stap,const GaugeLorentz &Umu,int mu){
       GridBase *grid = Umu._grid;

-      std::vector<GaugeMat> U(4,grid);
+      std::vector<GaugeMat> U(Nd,grid);
       for(int d=0;d<Nd;d++){
        U[d] = PeekIndex<LorentzIndex>(Umu,d);
       }
@@ -32,6 +32,40 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <type_traits>

 namespace Grid {
+  // helper function to read space-separated values
+  template <typename T>
+  std::vector<T> strToVec(const std::string s)
+  {
+    std::istringstream sstr(s);
+    T                  buf;
+    std::vector<T>     v;
+
+    while(!sstr.eof())
+    {
+      sstr >> buf;
+      v.push_back(buf);
+    }
+
+    return v;
+  }
+
+  // output to streams for vectors
+  template < class T >
+  inline std::ostream & operator<<(std::ostream &os, const std::vector<T> &v)
+  {
+    os << "[";
+    for (auto &x: v)
+    {
+      os << x << " ";
+    }
+    if (v.size() > 0)
+    {
+      os << "\b";
+    }
+    os << "]";
+
+    return os;
+  }
+
   class Serializable {};

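strToVec splits a space-separated string into a vector of T via an istringstream, and the vector operator<< prints it back in bracketed form. A standalone copy of the helper with a usage example (the input string is invented for illustration):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Standalone copy of the strToVec helper added above.
template <typename T>
std::vector<T> strToVec(const std::string s) {
  std::istringstream sstr(s);
  T buf;
  std::vector<T> v;
  while (!sstr.eof()) { sstr >> buf; v.push_back(buf); }
  return v;
}

int main() {
  auto v = strToVec<int>("4 4 4 8");       // e.g. a lattice geometry read from XML
  for (int x : v) std::cout << x << " ";   // prints: 4 4 4 8
  std::cout << "\n";
}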
@@ -138,23 +172,6 @@ namespace Grid {
     r.read(s, output);
   }

-  template < class T >
-  inline std::ostream& operator << (std::ostream& os, const std::vector<T>& v)
-  {
-    os << "[";
-    for (auto &x: v)
-    {
-      os << x << " ";
-    }
-    if (v.size() > 0)
-    {
-      os << "\b";
-    }
-    os << "]";
-
-    return os;
-  }
-
   // Writer template implementation ////////////////////////////////////////////
   template <typename T>
   Writer<T>::Writer(void)
@@ -120,7 +120,7 @@ THE SOFTWARE.
  \
  \
   template <typename T>\
-  static void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
+  static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
     push(WR,s);\
     GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
     pop(WR);\
@@ -128,14 +128,14 @@ THE SOFTWARE.
  \
  \
   template <typename T>\
-  static void read(Reader<T> &RD,const std::string &s, cname &obj){ \
+  static inline void read(Reader<T> &RD,const std::string &s, cname &obj){ \
     push(RD,s);\
     GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_READ_MEMBER,__VA_ARGS__)) \
     pop(RD);\
   } \
  \
  \
-  friend std::ostream & operator << (std::ostream &os, const cname &obj ) { \
+  friend inline std::ostream & operator << (std::ostream &os, const cname &obj ) { \
     os<<"class "<<#cname<<" {"<<std::endl;\
     GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_OS_WRITE_MEMBER,__VA_ARGS__)) \
     os<<"}"; \
@@ -165,7 +165,7 @@ namespace Grid {
   class EnumIO<name> {\
   public:\
     template <typename T>\
-    static void write(Writer<T> &WR,const std::string &s, const name &obj){ \
+    static inline void write(Writer<T> &WR,const std::string &s, const name &obj){ \
       switch (obj) {\
       GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASE,__VA_ARGS__))\
       default: Grid::write(WR,s,#undefname); break;\
@@ -173,7 +173,7 @@ namespace Grid {
     }\
  \
     template <typename T>\
-    static void read(Reader<T> &RD,const std::string &s, name &obj){ \
+    static inline void read(Reader<T> &RD,const std::string &s, name &obj){ \
       std::string buf;\
       Grid::read(RD, s, buf);\
       if (buf == #undefname) {obj = name::undefname;}\
@@ -182,7 +182,7 @@ namespace Grid {
     }\
   };\
  \
-  std::ostream & operator << (std::ostream &os, const name &obj ) { \
+  inline std::ostream & operator << (std::ostream &os, const name &obj ) { \
     switch (obj) {\
     GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_ENUMCASEIO,__VA_ARGS__))\
     default: os << #undefname; break;\
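The macro-generated write/read/operator<< bodies above live in a header that is included from many translation units, so they are now marked inline to avoid multiple-definition errors at link time. A minimal standalone illustration of the rule (single file, with the multi-file scenario described in the comment; names are hypothetical):

#include <iostream>

// If this function lived in a header included by several .cc files, 'inline' is
// what lets every translation unit carry a copy of the definition without
// violating the one-definition rule at link time. That is why the serialisation
// macros above now emit 'static inline' / 'inline'.
inline void print_banner() { std::cout << "hello from a header\n"; }

int main() { print_banner(); }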
@@ -80,6 +80,20 @@ void XmlReader::pop(void)
   node_ = node_.parent();
 }

+bool XmlReader::nextElement(const std::string &s)
+{
+  if (node_.next_sibling(s.c_str()))
+  {
+    node_ = node_.next_sibling(s.c_str());
+
+    return true;
+  }
+  else
+  {
+    return false;
+  }
+}
+
 template <>
 void XmlReader::readDefault(const string &s, string &output)
 {
@@ -68,6 +68,7 @@ namespace Grid
   virtual ~XmlReader(void) = default;
   void push(const std::string &s);
   void pop(void);
+  bool nextElement(const std::string &s);
   template <typename U>
   void readDefault(const std::string &s, U &output);
   template <typename U>
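XmlReader::nextElement advances the current node to its next sibling of the given name and reports whether one exists, which makes it possible to iterate over repeated XML elements. The call signature (node_.next_sibling(s.c_str())) suggests the reader wraps a pugixml node; assuming that, the equivalent raw pugixml sibling loop looks like the sketch below (the XML content is invented for illustration):

#include <iostream>
#include <pugixml.hpp>

// Iterate over repeated <beta> elements the same way nextElement("beta") would:
// start at the first child of that name and keep taking next_sibling("beta").
int main() {
  pugi::xml_document doc;
  doc.load_string("<params><beta>5.6</beta><beta>5.7</beta><beta>5.8</beta></params>");

  for (pugi::xml_node n = doc.child("params").child("beta"); n; n = n.next_sibling("beta")) {
    std::cout << n.child_value() << "\n";   // 5.6  5.7  5.8
  }
}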
 lib/simd/Avx512Asm.h (1139 lines changed; file diff suppressed because it is too large)
@@ -410,22 +410,22 @@ namespace Optimization {
   struct Permute{

     static inline __m256 Permute0(__m256 in){
-      return _mm256_permute2f128_ps(in,in,0x01);
+      return _mm256_permute2f128_ps(in,in,0x01); //ABCD EFGH -> EFGH ABCD
     };
     static inline __m256 Permute1(__m256 in){
-      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //ABCD EFGH -> CDAB GHEF
     };
     static inline __m256 Permute2(__m256 in){
-      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //ABCD EFGH -> BADC FEHG
     };
     static inline __m256 Permute3(__m256 in){
       return in;
     };

     static inline __m256d Permute0(__m256d in){
-      return _mm256_permute2f128_pd(in,in,0x01);
+      return _mm256_permute2f128_pd(in,in,0x01); //AB CD -> CD AB
     };
-    static inline __m256d Permute1(__m256d in){
+    static inline __m256d Permute1(__m256d in){ //AB CD -> BA DC
       return _mm256_shuffle_pd(in,in,0x5);
     };
     static inline __m256d Permute2(__m256d in){
@@ -437,6 +437,111 @@ namespace Optimization {

     };

+#if defined (AVX2) || defined (AVXFMA4)
+#define _mm256_alignr_epi32(ret,a,b,n) ret=(__m256)  _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*4)%16)
+#define _mm256_alignr_epi64(ret,a,b,n) ret=(__m256d) _mm256_alignr_epi8((__m256i)a,(__m256i)b,(n*8)%16)
+#endif
+
+#if defined (AVX1)
+
+#define _mm256_alignr_epi32(ret,a,b,n) {                               \
+    __m128 aa, bb;                                                     \
+                                                                       \
+    aa  = _mm256_extractf128_ps(a,1);                                  \
+    bb  = _mm256_extractf128_ps(b,1);                                  \
+    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16);   \
+    ret = _mm256_insertf128_ps(ret,aa,1);                              \
+                                                                       \
+    aa  = _mm256_extractf128_ps(a,0);                                  \
+    bb  = _mm256_extractf128_ps(b,0);                                  \
+    aa  = (__m128)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*4)%16);   \
+    ret = _mm256_insertf128_ps(ret,aa,0);                              \
+  }
+
+#define _mm256_alignr_epi64(ret,a,b,n) {                               \
+    __m128d aa, bb;                                                    \
+                                                                       \
+    aa  = _mm256_extractf128_pd(a,1);                                  \
+    bb  = _mm256_extractf128_pd(b,1);                                  \
+    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16);  \
+    ret = _mm256_insertf128_pd(ret,aa,1);                              \
+                                                                       \
+    aa  = _mm256_extractf128_pd(a,0);                                  \
+    bb  = _mm256_extractf128_pd(b,0);                                  \
+    aa  = (__m128d)_mm_alignr_epi8((__m128i)aa,(__m128i)bb,(n*8)%16);  \
+    ret = _mm256_insertf128_pd(ret,aa,0);                              \
+  }
+
+#endif
+
+  inline std::ostream & operator << (std::ostream& stream, const __m256 a)
+  {
+    const float *p=(const float *)&a;
+    stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<","<<p[4]<<","<<p[5]<<","<<p[6]<<","<<p[7]<<"}";
+    return stream;
+  };
+  inline std::ostream & operator<< (std::ostream& stream, const __m256d a)
+  {
+    const double *p=(const double *)&a;
+    stream<< "{"<<p[0]<<","<<p[1]<<","<<p[2]<<","<<p[3]<<"}";
+    return stream;
+  };
+
+  struct Rotate{
+
+    static inline __m256 rotate(__m256 in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      default: assert(0);
+      }
+    }
+    static inline __m256d rotate(__m256d in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      default: assert(0);
+      }
+    }
+
+    template<int n>
+    static inline __m256 tRotate(__m256 in){
+      __m256 tmp = Permute::Permute0(in);
+      __m256 ret;
+      if ( n > 3 ) {
+        _mm256_alignr_epi32(ret,in,tmp,n);
+      } else {
+        _mm256_alignr_epi32(ret,tmp,in,n);
+      }
+      // std::cout << " align epi32 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
+      return ret;
+    };
+
+    template<int n>
+    static inline __m256d tRotate(__m256d in){
+      __m256d tmp = Permute::Permute0(in);
+      __m256d ret;
+      if ( n > 1 ) {
+        _mm256_alignr_epi64(ret,in,tmp,n);
+      } else {
+        _mm256_alignr_epi64(ret,tmp,in,n);
+      }
+      // std::cout << " align epi64 n=" <<n<<" in "<<tmp<<in<<" -> "<< ret <<std::endl;
+      return ret;
+    };
+
+  };
+
   //Complex float Reduce
   template<>
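Rotate::rotate(in,n) cyclically shifts the SIMD lanes so that lane i of the result holds lane (i+n) mod N of the input (this is spelled out element by element in the generic u128f implementation later in the diff); the switch over tRotate<n> exists only because the alignr intrinsics take the shift count as a compile-time immediate. A scalar sketch of the same operation for eight lanes (standalone, no intrinsics):

#include <array>
#include <iostream>

// Lane i of the result is lane (i+n) mod 8 of the input: the scalar meaning of
// Rotate::rotate for an 8-wide single-precision vector.
std::array<float, 8> rotate8(const std::array<float, 8>& in, int n) {
  std::array<float, 8> out{};
  for (int i = 0; i < 8; ++i) out[i] = in[(i + n) % 8];
  return out;
}

int main() {
  std::array<float, 8> v{0, 1, 2, 3, 4, 5, 6, 7};
  for (float x : rotate8(v, 3)) std::cout << x << " ";  // 3 4 5 6 7 0 1 2
  std::cout << "\n";
}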
@@ -39,7 +39,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <immintrin.h>


+namespace Grid{
 namespace Optimization {

   struct Vsplat{
@@ -246,26 +246,30 @@ namespace Optimization {
   struct TimesMinusI{
     //Complex single
     inline __m512 operator()(__m512 in, __m512 ret){
-      __m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
-      return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2)); // 0x4E??
+      //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
+      //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0)); // 0x4E??
+      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
     }
     //Complex double
     inline __m512d operator()(__m512d in, __m512d ret){
-      __m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
-      return _mm512_shuffle_pd(tmp,tmp,0x55);
+      //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
+      //return _mm512_shuffle_pd(tmp,tmp,0x55);
+      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
+      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
     }
   };

   struct TimesI{
     //Complex single
     inline __m512 operator()(__m512 in, __m512 ret){
-      __m512 tmp = _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(1,0,3,2));
-      return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
+      __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
     }
     //Complex double
     inline __m512d operator()(__m512d in, __m512d ret){
-      __m512d tmp = _mm512_shuffle_pd(tmp,tmp,0x55);
-      return _mm512_mask_sub_pd(tmp,0xaa,_mm512_setzero_pd(),tmp);
+      __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
+      return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
     }
   };

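The corrected TimesI/TimesMinusI first swap the real and imaginary lanes of the input (instead of an uninitialised temporary) and then negate one half of each pair: for multiplication by i the negation lands on the even (real) lanes, hence masks 0x5555/0x55, while for multiplication by -i it lands on the odd (imaginary) lanes, masks 0xaaaa/0xaa. A scalar cross-check of that logic against std::complex:

#include <complex>
#include <iostream>

// i*z  swaps (re,im) -> (-im, re); -i*z swaps (re,im) -> (im, -re).
int main() {
  std::complex<double> z(3.0, 4.0);

  std::complex<double> ti(-z.imag(), z.real());   // times  i
  std::complex<double> tmi(z.imag(), -z.real());  // times -i

  std::cout << (ti  == z * std::complex<double>(0,  1)) << "\n";  // 1
  std::cout << (tmi == z * std::complex<double>(0, -1)) << "\n";  // 1
}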
@@ -305,6 +309,54 @@ namespace Optimization {

   };

+  struct Rotate{
+
+    static inline __m512 rotate(__m512 in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      case 8 : return tRotate<8>(in);break;
+      case 9 : return tRotate<9>(in);break;
+      case 10: return tRotate<10>(in);break;
+      case 11: return tRotate<11>(in);break;
+      case 12: return tRotate<12>(in);break;
+      case 13: return tRotate<13>(in);break;
+      case 14: return tRotate<14>(in);break;
+      case 15: return tRotate<15>(in);break;
+      default: assert(0);
+      }
+    }
+    static inline __m512d rotate(__m512d in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      default: assert(0);
+      }
+    }
+
+    template<int n> static inline __m512 tRotate(__m512 in){
+      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
+    };
+
+    template<int n> static inline __m512d tRotate(__m512d in){
+      return (__m512d)_mm512_alignr_epi64((__m512i)in,(__m512i)in,n);
+    };
+
+  };
+
   //////////////////////////////////////////////
   // Some Template specialization
@@ -345,7 +397,7 @@ namespace Optimization {

   //////////////////////////////////////////////////////////////////////////////////////
   // Here assign types
-namespace Grid {
+
   typedef __m512  SIMD_Ftype;  // Single precision type
   typedef __m512d SIMD_Dtype; // Double precision type
   typedef __m512i SIMD_Itype; // Integer type
@@ -35,6 +35,7 @@ Author: neo <cossu@post.kek.jp>
 // Time-stamp: <2015-06-09 14:28:02 neo>
 //----------------------------------------------------------------------

+namespace Grid {
 namespace Optimization {

   template<class vtype>
@@ -54,51 +55,67 @@ namespace Optimization {

   struct Vsplat{
     //Complex float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(float a, float b){
+      u128f out;
+      out.f[0] = a;
+      out.f[1] = b;
+      out.f[2] = a;
+      out.f[3] = b;
+      return out;
     }
     // Real float
-    inline float operator()(float a){
-      return 0;
+    inline u128f operator()(float a){
+      u128f out;
+      out.f[0] = a;
+      out.f[1] = a;
+      out.f[2] = a;
+      out.f[3] = a;
+      return out;
     }
     //Complex double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(double a, double b){
+      u128d out;
+      out.f[0] = a;
+      out.f[1] = b;
+      return out;
     }
     //Real double
-    inline double operator()(double a){
-      return 0;
+    inline u128d operator()(double a){
+      u128d out;
+      out.f[0] = a;
+      out.f[1] = a;
+      return out;
     }
     //Integer
     inline int operator()(Integer a){
-      return 0;
+      return a;
     }
   };

   struct Vstore{
     //Float
-    inline void operator()(float a, float* F){
+    inline void operator()(u128f a, float* F){
+      memcpy(F,a.f,4*sizeof(float));
     }
     //Double
-    inline void operator()(double a, double* D){
+    inline void operator()(u128d a, double* D){
+      memcpy(D,a.f,2*sizeof(double));
     }
     //Integer
     inline void operator()(int a, Integer* I){
+      I[0] = a;
     }

   };

   struct Vstream{
     //Float
-    inline void operator()(float * a, float b){
+    inline void operator()(float * a, u128f b){
+      memcpy(a,b.f,4*sizeof(float));
     }
     //Double
-    inline void operator()(double * a, double b){
+    inline void operator()(double * a, u128d b){
+      memcpy(a,b.f,2*sizeof(double));
     }

@@ -106,24 +123,40 @@ namespace Optimization {

   struct Vset{
     // Complex float
-    inline float operator()(Grid::ComplexF *a){
-      return 0;
+    inline u128f operator()(Grid::ComplexF *a){
+      u128f out;
+      out.f[0] = a[0].real();
+      out.f[1] = a[0].imag();
+      out.f[2] = a[1].real();
+      out.f[3] = a[1].imag();
+      return out;
     }
     // Complex double
-    inline double operator()(Grid::ComplexD *a){
-      return 0;
+    inline u128d operator()(Grid::ComplexD *a){
+      u128d out;
+      out.f[0] = a[0].real();
+      out.f[1] = a[0].imag();
+      return out;
     }
     // Real float
-    inline float operator()(float *a){
-      return 0;
+    inline u128f operator()(float *a){
+      u128f out;
+      out.f[0] = a[0];
+      out.f[1] = a[1];
+      out.f[2] = a[2];
+      out.f[3] = a[3];
+      return out;
     }
     // Real double
-    inline double operator()(double *a){
-      return 0;
+    inline u128d operator()(double *a){
+      u128d out;
+      out.f[0] = a[0];
+      out.f[1] = a[1];
+      return out;
     }
     // Integer
     inline int operator()(Integer *a){
-      return 0;
+      return a[0];
     }

@@ -145,130 +178,279 @@ namespace Optimization {
   /////////////////////////////////////////////////////
   struct Sum{
     //Complex/Real float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0] + b.f[0];
+      out.f[1] = a.f[1] + b.f[1];
+      out.f[2] = a.f[2] + b.f[2];
+      out.f[3] = a.f[3] + b.f[3];
+      return out;
     }
     //Complex/Real double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0] + b.f[0];
+      out.f[1] = a.f[1] + b.f[1];
+      return out;
     }
     //Integer
     inline int operator()(int a, int b){
-      return 0;
+      return a + b;
     }
   };

   struct Sub{
     //Complex/Real float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0] - b.f[0];
+      out.f[1] = a.f[1] - b.f[1];
+      out.f[2] = a.f[2] - b.f[2];
+      out.f[3] = a.f[3] - b.f[3];
+      return out;
     }
     //Complex/Real double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0] - b.f[0];
+      out.f[1] = a.f[1] - b.f[1];
+      return out;
     }
     //Integer
     inline int operator()(int a, int b){
-      return 0;
+      return a-b;
    }
   };

   struct MultComplex{
     // Complex float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
+      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
+      out.f[2] = a.f[2]*b.f[2] - a.f[3]*b.f[3];
+      out.f[3] = a.f[2]*b.f[3] + a.f[3]*b.f[2];
+      return out;
     }
     // Complex double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0]*b.f[0] - a.f[1]*b.f[1];
+      out.f[1] = a.f[0]*b.f[1] + a.f[1]*b.f[0];
+      return out;
     }
   };

   struct Mult{
-    inline float mac(float a, float b,double c){
-      return 0;
-    }
-    inline double mac(double a, double b,double c){
-      return 0;
-    }
+    //CK: Appear unneeded
+    // inline float mac(float a, float b,double c){
+    //   return 0;
+    // }
+    // inline double mac(double a, double b,double c){
+    //   return 0;
+    // }

     // Real float
-    inline float operator()(float a, float b){
-      return 0;
+    inline u128f operator()(u128f a, u128f b){
+      u128f out;
+      out.f[0] = a.f[0]*b.f[0];
+      out.f[1] = a.f[1]*b.f[1];
+      out.f[2] = a.f[2]*b.f[2];
+      out.f[3] = a.f[3]*b.f[3];
+      return out;
     }
     // Real double
-    inline double operator()(double a, double b){
-      return 0;
+    inline u128d operator()(u128d a, u128d b){
+      u128d out;
+      out.f[0] = a.f[0]*b.f[0];
+      out.f[1] = a.f[1]*b.f[1];
+      return out;
     }
     // Integer
     inline int operator()(int a, int b){
-      return 0;
+      return a*b;
     }
   };

   struct Conj{
     // Complex single
-    inline float operator()(float in){
-      return 0;
+    inline u128f operator()(u128f in){
+      u128f out;
+      out.f[0] =  in.f[0];
+      out.f[1] = -in.f[1];
+      out.f[2] =  in.f[2];
+      out.f[3] = -in.f[3];
+      return out;
     }
     // Complex double
-    inline double operator()(double in){
-      return 0;
+    inline u128d operator()(u128d in){
+      u128d out;
+      out.f[0] =  in.f[0];
+      out.f[1] = -in.f[1];
+      return out;
     }
     // do not define for integer input
   };

   struct TimesMinusI{
     //Complex single
-    inline float operator()(float in, float ret){
-      return 0;
+    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
+      u128f out;
+      out.f[0] =  in.f[1];
+      out.f[1] = -in.f[0];
+      out.f[2] =  in.f[3];
+      out.f[3] = -in.f[2];
+      return out;
     }
     //Complex double
-    inline double operator()(double in, double ret){
-      return 0;
+    inline u128d operator()(u128d in, u128d ret){
+      u128d out;
+      out.f[0] =  in.f[1];
+      out.f[1] = -in.f[0];
+      return out;
     }
   };

   struct TimesI{
     //Complex single
-    inline float operator()(float in, float ret){
-      return 0;
+    inline u128f operator()(u128f in, u128f ret){ //note ret is ignored
+      u128f out;
+      out.f[0] = -in.f[1];
+      out.f[1] =  in.f[0];
+      out.f[2] = -in.f[3];
+      out.f[3] =  in.f[2];
+      return out;
     }
     //Complex double
-    inline double operator()(double in, double ret){
-      return 0;
+    inline u128d operator()(u128d in, u128d ret){
+      u128d out;
+      out.f[0] = -in.f[1];
+      out.f[1] =  in.f[0];
+      return out;
     }
   };

|
//////////////////////////////////////////////
|
||||||
// Some Template specialization
|
// Some Template specialization
|
||||||
|
struct Permute{
|
||||||
|
//We just have to mirror the permutes of Grid_sse4.h
|
||||||
|
static inline u128f Permute0(u128f in){ //AB CD -> CD AB
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = in.f[2];
|
||||||
|
out.f[1] = in.f[3];
|
||||||
|
out.f[2] = in.f[0];
|
||||||
|
out.f[3] = in.f[1];
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
static inline u128f Permute1(u128f in){ //AB CD -> BA DC
|
||||||
|
u128f out;
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
out.f[2] = in.f[3];
|
||||||
|
out.f[3] = in.f[2];
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
static inline u128f Permute2(u128f in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline u128f Permute3(u128f in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline u128d Permute0(u128d in){ //AB -> BA
|
||||||
|
u128d out;
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
return out;
|
||||||
|
};
|
||||||
|
static inline u128d Permute1(u128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline u128d Permute2(u128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
static inline u128d Permute3(u128d in){
|
||||||
|
return in;
|
||||||
|
};
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
template < typename vtype >
|
template < typename vtype >
|
||||||
void permute(vtype &a, vtype b, int perm) {
|
void permute(vtype &a, vtype b, int perm) {
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct Rotate{
|
||||||
|
|
||||||
|
static inline u128f rotate(u128f in,int n){
|
||||||
|
u128f out;
|
||||||
|
switch(n){
|
||||||
|
case 0:
|
||||||
|
out.f[0] = in.f[0];
|
||||||
|
out.f[1] = in.f[1];
|
||||||
|
out.f[2] = in.f[2];
|
||||||
|
out.f[3] = in.f[3];
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[2];
|
||||||
|
out.f[2] = in.f[3];
|
||||||
|
out.f[3] = in.f[0];
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
out.f[0] = in.f[2];
|
||||||
|
out.f[1] = in.f[3];
|
||||||
|
out.f[2] = in.f[0];
|
||||||
|
out.f[3] = in.f[1];
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
out.f[0] = in.f[3];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
out.f[2] = in.f[1];
|
||||||
|
out.f[3] = in.f[2];
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
static inline u128d rotate(u128d in,int n){
|
||||||
|
u128d out;
|
||||||
|
switch(n){
|
||||||
|
case 0:
|
||||||
|
out.f[0] = in.f[0];
|
||||||
|
out.f[1] = in.f[1];
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
out.f[0] = in.f[1];
|
||||||
|
out.f[1] = in.f[0];
|
||||||
|
break;
|
||||||
|
default: assert(0);
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
//Complex float Reduce
|
//Complex float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexF Reduce<Grid::ComplexF, float>::operator()(float in){
|
inline Grid::ComplexF Reduce<Grid::ComplexF, u128f>::operator()(u128f in){ //2 complex
|
||||||
return 0;
|
return Grid::ComplexF(in.f[0] + in.f[2], in.f[1] + in.f[3]);
|
||||||
}
|
}
|
||||||
//Real float Reduce
|
//Real float Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealF Reduce<Grid::RealF, float>::operator()(float in){
|
inline Grid::RealF Reduce<Grid::RealF, u128f>::operator()(u128f in){ //4 floats
|
||||||
return 0;
|
return in.f[0] + in.f[1] + in.f[2] + in.f[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//Complex double Reduce
|
//Complex double Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::ComplexD Reduce<Grid::ComplexD, double>::operator()(double in){
|
inline Grid::ComplexD Reduce<Grid::ComplexD, u128d>::operator()(u128d in){ //1 complex
|
||||||
return 0;
|
return Grid::ComplexD(in.f[0],in.f[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
//Real double Reduce
|
//Real double Reduce
|
||||||
template<>
|
template<>
|
||||||
inline Grid::RealD Reduce<Grid::RealD, double>::operator()(double in){
|
inline Grid::RealD Reduce<Grid::RealD, u128d>::operator()(u128d in){ //2 doubles
|
||||||
return 0;
|
return in.f[0] + in.f[1];
|
||||||
}
|
}
|
||||||
|
|
||||||
//Integer Reduce
|
//Integer Reduce
|
||||||
@ -282,10 +464,9 @@ namespace Optimization {
|
|||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Here assign types
|
// Here assign types
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
typedef float SIMD_Ftype; // Single precision type
|
typedef Optimization::u128f SIMD_Ftype; // Single precision type
|
||||||
typedef double SIMD_Dtype; // Double precision type
|
typedef Optimization::u128d SIMD_Dtype; // Double precision type
|
||||||
typedef int SIMD_Itype; // Integer type
|
typedef int SIMD_Itype; // Integer type
|
||||||
|
|
||||||
// prefetch utilities
|
// prefetch utilities
|
||||||
|
@ -36,7 +36,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
|
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
#include <zmmintrin.h>
|
||||||
|
|
||||||
|
namespace Grid{
|
||||||
namespace Optimization {
|
namespace Optimization {
|
||||||
|
|
||||||
struct Vsplat{
|
struct Vsplat{
|
||||||
@@ -316,6 +318,54 @@ namespace Optimization {

   };

+  struct Rotate{
+
+    static inline __m512 rotate(__m512 in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+
+      case 8 : return tRotate<8>(in);break;
+      case 9 : return tRotate<9>(in);break;
+      case 10: return tRotate<10>(in);break;
+      case 11: return tRotate<11>(in);break;
+      case 12: return tRotate<12>(in);break;
+      case 13: return tRotate<13>(in);break;
+      case 14: return tRotate<14>(in);break;
+      case 15: return tRotate<15>(in);break;
+      default: assert(0);
+      }
+    }
+    static inline __m512d rotate(__m512d in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      case 4: return tRotate<4>(in);break;
+      case 5: return tRotate<5>(in);break;
+      case 6: return tRotate<6>(in);break;
+      case 7: return tRotate<7>(in);break;
+      default: assert(0);
+      }
+    }
+
+    template<int n> static inline __m512 tRotate(__m512 in){
+      return (__m512)_mm512_alignr_epi32((__m512i)in,(__m512i)in,n);
+    };
+
+    template<int n> static inline __m512d tRotate(__m512d in){
+      return (__m512d)_mm512_alignr_epi32((__m512i)in,(__m512i)in,2*n);
+    };
+
+  };

   //////////////////////////////////////////////
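(A scalar model of what the new Rotate struct computes; illustrative only. With both operands of _mm512_alignr_epi32 set to the same register, lane i of the output is lane (i+n) mod 16 of the input, and the 2*n in the double-precision tRotate converts a rotation by n 64-bit lanes into the equivalent 32-bit shift count.)

  // out[i] = in[(i + n) % 16]  -- the effect of Rotate::rotate(in,n) on 16 float lanes
  void rotate_model_f32(const float in[16], float out[16], int n) {
    for (int i = 0; i < 16; i++) out[i] = in[(i + n) % 16];
  }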
@@ -358,7 +408,7 @@ namespace Optimization {

  //////////////////////////////////////////////////////////////////////////////////////
  // Here assign types
-namespace Grid {
   typedef __m512  SIMD_Ftype;  // Single precision type
   typedef __m512d SIMD_Dtype;  // Double precision type
   typedef __m512i SIMD_Itype;  // Integer type
@@ -267,10 +267,10 @@ namespace Optimization {
   struct Permute{

     static inline __m128 Permute0(__m128 in){
-      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2));
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(1,0,3,2)); //AB CD -> CD AB
     };
     static inline __m128 Permute1(__m128 in){
-      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
+      return _mm_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //AB CD -> BA DC
     };
     static inline __m128 Permute2(__m128 in){
       return in;
@@ -279,7 +279,7 @@ namespace Optimization {
       return in;
     };

-    static inline __m128d Permute0(__m128d in){
+    static inline __m128d Permute0(__m128d in){ //AB -> BA
       return _mm_shuffle_pd(in,in,0x1);
     };
     static inline __m128d Permute1(__m128d in){
@@ -294,6 +294,32 @@ namespace Optimization {

   };

+  struct Rotate{
+
+    static inline __m128 rotate(__m128 in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      case 2: return tRotate<2>(in);break;
+      case 3: return tRotate<3>(in);break;
+      default: assert(0);
+      }
+    }
+    static inline __m128d rotate(__m128d in,int n){
+      switch(n){
+      case 0: return tRotate<0>(in);break;
+      case 1: return tRotate<1>(in);break;
+      default: assert(0);
+      }
+    }
+
+#define _mm_alignr_epi32(a,b,n) _mm_alignr_epi8(a,b,(n*4)%16)
+#define _mm_alignr_epi64(a,b,n) _mm_alignr_epi8(a,b,(n*8)%16)
+
+    template<int n> static inline __m128  tRotate(__m128  in){ return (__m128)_mm_alignr_epi32((__m128i)in,(__m128i)in,n); };
+    template<int n> static inline __m128d tRotate(__m128d in){ return (__m128d)_mm_alignr_epi64((__m128i)in,(__m128i)in,n); };
+
+  };
   //////////////////////////////////////////////
   // Some Template specialization

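(Same idea on SSE4: _mm_alignr_epi8 shifts the concatenation in:in right by whole bytes, so n 32-bit lanes become 4*n bytes and the %16 keeps the count inside one 128-bit register. An illustrative check -- the values and the namespace qualification are for the example only:)

  #include <smmintrin.h>
  #include <cassert>
  void check_sse_rotate() {
    __m128 v = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
    __m128 r = Optimization::Rotate::rotate(v, 1);   // expect lanes {1,2,3,0}
    float out[4]; _mm_storeu_ps(out, r);
    assert(out[0] == 1.f && out[3] == 0.f);
  }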
@@ -299,16 +299,44 @@ namespace Grid {
     }
     friend inline void permute(Grid_simd &y,Grid_simd b,int perm)
     {
-      if (perm==3) permute3(y,b);
-      else if (perm==2) permute2(y,b);
-      else if (perm==1) permute1(y,b);
-      else if (perm==0) permute0(y,b);
+      if ( perm & RotateBit ) {
+        int dist = perm&0xF;
+        y=rotate(b,dist);
+        return;
+      }
+      switch(perm){
+      case 3: permute3(y,b); break;
+      case 2: permute2(y,b); break;
+      case 1: permute1(y,b); break;
+      case 0: permute0(y,b); break;
+      default: assert(0);
+      }
     }

   };// end of Grid_simd class definition

+  ////////////////////////////////////////////////////////////////////
+  // General rotate
+  ////////////////////////////////////////////////////////////////////
+  template <class S, class V, IfNotComplex<S> =0>
+  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
+  {
+    nrot = nrot % Grid_simd<S,V>::Nsimd();
+    Grid_simd<S,V> ret;
+    //    std::cout << "Rotate Real by "<<nrot<<std::endl;
+    ret.v = Optimization::Rotate::rotate(b.v,nrot);
+    return ret;
+  }
+  template <class S, class V, IfComplex<S> =0>
+  inline Grid_simd<S,V> rotate(Grid_simd<S,V> b,int nrot)
+  {
+    nrot = nrot % Grid_simd<S,V>::Nsimd();
+    Grid_simd<S,V> ret;
+    //    std::cout << "Rotate Complex by "<<nrot<<std::endl;
+    ret.v = Optimization::Rotate::rotate(b.v,2*nrot);
+    return ret;
+  }

   ///////////////////////
   // Splat
   ///////////////////////
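(A hedged reading of the dispatch above: RotateBit and Nsimd() come from elsewhere in Grid and are not shown in this excerpt. When the rotate bit is set, the low four bits of perm carry the rotation distance; otherwise perm selects one of the fixed permute levels. For complex Grid_simd the register interleaves re/im, so rotating by one complex element means rotating by two scalar lanes -- hence the 2*nrot in the IfComplex overload.)

  // Illustrative helper, not part of the library API:
  inline int encode_rotate(int dist) { return RotateBit | (dist & 0xF); }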
@@ -339,6 +367,9 @@ namespace Grid {
   template <class S,class V, IfComplex<S> = 0 > inline void vzero(Grid_simd<S,V> &ret)      { vsplat(ret,S(0.0,0.0)); }// use xor?
   template <class S,class V, IfComplex<S> = 0 > inline void vcomplex_i(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0,1.0));}

+  template <class S,class V, IfComplex<S> = 0 > inline void visign(Grid_simd<S,V> &ret){ vsplat(ret,S(1.0,-1.0));}
+  template <class S,class V, IfComplex<S> = 0 > inline void vrsign(Grid_simd<S,V> &ret){ vsplat(ret,S(-1.0,1.0));}

   // if not complex overload here
   template <class S,class V, IfReal<S> = 0 > inline void vone (Grid_simd<S,V> &ret){ vsplat(ret,S(1.0)); }
   template <class S,class V, IfReal<S> = 0 > inline void vzero(Grid_simd<S,V> &ret){ vsplat(ret,S(0.0)); }
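(Why splats of (1,-1) and (-1,1) are useful: combined with the re/im swap that the Permute/VSHUF machinery already provides, an element-wise sign pattern turns a swap into multiplication by +/- i. A scalar sketch, illustrative only:)

  #include <complex>
  inline std::complex<double> times_i(std::complex<double> z) {
    // swap(re,im) then apply the (-1,+1) sign pattern: (-im, re) == i*z
    return std::complex<double>(-z.imag(), z.real());
  }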
197  lib/simd/Intel512avx.h  Normal file
@@ -0,0 +1,197 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/simd/Avx512Asm.h

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H

////////////////////////////////////////////////////////////
// Knights Landing specials
////////////////////////////////////////////////////////////

#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)

#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)

#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)

#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMULMEMf(O,P,B,Biirr) \
  VMULMEMf(O,P,C,Ciirr) \
  VMULf(tmp,B,Briir) \
  VMULf(tmp,C,Criir)

#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMd(O,P,tmp) \
  VMULMEMd(O,P,B,Biirr) \
  VMULMEMd(O,P,C,Ciirr) \
  VMULd(tmp,B,Briir) \
  VMULd(tmp,C,Criir)

#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMADDMEMf(O,P,B,Biirr) \
  VMADDMEMf(O,P,C,Ciirr) \
  VMADDf(tmp,B,Briir) \
  VMADDf(tmp,C,Criir)

#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
  VSHUFMEMd(O,P,tmp) \
  VMADDMEMd(O,P,B,Biirr) \
  VMADDMEMd(O,P,C,Ciirr) \
  VMADDd(tmp,B,Briir) \
  VMADDd(tmp,C,Criir)

// Merges accumulation for complex dot chain; less efficient under avx512
#define ZEND1f(Criir,Ciirr, tmp) "vshufps $0xb1," #Criir "," #Criir "," #tmp ";\n"\
  "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"

#define ZEND2f(Criir,Ciirr, tmp) "vshufps $0xb1," #Ciirr "," #Ciirr "," #tmp ";\n"\
  "vsubps " #tmp "," #Ciirr "," #Criir"{%k7}" ";\n"

#define ZEND1d(Criir,Ciirr, tmp) "vshufpd $0x55," #Criir "," #Criir "," #tmp ";\n"\
  "vaddps " #tmp "," #Criir "," #Criir"{%k6}" ";\n"

#define ZEND2d(Criir,Ciirr, tmp) "vshufpd $0x55," #Ciirr "," #Ciirr "," #tmp ";\n"\
  "vsubpd " #tmp "," #Ciirr "," #Criir"{%k7};\n" // ri+ir ; ri+ir,rr-ii
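(The ZEND pair finishes a split-accumulator complex multiply: one register gathers the "rr"/"ii" partial products, the other the "ri"/"ir" ones, and the masked add/sub merges them -- the trailing comment reads "ri+ir ; rr-ii". A scalar model of the final combine, with the lane bookkeeping stripped away; the operand layout assumed here is an interpretation, not taken from the diff:)

  #include <complex>
  inline std::complex<double> zend_model(double rr, double ii, double ri, double ir) {
    return std::complex<double>(rr - ii, ri + ir);   // re, im of the complex product
  }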
#define VMOVRDUPd(OFF,A,DEST)    "vpshufd $0x44," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 1,0,3,2
#define VMOVIDUPd(OFF,A,DEST)    "vpshufd $0xee," #OFF "*64(" #A ")," #DEST  ";\n" // 32 bit level: 3,2,3,2
#define VMOVRDUPf(OFF,PTR,DEST)  "vmovsldup " #OFF "*64(" #PTR "), " #DEST  ";\n"
#define VMOVIDUPf(OFF,PTR,DEST)  "vmovshdup " #OFF "*64(" #PTR "), " #DEST  ";\n"

#define VRDUPd(SRC,DEST)  "vpshufd $0x44," #SRC"," #DEST  ";\n" // 32 bit level: 1,0,3,2
#define VRDUPf(SRC,DEST)  "vmovsldup " #SRC ", " #DEST  ";\n"
#define VIDUPd(SRC,DEST)  "vpshufd $0xee," #SRC"," #DEST  ";\n" // 32 bit level: 3,2,3,2
#define VIDUPf(SRC,DEST)  "vmovshdup " #SRC ", " #DEST  ";\n"

#define VBCASTRDUPd(OFF,A,DEST)   "vbroadcastsd (" #OFF "*16+0)(" #A ")," #DEST  ";\n"
#define VBCASTIDUPd(OFF,A,DEST)   "vbroadcastsd (" #OFF "*16+8)(" #A ")," #DEST  ";\n"
#define VBCASTRDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +0)(" #PTR "), " #DEST  ";\n"
#define VBCASTIDUPf(OFF,PTR,DEST) "vbroadcastss (" #OFF "*8 +4)(" #PTR "), " #DEST  ";\n"

#define VMADDSUBf(A,B,accum)      "vfmaddsub231ps " #A "," #B "," #accum  ";\n"
#define VMADDSUBd(A,B,accum)      "vfmaddsub231pd " #A "," #B "," #accum  ";\n"
#define VMADDSUBMEMf(O,P,B,accum) "vfmaddsub231ps " #O"*64("#P "),"#B "," #accum  ";\n"
#define VMADDSUBMEMd(O,P,B,accum) "vfmaddsub231pd " #O"*64("#P "),"#B "," #accum  ";\n"

#define VMADDSUBRDUPf(O,P,B,accum)  "vfmaddsub231ps (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
#define VMADDSUBIDUPf(O,P,B,accum)  "vfmaddsub231ps (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"
#define VMULRDUPf(O,P,B,accum)      "vmulps (" #O"*8+0)("#P "){1to16},"#B "," #accum  ";\n"
#define VMULIDUPf(O,P,B,accum)      "vmulps (" #O"*8+4)("#P "){1to16},"#B "," #accum  ";\n"

#define VMADDSUBRDUPd(O,P,B,accum)  "vfmaddsub231pd (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
#define VMADDSUBIDUPd(O,P,B,accum)  "vfmaddsub231pd (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
#define VMULRDUPd(O,P,B,accum)      "vmulpd (" #O"*16+0)("#P "){1to8},"#B "," #accum  ";\n"
#define VMULIDUPd(O,P,B,accum)      "vmulpd (" #O"*16+8)("#P "){1to8},"#B "," #accum  ";\n"
/*
 * TimesI is used only in the XP recon
 * Could zero the regs and use RECON_ACCUM
 */

#define VTIMESI0f(A,DEST, Z)  VSHUFf(A,DEST)
#define VTIMESI1f(A,DEST, Z)  "vaddps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2f(A,DEST, Z)  "vsubps " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#define VTIMESI0d(A,DEST, Z)  VSHUFd(A,DEST)
#define VTIMESI1d(A,DEST, Z)  "vaddpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESI2d(A,DEST, Z)  "vsubpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#define VTIMESMINUSI0f(A,DEST,Z)  VSHUFf(A,DEST)
#define VTIMESMINUSI1f(A,DEST,Z)  "vsubps " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z)  "vaddps " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#define VTIMESMINUSI0d(A,DEST,Z)  VSHUFd(A,DEST)
#define VTIMESMINUSI1d(A,DEST,Z)  "vsubpd " #DEST "," #Z "," #DEST"{%k6}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z)  "vaddpd " #DEST "," #Z "," #DEST"{%k7}" ";\n"

#if 0

#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp)  "vsubps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp)  "vaddps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp)  "vsubpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp)  "vaddpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#define VACCTIMESI0f(A,ACC,tmp)  VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp)  "vaddps " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp)  "vsubps " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#define VACCTIMESI0d(A,ACC,tmp)  VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp)  "vaddpd " #tmp "," #ACC "," #ACC"{%k6}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp)  "vsubpd " #tmp "," #ACC "," #ACC"{%k7}" ";\n"

#else

// o_p must point to floating 1.0f/d
//
// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i - Ar ; ACC r + Ai
#define VACCTIMESMINUSI0f(A,ACC,tmp)  VSHUFf(A,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp)  VMADDMEMf(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2f(A,ACC,tmp)

#define VACCTIMESMINUSI0d(A,ACC,tmp)  VSHUFd(A,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp)  VMADDMEMd(1,%r10,tmp,ACC)
#define VACCTIMESMINUSI2d(A,ACC,tmp)

// Ai, Ar -> tmp (r i)
// tmp *1.0
// ACC i + Ar ; ACC r - Ai
#define VACCTIMESI0f(A,ACC,tmp)  VSHUFf(A,tmp)
#define VACCTIMESI1f(A,ACC,tmp)  VMADDMEMf(0,%r10,tmp,ACC)
#define VACCTIMESI2f(A,ACC,tmp)

#define VACCTIMESI0d(A,ACC,tmp)  VSHUFd(A,tmp)
#define VACCTIMESI1d(A,ACC,tmp)  VMADDMEMd(0,%r10,tmp,ACC)
#define VACCTIMESI2d(A,ACC,tmp)

#endif

#define VPERM0f(A,B) "vshuff32x4 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1f(A,B) "vshuff32x4 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2f(A,B) "vshufps $0x4e," #A "," #B "," #B ";\n"
#define VPERM3f(A,B) "vshufps $0xb1," #A "," #B "," #B ";\n"

#define VPERM0d(A,B) "vshuff64x2 $0x4e," #A "," #B "," #B ";\n"
#define VPERM1d(A,B) "vshuff64x2 $0xb1," #A "," #B "," #B ";\n"
#define VPERM2d(A,B) "vshufpd $0x55," #A "," #B "," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)

#endif
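(The VMADDSUB* macros wrap vfmaddsub231, the workhorse for interleaved complex arithmetic: even lanes subtract the accumulator, odd lanes add it. A scalar model of the standard Intel semantics, for orientation only:)

  // dst[i] = a[i]*b[i] - dst[i]  for even i
  //        = a[i]*b[i] + dst[i]  for odd  i
  void fmaddsub_model(const float* a, const float* b, float* dst, int n) {
    for (int i = 0; i < n; i++)
      dst[i] = (i % 2 == 0) ? a[i]*b[i] - dst[i] : a[i]*b[i] + dst[i];
  }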
141  lib/simd/Intel512common.h  Normal file
@@ -0,0 +1,141 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
#ifndef GRID_ASM_INTEL_COMMON_512_H
#define GRID_ASM_INTEL_COMMON_512_H

////////////////////////////////////////////////////////////////////////////////////////////////////
// Opcodes common
////////////////////////////////////////////////////////////////////////////////////////////////////
#define MASK_REGS \
  __asm__ ("mov $0xAAAA, %%eax \n"\
           "kmovw %%eax, %%k6 \n"\
           "mov $0x5555, %%eax \n"\
           "kmovw %%eax, %%k7 \n" : : : "%eax");
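(MASK_REGS pins two opmask registers for the whole kernel: k6 = 0xAAAA selects the odd lanes and k7 = 0x5555 the even lanes of a 16-lane register; only the low 8 bits matter for doubles. Assuming re,im are interleaved with the real part in the even lane -- which is how the ZEND comments read -- k6 masks writes to the imaginary parts and k7 to the real parts. A compile-time check of the bit patterns, illustrative only:)

  static_assert(0xAAAA == 0b1010101010101010, "k6: odd lanes");
  static_assert(0x5555 == 0b0101010101010101, "k7: even lanes");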
#define VZEROf(A) "vpxorq " #A "," #A "," #A ";\n"
#define VZEROd(A) "vpxorq " #A "," #A "," #A ";\n"

#define VTIMESIf(A,DEST, Z) \
  VTIMESI0f(A,DEST, Z) \
  VTIMESI1f(A,DEST, Z) \
  VTIMESI2f(A,DEST, Z)

#define VTIMESId(A,DEST, Z) \
  VTIMESI0d(A,DEST, Z) \
  VTIMESI1d(A,DEST, Z) \
  VTIMESI2d(A,DEST, Z)

#define VTIMESMINUSIf(A,DEST, Z) \
  VTIMESMINUSI0f(A,DEST, Z) \
  VTIMESMINUSI1f(A,DEST, Z) \
  VTIMESMINUSI2f(A,DEST, Z)

#define VTIMESMINUSId(A,DEST, Z) \
  VTIMESMINUSI0d(A,DEST, Z) \
  VTIMESMINUSI1d(A,DEST, Z) \
  VTIMESMINUSI2d(A,DEST, Z)

#define VACCTIMESIf(A,ACC,tmp) \
  VACCTIMESI0f(A,ACC,tmp) \
  VACCTIMESI1f(A,ACC,tmp) \
  VACCTIMESI2f(A,ACC,tmp)

#define VACCTIMESId(A,ACC,tmp) \
  VACCTIMESI0d(A,ACC,tmp) \
  VACCTIMESI1d(A,ACC,tmp) \
  VACCTIMESI2d(A,ACC,tmp)

#define VACCTIMESMINUSIf(A,ACC,tmp) \
  VACCTIMESMINUSI0f(A,ACC,tmp) \
  VACCTIMESMINUSI1f(A,ACC,tmp) \
  VACCTIMESMINUSI2f(A,ACC,tmp)

#define VACCTIMESMINUSId(A,ACC,tmp) \
  VACCTIMESMINUSI0d(A,ACC,tmp) \
  VACCTIMESMINUSI1d(A,ACC,tmp) \
  VACCTIMESMINUSI2d(A,ACC,tmp)

#define LOAD64i(A,ptr)  __asm__ ( "movq %0, %" #A : : "r"(ptr) : #A );
#define LOAD64(A,ptr)   LOAD64i(A,ptr)

#define VMOVf(A,DEST)   "vmovaps " #A ", " #DEST ";\n"
#define VMOVd(A,DEST)   "vmovapd " #A ", " #DEST ";\n"

#define VPREFETCHG(O,A) "prefetcht0 "#O"*64("#A");\n"
#define VPREFETCH2(O,A) "prefetcht1 "#O"*64("#A");\n"
#define VPREFETCHW(O,A) "prefetchwt1 "#O"*64("#A");\n"
#define VEVICT(O,A)

//"vprefetche0 "#O"*64("#A");\n" "vprefetche1 ("#O"+12)*64("#A");\n"
//  "clevict0 "#O"*64("#A");\n"

#define VLOADf(OFF,PTR,DEST)   "vmovaps " #OFF "*64(" #PTR "), " #DEST  ";\n"
#define VLOADd(OFF,PTR,DEST)   "vmovapd " #OFF "*64(" #PTR "), " #DEST  ";\n"

#define VADDf(A,B,DEST)        "vaddps " #A "," #B "," #DEST ";\n"
#define VADDd(A,B,DEST)        "vaddpd " #A "," #B "," #DEST ";\n"

#define VSUBf(A,B,DEST)        "vsubps " #A "," #B "," #DEST ";\n"
#define VSUBd(A,B,DEST)        "vsubpd " #A "," #B "," #DEST ";\n"

#define VADDMEMf(O,A,B,DEST)   "vaddps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VADDMEMd(O,A,B,DEST)   "vaddpd "#O"*64("#A ")," #B "," #DEST ";\n"

#define VSUBMEMf(O,A,B,DEST)   "vsubps "#O"*64("#A ")," #B "," #DEST ";\n"
#define VSUBMEMd(O,A,B,DEST)   "vsubpd "#O"*64("#A ")," #B "," #DEST ";\n"

#define VMULf(A,B,DEST)        "vmulps " #A "," #B "," #DEST ";\n"
#define VMULd(A,B,DEST)        "vmulpd " #A "," #B "," #DEST ";\n"

#define VMADDf(A,B,DEST)       "vfmadd231ps " #A "," #B "," #DEST ";\n"
#define VMADDd(A,B,DEST)       "vfmadd231pd " #A "," #B "," #DEST ";\n"

#define VMULMEMf(O,A,B,DEST)   "vmulps " #O"*64("#A ")," #B "," #DEST ";\n"
#define VMULMEMd(O,A,B,DEST)   "vmulpd " #O"*64("#A ")," #B "," #DEST ";\n"

#define VMADDMEMf(O,A,B,DEST)  "vfmadd231ps " #O"*64("#A "),"#B "," #DEST ";\n"
#define VMADDMEMd(O,A,B,DEST)  "vfmadd231pd " #O"*64("#A "),"#B "," #DEST ";\n"

#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define VPREFETCHNTA(O,A)
#define VPREFETCH(O,A)

#define VSTOREf(OFF,PTR,SRC)   "vmovaps " #SRC "," #OFF "*64(" #PTR ")" ";\n"
#define VSTOREd(OFF,PTR,SRC)   "vmovapd " #SRC "," #OFF "*64(" #PTR ")" ";\n"

// Swaps Re/Im ; could unify this with IMCI
#define VSHUFd(A,DEST)         "vpshufd $0x4e," #A "," #DEST ";\n"
#define VSHUFf(A,DEST)         "vpshufd $0xb1," #A "," #DEST ";\n"
#define VSHUFMEMd(OFF,A,DEST)  "vpshufd $0x4e, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 1,0,3,2
#define VSHUFMEMf(OFF,A,DEST)  "vpshufd $0xb1, " #OFF"*64("#A ")," #DEST ";\n" // 32 bit level: 2,3,0,1

#define TRAP " int3 ;\n"

#endif
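(How these pieces are meant to compose: the V* macros expand to assembler string literals, so a kernel builds one __asm__ block by plain string concatenation after LOAD64 has put the data pointer in a scratch register. A toy sketch using only macros defined above -- the macro name and the register choices are illustrative, not part of the library:)

  #define TOY_AXPY_LINE(PTR)            \
    LOAD64(%r8,PTR)                     \
    __asm__ (                           \
      VZEROf(%zmm1)                     \
      VLOADf(0,%r8,%zmm0)               \
      VMADDMEMf(1,%r8,%zmm0,%zmm1)      \
      VSTOREf(2,%r8,%zmm1)              \
    );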
154  lib/simd/Intel512double.h  Normal file
@@ -0,0 +1,154 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
// No guard can be multiply included as undef clearage
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A)             VZEROd(A)
#define VMOV(A,B)            VMOVd(A,B)
#define VLOAD(OFF,PTR,DEST)  VLOADd(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC)  VSTOREd(OFF,PTR,SRC)

#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C)          VADDd(A,B,C)
#define VSUB(A,B,C)          VSUBd(A,B,C)
#define VMUL(Uri,Uir,Chi)    VMULd(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi)   VMADDd(Uri,Uir,Chi)

#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C)       VTIMESId(A,B,C)
#define VTIMESI0(A,B,C)      VTIMESI0d(A,B,C)
#define VTIMESI1(A,B,C)      VTIMESI1d(A,B,C)
#define VTIMESI2(A,B,C)      VTIMESI2d(A,B,C)

#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C)  VTIMESMINUSId(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0d(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1d(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2d(A,B,C)

#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C)    VACCTIMESId(A,B,C)
#define VACCTIMESI0(A,B,C)   VACCTIMESI0d(A,B,C)
#define VACCTIMESI1(A,B,C)   VACCTIMESI1d(A,B,C)
#define VACCTIMESI2(A,B,C)   VACCTIMESI2d(A,B,C)

#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C)  VACCTIMESMINUSId(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0d(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1d(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2d(A,B,C)

#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMd(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMd(A,ACC,O,P)

#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMd(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMd(A,ACC,O,P)

#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0d(A,B)
#define VPERM1(A,B) VPERM1d(A,B)
#define VPERM2(A,B) VPERM2d(A,B)
#define VPERM3(A,B) VPERM3d(A,B)

#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMd(OFF,A,DEST)
#define VADDMEM(O,A,B,C)     VADDMEMd(O,A,B,C)
#define VSUBMEM(O,A,B,C)     VSUBMEMd(O,A,B,C)

#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C)      VMOVIDUPd(A,B,C)
#define VMOVRDUP(A,B,C)      VMOVRDUPd(A,B,C)
#define VMADDSUB(A,B,accum)  VMADDSUBd(A,B,accum)
#define VSHUF(A,B)           VSHUFd(A,B)

#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP

#define ZEND1(A,B,C)     ZEND1d(A,B,C)
#define ZEND2(A,B,C)     ZEND2d(A,B,C)
#define ZLOAD(A,B,C,D)   ZLOADd(A,B,C,D)
#define ZMUL(A,B,C,D,E)  ZMULd(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDd(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM
#define VRDUP(SRC,DEST)           VRDUPd(SRC,DEST)
#define VIDUP(SRC,DEST)           VIDUPd(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum)  VMADDSUBMEMd(O,P,B,accum)
#define VMADDMEM(O,P,B,accum)     VMADDMEMd(O,P,B,accum)
#define VMULMEM(O,P,B,accum)      VMULMEMd(O,P,B,accum)
#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPd(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPd(O,P,B,accum)
#define VMULRDUP(O,P,B,accum)     VMULRDUPd(O,P,B,accum)
#define VMULIDUP(O,P,B,accum)     VMULIDUPd(O,P,B,accum)
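(Intel512single.h, further below, is the same mapping with the ...f expansions, and neither header has an include guard on purpose: a kernel translation unit re-includes them to flip the generic macro names between precisions. A sketch of the intended usage; the include paths are the ones Intel512wilson.h uses later in this diff, and the surrounding kernel code is only indicated:)

  #include <simd/Intel512common.h>
  #include <simd/Intel512avx.h>

  #include <simd/Intel512single.h>   // VADD -> VADDf, VMUL -> VMULf, ...
  // ... emit the single-precision kernel here ...

  #include <simd/Intel512double.h>   // the same names now expand to the ...d forms
  // ... emit the double-precision kernel here ...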
127  lib/simd/Intel512imci.h  Normal file
@@ -0,0 +1,127 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
#ifndef GRID_ASM_AV512_H
#define GRID_ASM_AV512_H

////////////////////////////////////////////////////////////
// Knights Corner specials
////////////////////////////////////////////////////////////

#define ZLOADf(OFF,PTR,ri,ir)  VLOADf(OFF,PTR,ir)  VSHUFf(ir,ri)
#define ZLOADd(OFF,PTR,ri,ir)  VLOADd(OFF,PTR,ir)  VSHUFd(ir,ri)

#define ZMULf(Ari,Air,B,Criir,Ciirr) VMULf(Ari,B,Criir)  VMULf(Air,B,Ciirr)
#define ZMULd(Ari,Air,B,Criir,Ciirr) VMULd(Ari,B,Criir)  VMULd(Air,B,Ciirr)

#define ZMADDf(Ari,Air,B,Criir,Ciirr) VMADDf(Ari,B,Criir) VMADDf(Air,B,Ciirr)
#define ZMADDd(Ari,Air,B,Criir,Ciirr) VMADDd(Ari,B,Criir) VMADDd(Air,B,Ciirr)

#define ZENDf(Criir,Ciirr, tmp) ZEND1f(Criir,Ciirr, tmp) ZEND2f(Criir,Ciirr, tmp)
#define ZENDd(Criir,Ciirr, tmp) ZEND1d(Criir,Ciirr, tmp) ZEND2d(Criir,Ciirr, tmp)

#define ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMULMEMf(O,P,B,Biirr) \
  VMULMEMf(O,P,C,Ciirr) \
  VMULf(tmp,B,Briir) \
  VMULf(tmp,C,Criir)

#define ZMULMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMd(O,P,tmp) \
  VMULMEMd(O,P,B,Biirr) \
  VMULMEMd(O,P,C,Ciirr) \
  VMULd(tmp,B,Briir) \
  VMULd(tmp,C,Criir)

#define ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)\
  VSHUFMEMf(O,P,tmp) \
  VMADDMEMf(O,P,B,Biirr) \
  VMADDMEMf(O,P,C,Ciirr) \
  VMADDf(tmp,B,Briir) \
  VMADDf(tmp,C,Criir)

#define ZMADDMEM2SPd(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) \
  VSHUFMEMd(O,P,tmp) \
  VMADDMEMd(O,P,B,Biirr) \
  VMADDMEMd(O,P,C,Ciirr) \
  VMADDd(tmp,B,Briir) \
  VMADDd(tmp,C,Criir)

#define ZEND1d(Criir,Ciirr, tmp) "vaddpd " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2d(Criir,Ciirr, tmp) "vsubpd " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"

#define ZEND1f(Criir,Ciirr, tmp) "vaddps " #Criir "{cdab} ," #Criir "," #Criir"{%k6}" ";\n"
#define ZEND2f(Criir,Ciirr, tmp) "vsubps " #Ciirr "{cdab} ," #Ciirr "," #Criir"{%k7}" ";\n"

#define VTIMESI0f(A,DEST, Z)
#define VTIMESI1f(A,DEST, Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2f(A,DEST, Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VTIMESI0d(A,DEST, Z)
#define VTIMESI1d(A,DEST, Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESI2d(A,DEST, Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VTIMESMINUSI0f(A,DEST,Z)
#define VTIMESMINUSI1f(A,DEST,Z) "vsubps " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2f(A,DEST,Z) "vaddps " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VTIMESMINUSI0d(A,DEST,Z)
#define VTIMESMINUSI1d(A,DEST,Z) "vsubpd " #A "{cdab}," #Z "," #DEST"{%k7}" ";\n"
#define VTIMESMINUSI2d(A,DEST,Z) "vaddpd " #A "{cdab}," #Z "," #DEST"{%k6}" ";\n"

#define VACCTIMESI0f(A,ACC,tmp)
#define VACCTIMESI1f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

#define VACCTIMESI0d(A,ACC,tmp)
#define VACCTIMESI1d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESI2d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

#define VACCTIMESMINUSI0f(A,ACC,tmp)
#define VACCTIMESMINUSI1f(A,ACC,tmp) "vsubps " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2f(A,ACC,tmp) "vaddps " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

// Acc = Acc - i A
#define VACCTIMESMINUSI0d(A,ACC,tmp)
#define VACCTIMESMINUSI1d(A,ACC,tmp) "vsubpd " #A "{cdab}," #ACC "," #ACC"{%k7}" ";\n"
#define VACCTIMESMINUSI2d(A,ACC,tmp) "vaddpd " #A "{cdab}," #ACC "," #ACC"{%k6}" ";\n"

//((1<<6)|(0<<4)|(3<<2)|(2)) == 0100,1110 = 0x4e
//((2<<6)|(3<<4)|(0<<2)|(1)) == 1011,0001 = 0xb1

#define VPERM0f(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1f(A,B) "vpermf32x4 $0xb1," #A "," #B ";\n"
#define VPERM2f(A,B) "vmovaps " #A "{badc}," #B ";\n"
#define VPERM3f(A,B) "vmovaps " #A "{cdab}," #B ";\n"

#define VPERM0d(A,B) "vpermf32x4 $0x4e," #A "," #B ";\n"
#define VPERM1d(A,B) "vmovapd " #A "{badc}," #B ";\n"
#define VPERM2d(A,B) "vmovapd " #A "{cdab}," #B ";\n"
#define VPERM3d(A,B) VMOVd(A,B)

#endif
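(The immediates quoted in the //((1<<6)|...) comments above do work out; a compile-time verification, illustrative only:)

  static_assert(((1<<6)|(0<<4)|(3<<2)|(2)) == 0x4e, "as annotated above");
  static_assert(((2<<6)|(3<<4)|(0<<2)|(1)) == 0xb1, "as annotated above");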
155  lib/simd/Intel512single.h  Normal file
@@ -0,0 +1,155 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
// No guard can be multiply included as undef clearge of macros
#undef VZERO
#undef VMOV
#undef VLOAD
#undef VSTORE
#define VZERO(A)             VZEROf(A)
#define VMOV(A,B)            VMOVf(A,B)
#define VLOAD(OFF,PTR,DEST)  VLOADf(OFF,PTR,DEST)
#define VSTORE(OFF,PTR,SRC)  VSTOREf(OFF,PTR,SRC)

#undef VADD
#undef VSUB
#undef VMUL
#undef VMADD
#define VADD(A,B,C)          VADDf(A,B,C)
#define VSUB(A,B,C)          VSUBf(A,B,C)
#define VMUL(Uri,Uir,Chi)    VMULf(Uri,Uir,Chi)
#define VMADD(Uri,Uir,Chi)   VMADDf(Uri,Uir,Chi)

#undef VTIMESI
#undef VTIMESI0
#undef VTIMESI1
#undef VTIMESI2
#define VTIMESI(A,B,C)       VTIMESIf(A,B,C)
#define VTIMESI0(A,B,C)      VTIMESI0f(A,B,C)
#define VTIMESI1(A,B,C)      VTIMESI1f(A,B,C)
#define VTIMESI2(A,B,C)      VTIMESI2f(A,B,C)

#undef VTIMESMINUSI
#undef VTIMESMINUSI0
#undef VTIMESMINUSI1
#undef VTIMESMINUSI2
#define VTIMESMINUSI(A,B,C)  VTIMESMINUSIf(A,B,C)
#define VTIMESMINUSI0(A,B,C) VTIMESMINUSI0f(A,B,C)
#define VTIMESMINUSI1(A,B,C) VTIMESMINUSI1f(A,B,C)
#define VTIMESMINUSI2(A,B,C) VTIMESMINUSI2f(A,B,C)

#undef VACCTIMESI
#undef VACCTIMESI0
#undef VACCTIMESI1
#undef VACCTIMESI2
#define VACCTIMESI(A,B,C)    VACCTIMESIf(A,B,C)
#define VACCTIMESI0(A,B,C)   VACCTIMESI0f(A,B,C)
#define VACCTIMESI1(A,B,C)   VACCTIMESI1f(A,B,C)
#define VACCTIMESI2(A,B,C)   VACCTIMESI2f(A,B,C)

#undef VACCTIMESMINUSI
#undef VACCTIMESMINUSI0
#undef VACCTIMESMINUSI1
#undef VACCTIMESMINUSI2
#define VACCTIMESMINUSI(A,B,C)  VACCTIMESMINUSIf(A,B,C)
#define VACCTIMESMINUSI0(A,B,C) VACCTIMESMINUSI0f(A,B,C)
#define VACCTIMESMINUSI1(A,B,C) VACCTIMESMINUSI1f(A,B,C)
#define VACCTIMESMINUSI2(A,B,C) VACCTIMESMINUSI2f(A,B,C)

#undef VACCTIMESI1MEM
#undef VACCTIMESI2MEM
#define VACCTIMESI1MEM(A,ACC,O,P) VACCTIMESI1MEMf(A,ACC,O,P)
#define VACCTIMESI2MEM(A,ACC,O,P) VACCTIMESI2MEMf(A,ACC,O,P)

#undef VACCTIMESMINUSI1MEM
#undef VACCTIMESMINUSI2MEM
#define VACCTIMESMINUSI1MEM(A,ACC,O,P) VACCTIMESMINUSI1MEMf(A,ACC,O,P)
#define VACCTIMESMINUSI2MEM(A,ACC,O,P) VACCTIMESMINUSI2MEMf(A,ACC,O,P)

#undef VPERM0
#undef VPERM1
#undef VPERM2
#undef VPERM3
#define VPERM0(A,B) VPERM0f(A,B)
#define VPERM1(A,B) VPERM1f(A,B)
#define VPERM2(A,B) VPERM2f(A,B)
#define VPERM3(A,B) VPERM3f(A,B)

#undef VSHUFMEM
#undef VADDMEM
#undef VSUBMEM
#define VSHUFMEM(OFF,A,DEST) VSHUFMEMf(OFF,A,DEST)
#define VADDMEM(O,A,B,C)     VADDMEMf(O,A,B,C)
#define VSUBMEM(O,A,B,C)     VSUBMEMf(O,A,B,C)

#undef VMOVIDUP
#undef VMOVRDUP
#undef VMADDSUB
#undef VSHUF
#define VMOVIDUP(A,B,C)      VMOVIDUPf(A,B,C)
#define VMOVRDUP(A,B,C)      VMOVRDUPf(A,B,C)
#define VMADDSUB(A,B,accum)  VMADDSUBf(A,B,accum)
#define VSHUF(A,B)           VSHUFf(A,B)

#undef ZEND1
#undef ZEND2
#undef ZLOAD
#undef ZMUL
#undef ZMADD
#undef ZMULMEM2SP
#undef ZMADDMEM2SP

#define ZEND1(A,B,C)     ZEND1f(A,B,C)
#define ZEND2(A,B,C)     ZEND2f(A,B,C)
#define ZLOAD(A,B,C,D)   ZLOADf(A,B,C,D)
#define ZMUL(A,B,C,D,E)  ZMULf(A,B,C,D,E)
#define ZMADD(A,B,C,D,E) ZMADDf(A,B,C,D,E)
#define ZMULMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)  ZMULMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)
#define ZMADDMEM2SP(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr) ZMADDMEM2SPf(O,P,tmp,B,C,Briir,Biirr,Criir,Ciirr)

#undef VRDUP
#undef VIDUP
#undef VMADDSUBMEM
#undef VMADDMEM
#undef VMULMEM

#define VRDUP(SRC,DEST)           VRDUPf(SRC,DEST)
#define VIDUP(SRC,DEST)           VIDUPf(SRC,DEST)
#define VMADDSUBMEM(O,P,B,accum)  VMADDSUBMEMf(O,P,B,accum)
#define VMADDMEM(O,P,B,accum)     VMADDMEMf(O,P,B,accum)
#define VMULMEM(O,P,B,accum)      VMULMEMf(O,P,B,accum)

#undef VMADDSUBRDUP
#undef VMADDSUBIDUP
#undef VMULRDUP
#undef VMULIDUP
#define VMADDSUBRDUP(O,P,B,accum) VMADDSUBRDUPf(O,P,B,accum)
#define VMADDSUBIDUP(O,P,B,accum) VMADDSUBIDUPf(O,P,B,accum)
#define VMULRDUP(O,P,B,accum)     VMULRDUPf(O,P,B,accum)
#define VMULIDUP(O,P,B,accum)     VMULIDUPf(O,P,B,accum)
849  lib/simd/Intel512wilson.h  Normal file
@@ -0,0 +1,849 @@
/*  (Grid GPL license header, identical to the one at the top of Intel512avx.h above)  */
/* END LEGAL */
#ifndef GRID_ASM_INTEL_512_QCD_H
#define GRID_ASM_INTEL_512_QCD_H

//////////////////////////////////////////////////////////////////////////////////////////
// Register allocations for Wilson Kernel are precision indept
//////////////////////////////////////////////////////////////////////////////////////////
#define result_00 %zmm0
#define result_01 %zmm1
#define result_02 %zmm2

#define result_10 %zmm3
#define result_11 %zmm4
#define result_12 %zmm5

#define result_20 %zmm6
#define result_21 %zmm7
#define result_22 %zmm8

#define result_30 %zmm9
#define result_31 %zmm10
#define result_32 %zmm11

#define Chi_00 %zmm12
#define Chi_01 %zmm13
#define Chi_02 %zmm14

#define Chi_10 %zmm15
#define Chi_11 %zmm16
#define Chi_12 %zmm17

#define UChi_00 %zmm18
#define UChi_01 %zmm19
#define UChi_02 %zmm20

#define UChi_10 %zmm21
#define UChi_11 %zmm22
#define UChi_12 %zmm23

#define Uir %zmm24
#define Uri %zmm25
#define T1 %zmm24
#define T2 %zmm25

#define Z0 %zmm26
#define Z1 %zmm27
#define Z2 %zmm28
#define Z3 %zmm29
#define Z4 %zmm30
#define Z5 %zmm31

#define TMP Chi_00

#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12

#include <simd/Intel512common.h>
#include <simd/Intel512avx.h>

//////////////////////////////////////////////////////////////////
// Macros used to build wilson kernel -- can rationalise and simplify
// a little as some duplication developed during trying different
// variants during optimisation. Could cut back to only those used.
//////////////////////////////////////////////////////////////////

// const SiteSpinor * ptr = & in._odata[offset];
#define LOAD_CHIMU(PTR)  LOAD_CHIMUi(PTR)
#define LOAD_CHI(PTR)    LOAD64(%r8,PTR) __asm__ ( LOAD_CHIi );
#define SAVE_UCHI(PTR)   SAVE_UCHIi(PTR)
#define SAVE_CHI(PTR)    SAVE_CHIi(PTR)
#define SAVE_RESULT(PTR) SAVE_RESULTi(PTR)

#define LOAD_CHIMUi \
  LOAD_CHIMU01i \
  LOAD_CHIMU23i );

#define LOAD_CHIMU01i\
  VLOAD(0,%r8,Chimu_00) \
  VLOAD(1,%r8,Chimu_01) \
  VLOAD(2,%r8,Chimu_02) \
  VLOAD(3,%r8,Chimu_10) \
  VLOAD(4,%r8,Chimu_11) \
  VLOAD(5,%r8,Chimu_12)

#define LOAD_CHIMU23i\
  VLOAD(6,%r8,Chimu_20) \
  VLOAD(7,%r8,Chimu_21) \
  VLOAD(8,%r8,Chimu_22) \
  VLOAD(9,%r8,Chimu_30) \
  VLOAD(10,%r8,Chimu_31) \
  VLOAD(11,%r8,Chimu_32)

#define SHUF_CHIMU23i\
  VSHUFMEM(6,%r8,Chimu_20) \
  VSHUFMEM(7,%r8,Chimu_21) \
  VSHUFMEM(8,%r8,Chimu_22) \
  VSHUFMEM(9,%r8,Chimu_30) \
  VSHUFMEM(10,%r8,Chimu_31) \
  VSHUFMEM(11,%r8,Chimu_32)

// const SiteHalfSpinor *ptr = &buf[offset];

#define LOAD_CHIi \
  VLOAD(0,%r8,Chi_00) \
  VLOAD(1,%r8,Chi_01) \
  VLOAD(2,%r8,Chi_02) \
  VLOAD(3,%r8,Chi_10) \
  VLOAD(4,%r8,Chi_11) \
  VLOAD(5,%r8,Chi_12)

#define SAVE_UCHIi(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    VSTORE(0,%r8,UChi_00) \
    VSTORE(1,%r8,UChi_01) \
    VSTORE(2,%r8,UChi_02) \
    VSTORE(3,%r8,UChi_10) \
    VSTORE(4,%r8,UChi_11) \
    VSTORE(5,%r8,UChi_12) \
  );

#define SAVE_CHIi(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    VSTORE(0,%r8,Chi_00) \
    VSTORE(1,%r8,Chi_01) \
    VSTORE(2,%r8,Chi_02) \
    VSTORE(3,%r8,Chi_10) \
    VSTORE(4,%r8,Chi_11) \
    VSTORE(5,%r8,Chi_12) \
  );

#define SAVE_RESULTi(PTR)\
  LOAD64(%r8,PTR) \
  __asm__ ( \
    VSTORE(0,%r8,result_00) \
    VSTORE(1,%r8,result_01) \
    VSTORE(2,%r8,result_02) \
    VSTORE(3,%r8,result_10) \
    VSTORE(4,%r8,result_11) \
    VSTORE(5,%r8,result_12) \
    VSTORE(6,%r8,result_20) \
    VSTORE(7,%r8,result_21) \
    VSTORE(8,%r8,result_22) \
    VSTORE(9,%r8,result_30) \
    VSTORE(10,%r8,result_31) \
    VSTORE(11,%r8,result_32) \
  );

#define MULT_2SPIN_DIR_PFXP(A,p) MULT_2SPIN_PFXP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYP(A,p) MULT_2SPIN_PFYP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZP(A,p) MULT_2SPIN_PFZP(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTP(A,p) MULT_2SPIN_PFTP(&U._odata[sU](A),p)

#define MULT_2SPIN_DIR_PFXM(A,p) MULT_2SPIN_PFXM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFYM(A,p) MULT_2SPIN_PFYM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFZM(A,p) MULT_2SPIN_PFZM(&U._odata[sU](A),p)
#define MULT_2SPIN_DIR_PFTM(A,p) MULT_2SPIN_PFTM(&U._odata[sU](A),p)

#define MULT_2SPIN_PFXM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTM(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFTP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFZP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFYP(ptr,pf) MULT_2SPIN(ptr,pf)
#define MULT_2SPIN_PFXP(ptr,pf) MULT_2SPIN(ptr,pf)

//////////////////////////////////////////////////////////////////
// Dirac algebra
//////////////////////////////////////////////////////////////////

//      hspin(0)=fspin(0)+timesI(fspin(3));
//      hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
    LOAD_CHIi \
    SHUF_CHIMU23i \
    VACCTIMESI1(Chi_00,Chi_00,Chimu_30) \
    VACCTIMESI1(Chi_01,Chi_01,Chimu_31) \
    VACCTIMESI1(Chi_02,Chi_02,Chimu_32) \
    VACCTIMESI1(Chi_10,Chi_10,Chimu_20) \
    VACCTIMESI1(Chi_11,Chi_11,Chimu_21) \
    VACCTIMESI1(Chi_12,Chi_12,Chimu_22) \
    VACCTIMESI2(Chi_00,Chi_00,Chimu_30) \
    VACCTIMESI2(Chi_01,Chi_01,Chimu_31) \
    VACCTIMESI2(Chi_02,Chi_02,Chimu_32) \
    VACCTIMESI2(Chi_10,Chi_10,Chimu_20) \
    VACCTIMESI2(Chi_11,Chi_11,Chimu_21) \
    VACCTIMESI2(Chi_12,Chi_12,Chimu_22) );

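(The projector macros implement the two-spinor compression quoted in the comments above: hspin(0)=fspin(0)+timesI(fspin(3)), hspin(1)=fspin(1)+timesI(fspin(2)), applied per colour component across the SIMD lanes via VACCTIMESI1/2. A scalar sketch of the X+ case, with illustrative types:)

  #include <complex>
  using Cplx = std::complex<double>;
  inline void xp_project(const Cplx fspin[4], Cplx hspin[2]) {
    const Cplx I(0.0, 1.0);
    hspin[0] = fspin[0] + I * fspin[3];
    hspin[1] = fspin[1] + I * fspin[2];
  }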
|
#define YP_PROJMEM(ptr) \
|
||||||
|
LOAD64(%r8,ptr) \
|
||||||
|
__asm__ ( \
|
||||||
|
LOAD_CHIMU01i \
|
||||||
|
VSUBMEM(9,%r8 ,Chimu_00,Chi_00) \
|
||||||
|
VSUBMEM(10,%r8,Chimu_01,Chi_01) \
|
||||||
|
VSUBMEM(11,%r8,Chimu_02,Chi_02) \
|
||||||
|
VADDMEM(6,%r8,Chimu_10,Chi_10) \
|
||||||
|
VADDMEM(7,%r8,Chimu_11,Chi_11) \
|
||||||
|
VADDMEM(8,%r8,Chimu_12,Chi_12) );
|
||||||
|
|
||||||
|
#define ZP_PROJMEM(PTR) \
|
||||||
|
LOAD64(%r8,PTR) \
|
||||||
|
__asm__ ( \
|
||||||
|
LOAD_CHIi \
|
||||||
|
SHUF_CHIMU23i \
|
||||||
|
VACCTIMESI1(Chi_00,Chi_00,Chimu_20) \
|
||||||
|
VACCTIMESI1(Chi_01,Chi_01,Chimu_21) \
|
||||||
|
VACCTIMESI1(Chi_02,Chi_02,Chimu_22) \
|
||||||
|
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_30) \
|
||||||
|
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_31) \
|
||||||
|
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_32) \
|
||||||
|
VACCTIMESI2(Chi_00,Chi_00,Chimu_20) \
|
||||||
|
VACCTIMESI2(Chi_01,Chi_01,Chimu_21) \
|
||||||
|
VACCTIMESI2(Chi_02,Chi_02,Chimu_22) \
|
||||||
|
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_30) \
|
||||||
|
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_31) \
|
||||||
|
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_32) );
|
||||||
|
|
||||||
|
|
||||||
|
#define TP_PROJMEM(ptr) \
|
||||||
|
LOAD64(%r8,ptr) \
|
||||||
|
__asm__ ( \
|
||||||
|
LOAD_CHIMU01i \
|
||||||
|
VADDMEM(6,%r8 ,Chimu_00,Chi_00) \
|
||||||
|
VADDMEM(7,%r8,Chimu_01,Chi_01) \
|
||||||
|
VADDMEM(8,%r8,Chimu_02,Chi_02) \
|
||||||
|
VADDMEM(9,%r8,Chimu_10,Chi_10) \
|
||||||
|
VADDMEM(10,%r8,Chimu_11,Chi_11) \
|
||||||
|
VADDMEM(11,%r8,Chimu_12,Chi_12) );
|
||||||
|
|
||||||
|
// hspin(0)=fspin(0)-timesI(fspin(3))
|
||||||
|
// hspin(1)=fspin(1)-timesI(fspin(2))
|
||||||
|
|
||||||
|
#define XM_PROJMEM(PTR) \
|
||||||
|
LOAD64(%r8,PTR)\
|
||||||
|
__asm__ ( \
|
||||||
|
SHUF_CHIMU23i \
|
||||||
|
LOAD_CHIi \
|
||||||
|
VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_30)\
|
||||||
|
VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_31)\
|
||||||
|
VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_32)\
|
||||||
|
VACCTIMESMINUSI1(Chi_10,Chi_10,Chimu_20)\
|
||||||
|
VACCTIMESMINUSI1(Chi_11,Chi_11,Chimu_21)\
|
||||||
|
VACCTIMESMINUSI1(Chi_12,Chi_12,Chimu_22)\
|
||||||
|
VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_30)\
|
||||||
|
VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_31)\
|
||||||
|
VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_32)\
|
||||||
|
VACCTIMESMINUSI2(Chi_10,Chi_10,Chimu_20)\
|
||||||
|
VACCTIMESMINUSI2(Chi_11,Chi_11,Chimu_21)\
|
||||||
|
VACCTIMESMINUSI2(Chi_12,Chi_12,Chimu_22) );
|
||||||
|
|
#define YM_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
      LOAD_CHIMU01i \
      VADDMEM(9,%r8,Chimu_00,Chi_00) \
      VADDMEM(10,%r8,Chimu_01,Chi_01) \
      VADDMEM(11,%r8,Chimu_02,Chi_02) \
      VSUBMEM(6,%r8,Chimu_10,Chi_10) \
      VSUBMEM(7,%r8,Chimu_11,Chi_11) \
      VSUBMEM(8,%r8,Chimu_12,Chi_12) );

#define ZM_PROJMEM(PTR) \
  LOAD64(%r8,PTR) \
  __asm__ ( \
      SHUF_CHIMU23i \
      LOAD_CHIi \
      VACCTIMESMINUSI1(Chi_00,Chi_00,Chimu_20) \
      VACCTIMESMINUSI1(Chi_01,Chi_01,Chimu_21) \
      VACCTIMESMINUSI1(Chi_02,Chi_02,Chimu_22) \
      VACCTIMESI1(Chi_10,Chi_10,Chimu_30) \
      VACCTIMESI1(Chi_11,Chi_11,Chimu_31) \
      VACCTIMESI1(Chi_12,Chi_12,Chimu_32) \
      VACCTIMESMINUSI2(Chi_00,Chi_00,Chimu_20) \
      VACCTIMESMINUSI2(Chi_01,Chi_01,Chimu_21) \
      VACCTIMESMINUSI2(Chi_02,Chi_02,Chimu_22) \
      VACCTIMESI2(Chi_10,Chi_10,Chimu_30) \
      VACCTIMESI2(Chi_11,Chi_11,Chimu_31) \
      VACCTIMESI2(Chi_12,Chi_12,Chimu_32) );

#define TM_PROJMEM(ptr) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
      LOAD_CHIMU01i \
      VSUBMEM(6,%r8,Chimu_00,Chi_00) \
      VSUBMEM(7,%r8,Chimu_01,Chi_01) \
      VSUBMEM(8,%r8,Chimu_02,Chi_02) \
      VSUBMEM(9,%r8,Chimu_10,Chi_10) \
      VSUBMEM(10,%r8,Chimu_11,Chi_11) \
      VSUBMEM(11,%r8,Chimu_12,Chi_12) );

// fspin(0)=hspin(0)
// fspin(1)=hspin(1)
// fspin(2)=timesMinusI(hspin(1))
// fspin(3)=timesMinusI(hspin(0))
#define XP_RECON __asm__ ( \
  VZERO(TMP) \
  VTIMESMINUSI0(UChi_00,result_30,TMP) \
  VTIMESMINUSI0(UChi_10,result_20,TMP) \
  VTIMESMINUSI0(UChi_01,result_31,TMP) \
  VTIMESMINUSI0(UChi_11,result_21,TMP) \
  VTIMESMINUSI0(UChi_02,result_32,TMP) \
  VTIMESMINUSI0(UChi_12,result_22,TMP) \
  VMOV(UChi_00,result_00) \
  VMOV(UChi_10,result_10) \
  VMOV(UChi_01,result_01) \
  VMOV(UChi_11,result_11) \
  VMOV(UChi_02,result_02) \
  VMOV(UChi_12,result_12) \
  VTIMESMINUSI1(UChi_10,result_20,TMP) \
  VTIMESMINUSI1(UChi_11,result_21,TMP) \
  VTIMESMINUSI1(UChi_12,result_22,TMP) \
  VTIMESMINUSI1(UChi_00,result_30,TMP) \
  VTIMESMINUSI1(UChi_01,result_31,TMP) \
  VTIMESMINUSI1(UChi_02,result_32,TMP) \
  VTIMESMINUSI2(UChi_10,result_20,TMP) \
  VTIMESMINUSI2(UChi_11,result_21,TMP) \
  VTIMESMINUSI2(UChi_12,result_22,TMP) \
  VTIMESMINUSI2(UChi_00,result_30,TMP) \
  VTIMESMINUSI2(UChi_01,result_31,TMP) \
  VTIMESMINUSI2(UChi_02,result_32,TMP) \
  );
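The four comment lines above describe the X+ reconstruction that XP_RECON performs lane by lane. A scalar sketch per colour component, with illustrative types only (not the Grid API):

#include <array>
#include <complex>

// fspin(0)=hspin(0); fspin(1)=hspin(1);
// fspin(2)=timesMinusI(hspin(1)); fspin(3)=timesMinusI(hspin(0))
inline void spReconXpSketch(std::array<std::complex<double>,4> &f,
                            const std::array<std::complex<double>,2> &h) {
  auto timesMinusI = [](const std::complex<double> &z) {
    return std::complex<double>(z.imag(), -z.real());
  };
  f[0] = h[0];
  f[1] = h[1];
  f[2] = timesMinusI(h[1]);
  f[3] = timesMinusI(h[0]);
}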
// NB could save 6 ops using addsub => 12 cycles
#define XP_RECON_ACCUM __asm__ ( \
  VZERO(TMP) \
  VACCTIMESMINUSI0(UChi_00,result_30,Z3) \
  VACCTIMESMINUSI0(UChi_10,result_20,Z0) \
  VACCTIMESMINUSI0(UChi_01,result_31,Z4) \
  VACCTIMESMINUSI0(UChi_11,result_21,Z1) \
  VACCTIMESMINUSI0(UChi_02,result_32,Z5) \
  VACCTIMESMINUSI0(UChi_12,result_22,Z2) \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VACCTIMESMINUSI1(UChi_00,result_30,Z3) \
  VACCTIMESMINUSI1(UChi_10,result_20,Z0) \
  VACCTIMESMINUSI1(UChi_01,result_31,Z4) \
  VACCTIMESMINUSI1(UChi_11,result_21,Z1) \
  VACCTIMESMINUSI1(UChi_02,result_32,Z5) \
  VACCTIMESMINUSI1(UChi_12,result_22,Z2) \
  VACCTIMESMINUSI2(UChi_10,result_20,Z0) \
  VACCTIMESMINUSI2(UChi_11,result_21,Z1) \
  VACCTIMESMINUSI2(UChi_12,result_22,Z2) \
  VACCTIMESMINUSI2(UChi_00,result_30,Z3) \
  VACCTIMESMINUSI2(UChi_01,result_31,Z4) \
  VACCTIMESMINUSI2(UChi_02,result_32,Z5) \
  );

#define XM_RECON __asm__ ( \
  VZERO(TMP) \
  VTIMESI0(UChi_00,result_30,TMP) \
  VTIMESI0(UChi_10,result_20,TMP) \
  VTIMESI0(UChi_01,result_31,TMP) \
  VTIMESI0(UChi_11,result_21,TMP) \
  VTIMESI0(UChi_02,result_32,TMP) \
  VTIMESI0(UChi_12,result_22,TMP) \
  VMOV(UChi_00,result_00) \
  VMOV(UChi_10,result_10) \
  VMOV(UChi_01,result_01) \
  VMOV(UChi_11,result_11) \
  VMOV(UChi_02,result_02) \
  VMOV(UChi_12,result_12) \
  VTIMESI1(UChi_00,result_30,TMP) \
  VTIMESI1(UChi_10,result_20,TMP) \
  VTIMESI1(UChi_01,result_31,TMP) \
  VTIMESI1(UChi_11,result_21,TMP) \
  VTIMESI1(UChi_02,result_32,TMP) \
  VTIMESI1(UChi_12,result_22,TMP) \
  VTIMESI2(UChi_10,result_20,TMP) \
  VTIMESI2(UChi_11,result_21,TMP) \
  VTIMESI2(UChi_12,result_22,TMP) \
  VTIMESI2(UChi_00,result_30,TMP) \
  VTIMESI2(UChi_01,result_31,TMP) \
  VTIMESI2(UChi_02,result_32,TMP) \
  );

#define XM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_10,result_20,Z0) \
  VACCTIMESI0(UChi_00,result_30,Z3) \
  VACCTIMESI0(UChi_11,result_21,Z1) \
  VACCTIMESI0(UChi_01,result_31,Z4) \
  VACCTIMESI0(UChi_12,result_22,Z2) \
  VACCTIMESI0(UChi_02,result_32,Z5) \
  \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_12,result_12,result_12) \
  VADD(UChi_02,result_02,result_02) \
  \
  VACCTIMESI1(UChi_10,result_20,Z0) \
  VACCTIMESI1(UChi_00,result_30,Z3) \
  VACCTIMESI1(UChi_11,result_21,Z1) \
  VACCTIMESI1(UChi_01,result_31,Z4) \
  VACCTIMESI1(UChi_12,result_22,Z2) \
  VACCTIMESI1(UChi_02,result_32,Z5) \
  VACCTIMESI2(UChi_10,result_20,Z0) \
  VACCTIMESI2(UChi_11,result_21,Z1) \
  VACCTIMESI2(UChi_12,result_22,Z2) \
  VACCTIMESI2(UChi_00,result_30,Z3) \
  VACCTIMESI2(UChi_01,result_31,Z4) \
  VACCTIMESI2(UChi_02,result_32,Z5) \
  );
#define YP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VADD(UChi_10,result_20,result_20) \
  VADD(UChi_11,result_21,result_21) \
  VADD(UChi_12,result_22,result_22) \
  VSUB(UChi_00,result_30,result_30) \
  VSUB(UChi_01,result_31,result_31) \
  VSUB(UChi_02,result_32,result_32) );

#define YM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VSUB(UChi_10,result_20,result_20) \
  VSUB(UChi_11,result_21,result_21) \
  VSUB(UChi_12,result_22,result_22) \
  VADD(UChi_00,result_30,result_30) \
  VADD(UChi_01,result_31,result_31) \
  VADD(UChi_02,result_32,result_32) );

#define ZP_RECON_ACCUM __asm__ ( \
  VACCTIMESMINUSI0(UChi_00,result_20,Z0) \
  VACCTIMESI0(UChi_10,result_30,Z3) \
  VACCTIMESMINUSI0(UChi_01,result_21,Z1) \
  VACCTIMESI0(UChi_11,result_31,Z4) \
  VACCTIMESMINUSI0(UChi_02,result_22,Z2) \
  VACCTIMESI0(UChi_12,result_32,Z5) \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VACCTIMESMINUSI1(UChi_00,result_20,Z0) \
  VACCTIMESI1(UChi_10,result_30,Z3) \
  VACCTIMESMINUSI1(UChi_01,result_21,Z1) \
  VACCTIMESI1(UChi_11,result_31,Z4) \
  VACCTIMESMINUSI1(UChi_02,result_22,Z2) \
  VACCTIMESI1(UChi_12,result_32,Z5) \
  VACCTIMESMINUSI2(UChi_00,result_20,Z0) \
  VACCTIMESMINUSI2(UChi_01,result_21,Z1) \
  VACCTIMESMINUSI2(UChi_02,result_22,Z2) \
  VACCTIMESI2(UChi_10,result_30,Z3) \
  VACCTIMESI2(UChi_11,result_31,Z4) \
  VACCTIMESI2(UChi_12,result_32,Z5) \
  );

#define ZM_RECON_ACCUM __asm__ ( \
  VACCTIMESI0(UChi_00,result_20,Z0) \
  VACCTIMESMINUSI0(UChi_10,result_30,Z3) \
  VACCTIMESI0(UChi_01,result_21,Z1) \
  VACCTIMESMINUSI0(UChi_11,result_31,Z4) \
  VACCTIMESI0(UChi_02,result_22,Z2) \
  VACCTIMESMINUSI0(UChi_12,result_32,Z5) \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VACCTIMESI1(UChi_00,result_20,Z0) \
  VACCTIMESMINUSI1(UChi_10,result_30,Z3) \
  VACCTIMESI1(UChi_01,result_21,Z1) \
  VACCTIMESMINUSI1(UChi_11,result_31,Z4) \
  VACCTIMESI1(UChi_02,result_22,Z2) \
  VACCTIMESMINUSI1(UChi_12,result_32,Z5) \
  VACCTIMESI2(UChi_00,result_20,Z0) \
  VACCTIMESI2(UChi_01,result_21,Z1) \
  VACCTIMESI2(UChi_02,result_22,Z2) \
  VACCTIMESMINUSI2(UChi_10,result_30,Z3) \
  VACCTIMESMINUSI2(UChi_11,result_31,Z4) \
  VACCTIMESMINUSI2(UChi_12,result_32,Z5) \
  );

#define TP_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VADD(UChi_00,result_20,result_20) \
  VADD(UChi_10,result_30,result_30) \
  VADD(UChi_01,result_21,result_21) \
  VADD(UChi_11,result_31,result_31) \
  VADD(UChi_02,result_22,result_22) \
  VADD(UChi_12,result_32,result_32) );

#define TM_RECON_ACCUM __asm__ ( \
  VADD(UChi_00,result_00,result_00) \
  VADD(UChi_10,result_10,result_10) \
  VADD(UChi_01,result_01,result_01) \
  VADD(UChi_11,result_11,result_11) \
  VADD(UChi_02,result_02,result_02) \
  VADD(UChi_12,result_12,result_12) \
  VSUB(UChi_00,result_20,result_20) \
  VSUB(UChi_10,result_30,result_30) \
  VSUB(UChi_01,result_21,result_21) \
  VSUB(UChi_11,result_31,result_31) \
  VSUB(UChi_02,result_22,result_22) \
  VSUB(UChi_12,result_32,result_32) );

#define PREFETCH_CHIMU(A) \
  LOAD64(%r9,A) \
  __asm__ ( \
  VPREFETCHG(12,%r9) \
  VPREFETCHG(13,%r9) \
  VPREFETCHG(14,%r9) \
  VPREFETCHG(15,%r9) \
  VPREFETCHG(16,%r9) \
  VPREFETCHG(17,%r9) \
  VPREFETCHG(18,%r9) \
  VPREFETCHG(19,%r9) \
  VPREFETCHG(20,%r9) \
  VPREFETCHG(21,%r9) \
  VPREFETCHG(22,%r9) \
  VPREFETCHG(23,%r9) );

#define PERMUTE_DIR0 __asm__ ( \
  VPERM0(Chi_00,Chi_00) \
  VPERM0(Chi_01,Chi_01) \
  VPERM0(Chi_02,Chi_02) \
  VPERM0(Chi_10,Chi_10) \
  VPERM0(Chi_11,Chi_11) \
  VPERM0(Chi_12,Chi_12) );

#define PERMUTE_DIR1 __asm__ ( \
  VPERM1(Chi_00,Chi_00) \
  VPERM1(Chi_01,Chi_01) \
  VPERM1(Chi_02,Chi_02) \
  VPERM1(Chi_10,Chi_10) \
  VPERM1(Chi_11,Chi_11) \
  VPERM1(Chi_12,Chi_12) );

#define PERMUTE_DIR2 __asm__ ( \
  VPERM2(Chi_00,Chi_00) \
  VPERM2(Chi_01,Chi_01) \
  VPERM2(Chi_02,Chi_02) \
  VPERM2(Chi_10,Chi_10) \
  VPERM2(Chi_11,Chi_11) \
  VPERM2(Chi_12,Chi_12) );

#define PERMUTE_DIR3 __asm__ ( \
  VPERM3(Chi_00,Chi_00) \
  VPERM3(Chi_01,Chi_01) \
  VPERM3(Chi_02,Chi_02) \
  VPERM3(Chi_10,Chi_10) \
  VPERM3(Chi_11,Chi_11) \
  VPERM3(Chi_12,Chi_12) );
#define MULT_ADDSUB_2SPIN(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VPREFETCH2(9,%r8) \
  VPREFETCH2(10,%r8) \
  VPREFETCH2(11,%r8) \
  VPREFETCH2(12,%r8) \
  VPREFETCH2(13,%r8) \
  VPREFETCH2(14,%r8) \
  VPREFETCH2(15,%r8) \
  VPREFETCH2(16,%r8) \
  VPREFETCH2(17,%r8) \
  VSHUF(Chi_00,T1) \
  VMOVIDUP(0,%r8,Z0) \
  VMOVIDUP(3,%r8,Z1) \
  VMOVIDUP(6,%r8,Z2) VSHUF(Chi_10,T2) \
  /*6*/ \
  VMUL(Z0,T1,UChi_00) VMOVRDUP(0,%r8,Z3) \
  VMUL(Z0,T2,UChi_10) VMOVRDUP(3,%r8,Z4) \
  VMUL(Z1,T1,UChi_01) VMOVRDUP(6,%r8,Z5) \
  VMUL(Z1,T2,UChi_11) VMOVIDUP(1,%r8,Z0) \
  VMUL(Z2,T1,UChi_02) VMOVIDUP(4,%r8,Z1) \
  VMUL(Z2,T2,UChi_12) VMOVIDUP(7,%r8,Z2) \
  VPREFETCHG(0,%r9) \
  VPREFETCHG(1,%r9) \
  VPREFETCHG(2,%r9) \
  VPREFETCHG(3,%r9) \
  /*18*/ \
  VMADDSUB(Z3,Chi_00,UChi_00) VSHUF(Chi_01,T1) \
  VMADDSUB(Z3,Chi_10,UChi_10) \
  VMADDSUB(Z4,Chi_00,UChi_01) VMOVRDUP(1,%r8,Z3) \
  VMADDSUB(Z4,Chi_10,UChi_11) VSHUF(Chi_11,T2) \
  VMADDSUB(Z5,Chi_00,UChi_02) VMOVRDUP(4,%r8,Z4) \
  VMADDSUB(Z5,Chi_10,UChi_12) \
  VPREFETCHG(4,%r9) \
  VPREFETCHG(5,%r9) \
  VPREFETCHG(6,%r9) \
  VPREFETCHG(7,%r9) \
  /*28*/ \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(7,%r8,Z5) \
  VMADDSUB(Z0,T2,UChi_10) \
  VMADDSUB(Z1,T1,UChi_01) VMOVIDUP(2,%r8,Z0) \
  VMADDSUB(Z1,T2,UChi_11) \
  VMADDSUB(Z2,T1,UChi_02) VMOVIDUP(5,%r8,Z1) \
  VMADDSUB(Z2,T2,UChi_12) VMOVIDUP(8,%r8,Z2) \
  VPREFETCH2(12,%r9) \
  VPREFETCH2(13,%r9) \
  VPREFETCH2(14,%r9) \
  VPREFETCH2(15,%r9) \
  VPREFETCH2(16,%r9) \
  VPREFETCH2(17,%r9) \
  VPREFETCH2(18,%r9) \
  VPREFETCH2(19,%r9) \
  VPREFETCH2(20,%r9) \
  VPREFETCH2(21,%r9) \
  VPREFETCH2(22,%r9) \
  VPREFETCH2(23,%r9) \
  /*38*/ \
  VMADDSUB(Z3,Chi_01,UChi_00) VSHUF(Chi_02,T1) \
  VMADDSUB(Z3,Chi_11,UChi_10) \
  VMADDSUB(Z4,Chi_01,UChi_01) VMOVRDUP(2,%r8,Z3) \
  VMADDSUB(Z4,Chi_11,UChi_11) VSHUF(Chi_12,T2) \
  VMADDSUB(Z5,Chi_01,UChi_02) VMOVRDUP(5,%r8,Z4) \
  VMADDSUB(Z5,Chi_11,UChi_12) \
  VPREFETCHG(9,%r8) \
  VPREFETCHG(10,%r8) \
  VPREFETCHG(11,%r8) \
  VPREFETCHG(12,%r8) \
  VPREFETCHG(13,%r8) \
  VPREFETCHG(14,%r8) \
  VPREFETCHG(15,%r8) \
  VPREFETCHG(16,%r8) \
  VPREFETCHG(17,%r8) \
  /*48*/ \
  VMADDSUB(Z0,T1,UChi_00) VMOVRDUP(8,%r8,Z5) \
  VMADDSUB(Z0,T2,UChi_10) \
  VMADDSUB(Z1,T1,UChi_01) \
  VMADDSUB(Z1,T2,UChi_11) \
  VMADDSUB(Z2,T1,UChi_02) \
  VMADDSUB(Z2,T2,UChi_12) \
  VPREFETCHG(8,%r9) \
  VPREFETCHG(9,%r9) \
  VPREFETCHG(10,%r9) \
  VPREFETCHG(11,%r9) \
  /*55*/ \
  VMADDSUB(Z3,Chi_02,UChi_00) \
  VMADDSUB(Z3,Chi_12,UChi_10) \
  VMADDSUB(Z4,Chi_02,UChi_01) \
  VMADDSUB(Z4,Chi_12,UChi_11) \
  VMADDSUB(Z5,Chi_02,UChi_02) \
  VMADDSUB(Z5,Chi_12,UChi_12) \
  /*61 insns*/ );

#define MULT_ADDSUB_2SPIN_LS(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
  VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
  VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
  VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
  VPREFETCHG(0,%r9) \
  VPREFETCHG(1,%r9) \
  VPREFETCHG(2,%r9) \
  VPREFETCHG(3,%r9) \
  /*8*/ \
  VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  VPREFETCHG(4,%r9) \
  VPREFETCHG(5,%r9) \
  VPREFETCHG(6,%r9) \
  VPREFETCHG(7,%r9) \
  /*16*/ \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
  VPREFETCHG(8,%r9) \
  VPREFETCHG(9,%r9) \
  VPREFETCHG(10,%r9) \
  VPREFETCHG(11,%r9) \
  /*22*/ \
  VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  VPREFETCH2(12,%r9) \
  VPREFETCH2(13,%r9) \
  VPREFETCH2(14,%r9) \
  VPREFETCH2(15,%r9) \
  /*30*/ \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
  VPREFETCH2(16,%r9) \
  VPREFETCH2(17,%r9) \
  VPREFETCH2(18,%r9) \
  VPREFETCH2(19,%r9) \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
  /*36*/ \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  VPREFETCH2(20,%r9) \
  VPREFETCH2(21,%r9) \
  VPREFETCH2(22,%r9) \
  VPREFETCH2(23,%r9) \
  VPREFETCHG(2,%r8) \
  VPREFETCHG(3,%r8) \
  VPREFETCH2(4,%r8) \
  VPREFETCH2(5,%r8) \
  /*42 insns*/ );

#define MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf) \
  LOAD64(%r8,ptr) \
  LOAD64(%r9,pf) \
  __asm__ ( \
  VSHUF(Chi_00,T1) VSHUF(Chi_10,T2) \
  VMULIDUP(0,%r8,T1,UChi_00) VMULIDUP(0,%r8,T2,UChi_10) \
  VMULIDUP(3,%r8,T1,UChi_01) VMULIDUP(3,%r8,T2,UChi_11) \
  VMULIDUP(6,%r8,T1,UChi_02) VMULIDUP(6,%r8,T2,UChi_12) \
  /*8*/ \
  VSHUF(Chi_01,T1) VSHUF(Chi_11,T2) \
  VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r8,Chi_10,UChi_10) \
  VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r8,Chi_10,UChi_11) \
  VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r8,Chi_10,UChi_12) \
  /*16*/ \
  VMADDSUBIDUP(1,%r8,T1,UChi_00) VMADDSUBIDUP(1,%r8,T2,UChi_10) \
  VMADDSUBIDUP(4,%r8,T1,UChi_01) VMADDSUBIDUP(4,%r8,T2,UChi_11) \
  VMADDSUBIDUP(7,%r8,T1,UChi_02) VMADDSUBIDUP(7,%r8,T2,UChi_12) \
  /*22*/ \
  VSHUF(Chi_02,T1) VSHUF(Chi_12,T2) \
  VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r8,Chi_11,UChi_10) \
  VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r8,Chi_11,UChi_11) \
  VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r8,Chi_11,UChi_12) \
  /*30*/ \
  VMADDSUBIDUP(2,%r8,T1,UChi_00) VMADDSUBIDUP(2,%r8,T2,UChi_10) \
  VMADDSUBIDUP(5,%r8,T1,UChi_01) VMADDSUBIDUP(5,%r8,T2,UChi_11) \
  VMADDSUBIDUP(8,%r8,T1,UChi_02) VMADDSUBIDUP(8,%r8,T2,UChi_12) \
  /*36*/ \
  VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r8,Chi_12,UChi_10) \
  VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r8,Chi_12,UChi_11) \
  VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r8,Chi_12,UChi_12) \
  /* VPREFETCHG(2,%r8)*/ \
  /* VPREFETCHG(3,%r8)*/ \
  /*42 insns*/ );

#define Z6 Chi_00
#define MULT_ADDSUB_2SPIN_NEW(ptr,pf) \
  LOAD64(%r8,ptr) \
  __asm__ ( \
  VSHUFMEM(0,%r8,Z0) \
  VRDUP(Chi_00,T1) VIDUP(Chi_00,Chi_00) \
  VRDUP(Chi_10,T2) VIDUP(Chi_10,Chi_10) \
  VMUL(Z0,Chi_00,Z1) VMUL(Z0,Chi_10,Z2) \
  VSHUFMEM(3,%r8,Z0) \
  VMUL(Z0,Chi_00,Z3) VMUL(Z0,Chi_10,Z4) \
  VSHUFMEM(6,%r8,Z0) \
  VMUL(Z0,Chi_00,Z5) VMUL(Z0,Chi_10,Z6) \
  VMULMEM(0,%r8,T1,UChi_00) VMULMEM(0,%r8,T2,UChi_10) \
  VMULMEM(3,%r8,T1,UChi_01) VMULMEM(3,%r8,T2,UChi_11) \
  VMULMEM(6,%r8,T1,UChi_02) VMULMEM(6,%r8,T2,UChi_12) \
  /*11 cycles*/ \
  VSHUFMEM(1,%r8,Z0) \
  VRDUP(Chi_01,T1) VIDUP(Chi_01,Chi_01) \
  VRDUP(Chi_11,T2) VIDUP(Chi_11,Chi_11) \
  VMADD(Z0,Chi_01,Z1) VMADD(Z0,Chi_11,Z2) \
  VSHUFMEM(4,%r8,Z0) \
  VMADD(Z0,Chi_01,Z3) VMADD(Z0,Chi_11,Z4) \
  VSHUFMEM(7,%r8,Z0) \
  VMADD(Z0,Chi_01,Z5) VMADD(Z0,Chi_11,Z6) \
  VMADDMEM(1,%r8,T1,UChi_00) VMADDMEM(1,%r8,T2,UChi_10) \
  VMADDMEM(4,%r8,T1,UChi_01) VMADDMEM(4,%r8,T2,UChi_11) \
  VMADDMEM(7,%r8,T1,UChi_02) VMADDMEM(7,%r8,T2,UChi_12) \
  /*22 cycles*/ \
  VSHUFMEM(2,%r8,Z0) \
  VRDUP(Chi_02,T1) VIDUP(Chi_02,Chi_02) \
  VRDUP(Chi_12,T2) VIDUP(Chi_12,Chi_12) \
  VMADD(Z0,Chi_02,Z1) VMADD(Z0,Chi_12,Z2) \
  VSHUFMEM(5,%r8,Z0) \
  VMADD(Z0,Chi_02,Z3) VMADD(Z0,Chi_12,Z4) \
  VSHUFMEM(8,%r8,Z0) \
  VMADD(Z0,Chi_02,Z5) VMADD(Z0,Chi_12,Z6) \
  /*33 cycles*/ \
  VMADDSUBMEM(2,%r8,T1,Z1) VMADDSUBMEM(2,%r8,T2,Z2) \
  VMADDSUBMEM(5,%r8,T1,Z3) VMADDSUBMEM(5,%r8,T2,Z4) \
  VMADDSUBMEM(8,%r8,T1,Z5) VMADDSUBMEM(8,%r8,T2,Z6) \
  /*stall*/ \
  /*stall*/ \
  /*stall*/ \
  VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
  VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
  VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) )

#endif
@@ -103,9 +103,11 @@ void LebesgueOrder::IterateI(int ND,
     } else {
       for(int d=0;d<ND;d++){
         x[d]=xi[d]+xo[d];
+        // std::cout << x[d]<<" ";
       }
+      // std::cout << "\n";
       IndexInteger index;
-      grid->IndexFromCoor(x,index,grid->_rdimensions);
+      Lexicographic::IndexFromCoor(x,index,grid->_rdimensions);
       _LebesgueReorder.push_back(index);
     }
   }
@@ -188,6 +190,7 @@ void LebesgueOrder::ZGraph(void)
   }
   assert( _LebesgueReorder.size() == vol );

+  /*
   std::vector<int> coor(4);
   for(IndexInteger asite=0;asite<vol;asite++){
     grid->oCoorFromOindex (coor,_LebesgueReorder[asite]);
@@ -198,5 +201,6 @@ void LebesgueOrder::ZGraph(void)
            << coor[3]<<"]"
            <<std::endl;
   }
+  */
 }
 }
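The hunks above swap the grid member IndexFromCoor for the static Lexicographic helper. For orientation, a minimal sketch of what a lexicographic index/coordinate conversion of this kind does; this is illustrative only, not Grid's actual Lexicographic class.

#include <vector>

struct LexicographicSketch {
  // index = x[0] + dims[0]*(x[1] + dims[1]*(x[2] + ...)); x[0] runs fastest
  static void IndexFromCoor(const std::vector<int> &coor, int &index,
                            const std::vector<int> &dims) {
    index = 0;
    for (int d = (int)dims.size()-1; d >= 0; d--) index = index*dims[d] + coor[d];
  }
  static void CoorFromIndex(std::vector<int> &coor, int index,
                            const std::vector<int> &dims) {
    coor.resize(dims.size());
    for (int d = 0; d < (int)dims.size(); d++) { coor[d] = index % dims[d]; index /= dims[d]; }
  }
};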
@@ -44,8 +44,8 @@ template<class vsimd,class scalar>
 inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const vsimd >::type * y,
                     std::vector<scalar *> &extracted,int offset){
   // FIXME: bounce off memory is painful
+  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr;

   scalar*buf = (scalar *)y;
@@ -59,8 +59,10 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 template<class vsimd,class scalar>
 inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type * y,
                   std::vector<scalar *> &extracted,int offset){

+  static const int Nsimd=sizeof(vsimd)/sizeof(scalar);
+
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr; // can have sparse occupation of simd vector if simd_layout does not fill it
   // replicate n-fold. Use to allow Integer masks to
   // predicate floating point of various width assignments and maintain conformable.
@@ -85,6 +87,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
   scalar *buf = (scalar *)&y;
   for(int i=0;i<Nextr;i++){
     extracted[i]=buf[i*s];
+#ifdef PARANOID
     for(int ii=1;ii<s;ii++){
       if ( buf[i*s]!=buf[i*s+ii] ){
         std::cout<<GridLogMessage << " SIMD extract failure splat = "<<s<<" ii "<<ii<<" " <<Nextr<<" "<< Nsimd<<" "<<std::endl;
@@ -96,6 +99,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
       }
       assert(buf[i*s]==buf[i*s+ii]);
     }
+#endif
   }

 };
@@ -106,7 +110,7 @@ inline void extract(typename std::enable_if<!isGridTensor<vsimd>::value, const v
 template<class vsimd,class scalar>
 inline void merge(typename std::enable_if<!isGridTensor<vsimd>::value, vsimd >::type &y,std::vector<scalar> &extracted){
   int Nextr=extracted.size();
-  int Nsimd=vsimd::Nsimd();
+  static const int Nsimd=vsimd::Nsimd();
   int s=Nsimd/Nextr;
   scalar *buf = (scalar *)&y;

@@ -125,9 +129,9 @@ template<class vobj> inline void extract(const vobj &vec,std::vector<typename vo
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
+  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
+  static const int words=sizeof(vobj)/sizeof(vector_type);
   int Nextr=extracted.size();
-  const int words=sizeof(vobj)/sizeof(vector_type);
   int s=Nsimd/Nextr;

   std::vector<scalar_type *> pointers(Nextr);
@@ -148,8 +152,8 @@ void extract(const vobj &vec,std::vector<typename vobj::scalar_object *> &extrac
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int words=sizeof(vobj)/sizeof(vector_type);
-  const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();

   int Nextr=extracted.size();
   int s = Nsimd/Nextr;
@@ -172,8 +176,8 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object> &extracted)
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
-  const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
+  static const int words=sizeof(vobj)/sizeof(vector_type);

   int Nextr = extracted.size();
   int splat=Nsimd/Nextr;
@@ -197,7 +201,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
+  const int Nsimd=sizeof(vector_type)/sizeof(scalar_type);
   const int words=sizeof(vobj)/sizeof(vector_type);

   int Nextr=extracted.size();
@@ -224,20 +228,17 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
   typedef typename vobj::scalar_type scalar_type ;
   typedef typename vobj::vector_type vector_type ;

-  const int Nsimd=vobj::vector_type::Nsimd();
-  const int words=sizeof(vobj)/sizeof(vector_type);
+  static const int Nsimd=vobj::vector_type::Nsimd();
+  static const int words=sizeof(vobj)/sizeof(vector_type);

-  scalar_type *pointer;
   scalar_type *vp = (scalar_type *)&vec;

   // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);

-  for(int i=0;i<Nsimd;i++){
-    pointer=(scalar_type *)&extracted[i][offset];
-    for(int w=0;w<words;w++){
-      vp[w*Nsimd+i] = pointer[w];
-    }
-  }
+  for(int w=0;w<words;w++){
+  for(int i=0;i<Nsimd;i++){
+    vp[w*Nsimd+i] = ((scalar_type *)&extracted[i][offset])[w];
+  }}
 }

 template<class vobj> inline
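The extract/merge changes above trade the runtime vsimd::Nsimd() call for a compile-time constant. The assumption this relies on, shown in isolation with stand-in types rather than Grid's vsimd classes:

#include <cstddef>

template <class vsimd, class scalar>
constexpr std::size_t lanes() { return sizeof(vsimd) / sizeof(scalar); }

struct FakeVec { float lane[8]; };  // stand-in for a 256-bit single-precision vector
static_assert(lanes<FakeVec,float>() == 8, "lane count == sizeof(vector)/sizeof(scalar)");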
@@ -18,7 +18,7 @@ TESTS=`ls T*.cc`
 TESTLIST=`echo ${TESTS} | sed s/.cc//g `

 echo > Make.inc
-echo bin_PROGRAMS = ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
+echo bin_PROGRAMS += ${TESTLIST} | sed s/Test_zmm//g >> Make.inc
 echo >> Make.inc

 for f in $TESTS
@@ -1,5 +1,5 @@

-bin_PROGRAMS = Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd
+bin_PROGRAMS += Test_cayley_cg Test_cayley_coarsen_support Test_cayley_even_odd Test_cayley_ldop_cr Test_cf_coarsen_support Test_cf_cr_unprec Test_cheby Test_contfrac_cg Test_contfrac_even_odd Test_contfrac_force Test_cshift Test_cshift_red_black Test_cshift_red_black_rotate Test_cshift_rotate Test_dwf_cg_prec Test_dwf_cg_schur Test_dwf_cg_unprec Test_dwf_cr_unprec Test_dwf_even_odd Test_dwf_force Test_dwf_fpgcr Test_dwf_gpforce Test_dwf_hdcr Test_dwf_lanczos Test_dwf_rb5d Test_gamma Test_GaugeAction Test_gparity Test_gpdwf_force Test_gp_rect_force Test_gpwilson_even_odd Test_hmc_EODWFRatio Test_hmc_EODWFRatio_Gparity Test_hmc_EOWilsonFermionGauge Test_hmc_EOWilsonRatio Test_hmc_GparityIwasakiGauge Test_hmc_GparityWilsonGauge Test_hmc_IwasakiGauge Test_hmc_RectGauge Test_hmc_WilsonFermionGauge Test_hmc_WilsonGauge Test_hmc_WilsonRatio Test_lie_generators Test_main Test_multishift_sqrt Test_nersc_io Test_partfrac_force Test_quenched_update Test_rect_force Test_RectPlaq Test_remez Test_rhmc_EOWilson1p1 Test_rhmc_EOWilsonRatio Test_rhmc_Wilson1p1 Test_rhmc_WilsonRatio Test_rng Test_rng_fixed Test_serialisation Test_simd Test_stencil Test_synthetic_lanczos Test_wilson_cg_prec Test_wilson_cg_schur Test_wilson_cg_unprec Test_wilson_cr_unprec Test_wilson_even_odd Test_wilson_force Test_wilson_force_phiMdagMphi Test_wilson_force_phiMphi Test_wilson_tm_even_odd


 Test_cayley_cg_SOURCES=Test_cayley_cg.cc
@@ -50,6 +50,14 @@ Test_cshift_red_black_SOURCES=Test_cshift_red_black.cc
 Test_cshift_red_black_LDADD=-lGrid


+Test_cshift_red_black_rotate_SOURCES=Test_cshift_red_black_rotate.cc
+Test_cshift_red_black_rotate_LDADD=-lGrid
+
+
+Test_cshift_rotate_SOURCES=Test_cshift_rotate.cc
+Test_cshift_rotate_LDADD=-lGrid
+
+
 Test_dwf_cg_prec_SOURCES=Test_dwf_cg_prec.cc
 Test_dwf_cg_prec_LDADD=-lGrid

@@ -90,6 +98,10 @@ Test_dwf_lanczos_SOURCES=Test_dwf_lanczos.cc
 Test_dwf_lanczos_LDADD=-lGrid


+Test_dwf_rb5d_SOURCES=Test_dwf_rb5d.cc
+Test_dwf_rb5d_LDADD=-lGrid
+
+
 Test_gamma_SOURCES=Test_gamma.cc
 Test_gamma_LDADD=-lGrid

@@ -8,8 +8,20 @@ endif
 AM_CXXFLAGS = -I$(top_srcdir)/lib
 AM_LDFLAGS = -L$(top_builddir)/lib

+if USE_LAPACK
+AM_CXXFLAGS += -DUSE_LAPACK
+if USE_LAPACK_LIB
+#if test "X${ac_LAPACK}X" != XyesX
+AM_CXXFLAGS += -I$(ac_LAPACK)/include
+AM_LDFLAGS += -L$(ac_LAPACK)/lib
+#fi
+endif
+endif
+
 if BUILD_ZMM
 bin_PROGRAMS=Test_zmm
+else
+bin_PROGRAMS=
 endif

 include Make.inc
@@ -96,13 +96,13 @@ int main (int argc, char ** argv)
   std::vector<int> peer(4);
   Complex tmp =cm;
   Integer index=real(tmp);
-  Fine.CoorFromIndex(peer,index,latt_size);
+  Lexicographic::CoorFromIndex(peer,index,latt_size);

   if (nrm > 0){
     std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
     std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     index=real(scm);
-    Fine.CoorFromIndex(peer,index,latt_size);
+    Lexicographic::CoorFromIndex(peer,index,latt_size);
     std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
   }
   }}}}
@@ -132,7 +132,7 @@ int main (int argc, char ** argv)
   std::vector<int> peer(4);
   Complex ctmp = cm;
   Integer index=real(ctmp);
-  Fine.CoorFromIndex(peer,index,latt_size);
+  Lexicographic::CoorFromIndex(peer,index,latt_size);

   if (nrm > 0){
     std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
@@ -140,7 +140,7 @@ int main (int argc, char ** argv)
         << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
     std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     index=real(scm);
-    Fine.CoorFromIndex(peer,index,latt_size);
+    Lexicographic::CoorFromIndex(peer,index,latt_size);
     std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     exit(-1);
   }
@@ -180,7 +180,7 @@ int main (int argc, char ** argv)
   std::vector<int> peer(4);
   Complex ctmp=cmeo;
   Integer index=real(ctmp);
-  Fine.CoorFromIndex(peer,index,latt_size);
+  Lexicographic::CoorFromIndex(peer,index,latt_size);

   double nrm = abs(cmeo()()()-scm);
   if (nrm != 0) {
@@ -189,7 +189,7 @@ int main (int argc, char ** argv)
         << cmeo()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
     std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     index=real(scm);
-    Fine.CoorFromIndex(peer,index,latt_size);
+    Lexicographic::CoorFromIndex(peer,index,latt_size);
     std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     exx=1;

@@ -205,7 +205,7 @@ int main (int argc, char ** argv)
         << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
     std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     index=real(scm);
-    Fine.CoorFromIndex(peer,index,latt_size);
+    Lexicographic::CoorFromIndex(peer,index,latt_size);
     std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
     exx=1;
   } else if (1) {
tests/Test_cshift_red_black_rotate.cc (new file, 223 lines)
@@ -0,0 +1,223 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_cshift_red_black.cc

    Copyright (C) 2015

    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /* END LEGAL */
#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> latt_size = GridDefaultLatt();
  int Nd = latt_size.size();
  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
  std::vector<int> mpi_layout = GridDefaultMpi();

  std::vector<int> mask(Nd,1);
  mask[0]=0;

  GridCartesian Fine (latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian RBFine(latt_size,simd_layout,mpi_layout,mask,1);

  GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice();

  LatticeComplex U(&Fine);
  LatticeComplex ShiftU(&Fine);
  LatticeComplex rbShiftU(&Fine);
  LatticeComplex Ue(&RBFine);
  LatticeComplex Uo(&RBFine);
  LatticeComplex ShiftUe(&RBFine);
  LatticeComplex ShiftUo(&RBFine);
  LatticeComplex lex(&Fine);
  lex=zero;
  Integer stride =1;
  {
    double nrm;
    LatticeComplex coor(&Fine);

    for(int d=0;d<Nd;d++){
      // Integer i=10000;
      Integer i=0;
      LatticeCoordinate(coor,d);
      lex = lex + coor*stride+i;
      stride=stride*latt_size[d];
    }
    U=lex;
  }

  pickCheckerboard(Even,Ue,U);
  pickCheckerboard(Odd,Uo,U);

  // std::cout<<GridLogMessage << U<<std::endl;
  std::cout<<GridLogMessage << "Ue " <<norm2(Ue)<<std::endl;
  std::cout<<GridLogMessage << "Uo " <<norm2(Uo)<<std::endl;

  TComplex cm;
  TComplex cmeo;
  for(int dir=0;dir<Nd;dir++){
    // if ( dir!=1 ) continue;
    for(int shift=0;shift<latt_size[dir];shift++){

      std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;

      std::cout<<GridLogMessage<<"Even grid"<<std::endl;
      ShiftUe = Cshift(Ue,dir,shift);    // Shift everything cb by cb
      std::cout<<GridLogMessage << "\tShiftUe " <<norm2(ShiftUe)<<std::endl;

      std::cout<<GridLogMessage<<"Odd grid"<<std::endl;
      ShiftUo = Cshift(Uo,dir,shift);
      std::cout<<GridLogMessage << "\tShiftUo " <<norm2(ShiftUo)<<std::endl;

      std::cout<<GridLogMessage<<"Recombined Even/Odd grids"<<std::endl;
      setCheckerboard(rbShiftU,ShiftUe);
      setCheckerboard(rbShiftU,ShiftUo);
      std::cout<<GridLogMessage << "\trbShiftU " <<norm2(rbShiftU)<<std::endl;

      std::cout<<GridLogMessage<<"Full grid shift"<<std::endl;
      ShiftU = Cshift(U,dir,shift);    // Shift everything
      std::cout<<GridLogMessage << "\tShiftU " <<norm2(rbShiftU)<<std::endl;

      std::vector<int> coor(4);

      std::cout<<GridLogMessage << "Checking the non-checkerboard shift"<<std::endl;
      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){

        peekSite(cm,ShiftU,coor);

        /////////  double nrm=norm2(U);

        std::vector<int> scoor(coor);
        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];

        Integer slex = scoor[0]
          + latt_size[0]*scoor[1]
          + latt_size[0]*latt_size[1]*scoor[2]
          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

        Complex scm(slex);

        double nrm = abs(scm-cm()()());
        std::vector<int> peer(4);
        Complex ctmp = cm;
        Integer index=real(ctmp);
        Lexicographic::CoorFromIndex(peer,index,latt_size);

        if (nrm > 0){
          std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          exit(-1);
        }
      }}}}

      int exx=0;
      std::cout<<GridLogMessage << "Checking the checkerboard shift"<<std::endl;
      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){

        peekSite(cm,rbShiftU,coor);

        Integer checkerboard = RBFine.CheckerBoard(coor);

        // std::cout << " coor "<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] \n ";
        // std::cout << "shift "<< shift <<" dir "<<dir<< " checker board "<< checkerboard << " ";
        // std::cout << "Uo " << ShiftUo.checkerboard << " Ue "<<ShiftUe.checkerboard<<std::endl;
        if ( checkerboard == ShiftUo.checkerboard ) {
          peekSite(cmeo,ShiftUo,coor);
        } else {
          peekSite(cmeo,ShiftUe,coor);
        }

        std::vector<int> scoor(coor);
        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];

        Integer slex = scoor[0]
          + latt_size[0]*scoor[1]
          + latt_size[0]*latt_size[1]*scoor[2]
          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

        Complex scm(slex);

        std::vector<int> peer(4);
        Complex ctmp=cmeo;
        Integer index=real(ctmp);
        Lexicographic::CoorFromIndex(peer,index,latt_size);

        double nrm = abs(cmeo()()()-scm);
        if (nrm != 0) {
          std::cout<<"EOFAIL shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cmeo()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          exx=1;
        }

        ctmp=cm;
        index=real(ctmp);
        nrm = abs(scm-cm()()());

        if (nrm > 0){
          std::cout<<"FAIL shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cout<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cout<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          exx=1;
        } else if (1) {
          std::cout<<GridLogMessage<<"PASS shift "<< shift<<" in dir "<< dir
                   <<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "
                   << cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
        }
      }}}}
      if (exx) exit(-1);

    }
  }

  Grid_finalize();
}
tests/Test_cshift_rotate.cc (new file, 125 lines)
@@ -0,0 +1,125 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./tests/Test_cshift.cc

    Copyright (C) 2015

    Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /* END LEGAL */
#include <Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::vector<int> latt_size = GridDefaultLatt();
  std::vector<int> simd_layout( { vComplex::Nsimd(),1,1,1});
  std::vector<int> mpi_layout = GridDefaultMpi();

  GridCartesian Fine(latt_size,simd_layout,mpi_layout);

  GridParallelRNG FineRNG(&Fine); FineRNG.SeedRandomDevice();

  LatticeComplex U(&Fine);
  LatticeComplex ShiftU(&Fine);

  LatticeComplex lex(&Fine);
  lex=zero;
  Integer stride =1;
  {
    double nrm;
    LatticeComplex coor(&Fine);

    for(int d=0;d<4;d++){
      LatticeCoordinate(coor,d);
      lex = lex + coor*stride;
      stride=stride*latt_size[d];
    }
    U=lex;
  }

  TComplex cm;

  for(int dir=0;dir<4;dir++){
    for(int shift=0;shift<latt_size[dir];shift++){
      if ( Fine.IsBoss() )
        std::cout<<GridLogMessage<<"Shifting by "<<shift<<" in direction"<<dir<<std::endl;

      ShiftU = Cshift(U,dir,shift);    // Shift everything

      /*
      std::cout << "U[0]" << U[0]<<std::endl;
      std::cout << "U[1]" << U[1]<<std::endl;
      std::cout << "ShiftU[0]" << ShiftU[0]<<std::endl;
      std::cout << "ShiftU[1]" << ShiftU[1]<<std::endl;
      */
      std::vector<int> coor(4);

      for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
      for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
      for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
      for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){

        peekSite(cm,ShiftU,coor);

        double nrm=norm2(U);

        std::vector<int> scoor(coor);
        scoor[dir] = (scoor[dir]+shift)%latt_size[dir];

        Integer slex = scoor[0]
          + latt_size[0]*scoor[1]
          + latt_size[0]*latt_size[1]*scoor[2]
          + latt_size[0]*latt_size[1]*latt_size[2]*scoor[3];

        Complex scm(slex);

        nrm = abs(scm-cm()()());
        std::vector<int> peer(4);
        Complex tmp =cm;
        Integer index=real(tmp);
        Lexicographic::CoorFromIndex(peer,index,latt_size);

        if (nrm > 0){
          std::cerr<<"FAIL shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
          index=real(scm);
          Lexicographic::CoorFromIndex(peer,index,latt_size);
          std::cerr<<"Expect "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
        }
        /*
        else {
          std::cerr<<"PASS shift "<< shift<<" in dir "<< dir<<" ["<<coor[0]<<","<<coor[1]<<","<<coor[2]<<","<<coor[3]<<"] = "<< cm()()()<<" expect "<<scm<<" "<<nrm<<std::endl;
          std::cerr<<"Got "<<index<<" " << peer[0]<<","<<peer[1]<<","<<peer[2]<<","<<peer[3]<<std::endl;
        }
        */
      }}}}
    }
  }

  Grid_finalize();
}
@ -42,6 +42,8 @@ public:
|
|||||||
int, domaindecompose,
|
int, domaindecompose,
|
||||||
int, domainsize,
|
int, domainsize,
|
||||||
int, order,
|
int, order,
|
||||||
|
int, Ls,
|
||||||
|
double, mq,
|
||||||
double, lo,
|
double, lo,
|
||||||
double, hi,
|
double, hi,
|
||||||
int, steps);
|
int, steps);
|
||||||
@ -263,11 +265,6 @@ public:
|
|||||||
resid = norm2(r) /norm2(src);
|
resid = norm2(r) /norm2(src);
|
||||||
std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
|
std::cout << "SAP "<<i<<" resid "<<resid<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
// Npoly*outer*2 1/2 vol matmuls.
|
|
||||||
// 71 iters => 20*71 = 1400 matmuls.
|
|
||||||
// 2*71 = 140 comms.
|
|
||||||
|
|
||||||
// Even domain solve
|
// Even domain solve
|
||||||
r= where(subset==(Integer)0,r,zz);
|
r= where(subset==(Integer)0,r,zz);
|
||||||
_SmootherOperator.AdjOp(r,vec1);
|
_SmootherOperator.AdjOp(r,vec1);
|
||||||
@ -332,7 +329,7 @@ public:
|
|||||||
CoarseVector Ctmp(_CoarseOperator.Grid());
|
CoarseVector Ctmp(_CoarseOperator.Grid());
|
||||||
CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
|
CoarseVector Csol(_CoarseOperator.Grid()); Csol=zero;
|
||||||
|
|
||||||
ConjugateGradient<CoarseVector> CG(1.0e-3,100000);
|
ConjugateGradient<CoarseVector> CG(3.0e-3,100000);
|
||||||
// ConjugateGradient<FineField> fCG(3.0e-2,1000);
|
// ConjugateGradient<FineField> fCG(3.0e-2,1000);
|
||||||
|
|
||||||
HermitianLinearOperator<CoarseOperator,CoarseVector> HermOp(_CoarseOperator);
|
HermitianLinearOperator<CoarseOperator,CoarseVector> HermOp(_CoarseOperator);
|
||||||
@@ -345,14 +342,14 @@ public:

     // Chebyshev<FineField> Cheby (0.5,70.0,30,InverseApproximation);
     // Chebyshev<FineField> ChebyAccu(0.5,70.0,30,InverseApproximation);
-    Chebyshev<FineField> Cheby (2.0,70.0,15,InverseApproximation);
-    Chebyshev<FineField> ChebyAccu(2.0,70.0,15,InverseApproximation);
+    Chebyshev<FineField> Cheby (params.lo,params.hi,params.order,InverseApproximation);
+    Chebyshev<FineField> ChebyAccu(params.lo,params.hi,params.order,InverseApproximation);
     // Cheby.JacksonSmooth();
     // ChebyAccu.JacksonSmooth();

-    _Aggregates.ProjectToSubspace (Csrc,in);
-    _Aggregates.PromoteFromSubspace(Csrc,out);
-    std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;
+    // _Aggregates.ProjectToSubspace (Csrc,in);
+    // _Aggregates.PromoteFromSubspace(Csrc,out);
+    // std::cout<<GridLogMessage<<"Completeness: "<<std::sqrt(norm2(out)/norm2(in))<<std::endl;

     // ofstream fout("smoother");
     // Cheby.csv(fout);
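Note: the smoother's Chebyshev window and order now come from the parameter file instead of the hard-coded (2.0, 70.0, 15). As background, here is a standalone sketch of what such an approximation does: build Chebyshev coefficients for f(x)=1/x on [lo,hi] and evaluate them with the Clenshaw recurrence. This is plain C++, not Grid's Chebyshev class; the previously hard-coded values are used as the example window.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Chebyshev coefficients of f on [lo,hi], 'order' terms.
    std::vector<double> cheby_coeffs(double lo, double hi, int order, double (*f)(double)) {
      const double PI = std::acos(-1.0);
      std::vector<double> c(order);
      for (int k = 0; k < order; k++) {
        double s = 0.0;
        for (int j = 0; j < order; j++) {
          double y = std::cos(PI * (j + 0.5) / order);       // Chebyshev node in [-1,1]
          double x = 0.5 * (hi - lo) * y + 0.5 * (hi + lo);  // mapped to [lo,hi]
          s += f(x) * std::cos(PI * k * (j + 0.5) / order);
        }
        c[k] = 2.0 * s / order;
      }
      return c;
    }

    // Clenshaw evaluation of the truncated series at x in [lo,hi].
    double cheby_eval(const std::vector<double>& c, double lo, double hi, double x) {
      double y = (2.0 * x - lo - hi) / (hi - lo);
      double d = 0.0, dd = 0.0;
      for (int k = (int)c.size() - 1; k >= 1; k--) {
        double sv = d;
        d = 2.0 * y * d - dd + c[k];
        dd = sv;
      }
      return y * d - dd + 0.5 * c[0];
    }

    double inv(double x) { return 1.0 / x; }

    int main() {
      double lo = 2.0, hi = 70.0;   // the window previously hard-coded in the smoother
      int order = 15;
      std::vector<double> c = cheby_coeffs(lo, hi, order, inv);
      for (double x : {2.5, 10.0, 50.0})
        std::printf("x=%5.1f  1/x=%.6f  cheby=%.6f\n", x, 1.0 / x, cheby_eval(c, lo, hi, x));
      return 0;
    }

The order sets the polynomial degree, and hence roughly the number of fine-operator applications per smoother call, while [lo,hi] should bracket the part of the spectrum the smoother is meant to capture; that is the trade-off the new parameters expose to the input file.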
@@ -479,7 +476,7 @@ int main (int argc, char ** argv)
   read(RD,"params",params);
   std::cout<<"Params: Order "<<params.order<<"["<<params.lo<<","<<params.hi<<"]"<< " steps "<<params.steps<<std::endl;

-  const int Ls=8;
+  const int Ls=params.Ls;

   GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -490,10 +487,12 @@ int main (int argc, char ** argv)
   ///////////////////////////////////////////////////
   // Construct a coarsened grid; utility for this?
   ///////////////////////////////////////////////////
-  const int block=2;
+  std::vector<int> block ({2,2,2,2});
+  const int nbasis= 32;

   std::vector<int> clatt = GridDefaultLatt();
   for(int d=0;d<clatt.size();d++){
-    clatt[d] = clatt[d]/block;
+    clatt[d] = clatt[d]/block[d];
   }
   GridCartesian *Coarse4d = SpaceTimeGrid::makeFourDimGrid(clatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());;
   GridCartesian *Coarse5d = SpaceTimeGrid::makeFiveDimGrid(1,Coarse4d);
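Note: the blocking factor becomes a per-dimension vector, so anisotropic coarsenings (for example a different factor in the time direction) are now expressible. A small standalone sketch of the coarse-lattice computation, with the divisibility requirement made explicit; plain C++, and the fine-lattice extents are example values only.

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> latt  = {16, 16, 16, 32};  // fine lattice (example values)
      std::vector<int> block = {2, 2, 2, 2};      // per-dimension blocking, as in the diff

      std::vector<int> clatt(latt);
      for (size_t d = 0; d < clatt.size(); d++) {
        assert(latt[d] % block[d] == 0);          // each dimension must block evenly
        clatt[d] = latt[d] / block[d];
      }
      // Each coarse site aggregates block[0]*block[1]*block[2]*block[3] fine sites.
      for (size_t d = 0; d < clatt.size(); d++)
        std::printf("dim %zu: fine %d -> coarse %d\n", d, latt[d], clatt[d]);
      return 0;
    }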
@@ -539,7 +538,7 @@ int main (int argc, char ** argv)
   // SU3::HotConfiguration(RNG4,Umu);
   // Umu=zero;

-  RealD mass=0.01;
+  RealD mass=params.mq;
   RealD M5=1.8;

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
@@ -548,9 +547,6 @@ int main (int argc, char ** argv)
   DomainWallFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
   DomainWallFermionR DdwfDD(UmuDD,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

-  const int nbasis = 32;
-  // const int nbasis = 4;
-
   typedef Aggregation<vSpinColourVector,vTComplex,nbasis> Subspace;
   typedef CoarsenedMatrix<vSpinColourVector,vTComplex,nbasis> CoarseOperator;
   typedef CoarseOperator::CoarseVector CoarseVector;
@@ -564,7 +560,8 @@ int main (int argc, char ** argv)
   assert ( (nbasis & 0x1)==0);
   int nb=nbasis/2;
   std::cout<<GridLogMessage << " nbasis/2 = "<<nb<<std::endl;
-  Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
+  // Aggregates.CreateSubspace(RNG5,HermDefOp,nb);
+  Aggregates.CreateSubspaceLanczos(RNG5,HermDefOp,nb);
   for(int n=0;n<nb;n++){
     G5R5(Aggregates.subspace[n+nb],Aggregates.subspace[n]);
     std::cout<<GridLogMessage<<n<<" subspace "<<norm2(Aggregates.subspace[n+nb])<<" "<<norm2(Aggregates.subspace[n]) <<std::endl;
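(Context for the loop above: the first nb vectors now come from a Lanczos-based subspace generator, and the loop fills the second half of the basis with G5R5 applied to each of them. Writing \Gamma for G5R5, which squares to the identity, the identity behind this doubling is

    P_{\pm} = \tfrac{1}{2}\,(1 \pm \Gamma), \qquad \operatorname{span}\{v,\ \Gamma v\} = \operatorname{span}\{P_{+}v,\ P_{-}v\},

so keeping v and \Gamma v for each of the nb vectors spans both chiral projections of every vector, which is why nbasis is asserted to be even.)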
@@ -600,7 +597,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   MdagMLinearOperator<CoarseOperator,CoarseVector> PosdefLdop(LDOp);
   ConjugateGradient<CoarseVector> CG(1.0e-6,100000);
-  CG(PosdefLdop,c_src,c_res);
+  // CG(PosdefLdop,c_src,c_res);

   // std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   // std::cout<<GridLogMessage << "Solving indef-MCR on coarse space "<< std::endl;
@@ -625,17 +622,17 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Testing smoother efficacy"<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  Precon.SmootherTest(src);
+  // Precon.SmootherTest(src);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Testing DD smoother efficacy"<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PreconDD.SmootherTest(src);
+  // PreconDD.SmootherTest(src);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Testing SAP smoother efficacy"<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PreconDD.SAP(src,result);
+  // PreconDD.SAP(src,result);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Unprec CG "<< std::endl;
@@ -663,18 +660,18 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Building a two level DDPGCR "<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
-  result=zero;
-  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
-  PGCRDD(HermIndefOp,src,result);
+  // PrecGeneralisedConjugateResidual<LatticeFermion> PGCRDD(1.0e-8,100000,PreconDD,8,128);
+  // result=zero;
+  // std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  // PGCRDD(HermIndefOp,src,result);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Building a two level PGCR "<< std::endl;
   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
-  // PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,128);
-  // std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
-  // result=zero;
-  // PGCR(HermIndefOp,src,result);
+  PrecGeneralisedConjugateResidual<LatticeFermion> PGCR(1.0e-8,100000,Precon,8,8);
+  std::cout<<GridLogMessage<<"checking norm src "<<norm2(src)<<std::endl;
+  result=zero;
+  PGCR(HermIndefOp,src,result);

   std::cout<<GridLogMessage << "**************************************************"<< std::endl;
   std::cout<<GridLogMessage << "Red Black Prec CG "<< std::endl;
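Note: with this commit the two-level solve runs through the multigrid-preconditioned PGCR (the domain-decomposed PGCRDD variant is commented out), and the final constructor argument drops from 128 to 8; the meaning of that argument is not visible in this diff. As background, here is a minimal dense-matrix sketch of preconditioned GCR — not Grid's PrecGeneralisedConjugateResidual, and with a plain Jacobi preconditioner standing in for the two-level preconditioner.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    using Vec = std::vector<double>;
    using Mat = std::vector<Vec>;

    Vec matvec(const Mat& A, const Vec& x) {
      Vec y(x.size(), 0.0);
      for (size_t i = 0; i < A.size(); i++)
        for (size_t j = 0; j < x.size(); j++) y[i] += A[i][j] * x[j];
      return y;
    }
    double dot(const Vec& a, const Vec& b) {
      double s = 0.0;
      for (size_t i = 0; i < a.size(); i++) s += a[i] * b[i];
      return s;
    }
    // Jacobi "preconditioner": z = diag(A)^{-1} r
    Vec precondition(const Mat& A, const Vec& r) {
      Vec z(r.size());
      for (size_t i = 0; i < r.size(); i++) z[i] = r[i] / A[i][i];
      return z;
    }

    // Right-preconditioned GCR with full stored history; x starts at zero.
    void gcr(const Mat& A, const Vec& b, Vec& x, double tol, int maxit) {
      Vec r = b;
      std::vector<Vec> P, AP;                      // stored search directions and their images
      for (int it = 0; it < maxit; it++) {
        double rr = std::sqrt(dot(r, r) / dot(b, b));
        std::printf("GCR iter %d  relative residual %g\n", it, rr);
        if (rr < tol) return;
        Vec p  = precondition(A, r);
        Vec Ap = matvec(A, p);
        for (size_t k = 0; k < P.size(); k++) {    // A-orthogonalise against the history
          double beta = dot(Ap, AP[k]) / dot(AP[k], AP[k]);
          for (size_t i = 0; i < p.size(); i++) { p[i] -= beta * P[k][i]; Ap[i] -= beta * AP[k][i]; }
        }
        double alpha = dot(r, Ap) / dot(Ap, Ap);
        for (size_t i = 0; i < x.size(); i++) { x[i] += alpha * p[i]; r[i] -= alpha * Ap[i]; }
        P.push_back(p); AP.push_back(Ap);
      }
    }

    int main() {
      Mat A = {{4, 1, 0}, {1, 3, 1}, {0, 1, 5}};   // small test system
      Vec b = {1, 2, 3}, x(3, 0.0);
      gcr(A, b, x, 1.0e-10, 50);
      std::printf("solution: %g %g %g\n", x[0], x[1], x[2]);
      return 0;
    }

Each iteration orthogonalises the new preconditioned direction against the stored history; bounding or restarting that history, which is what the smaller final argument appears to do, caps the memory and orthogonalisation cost per iteration.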
Some files were not shown because too many files have changed in this diff.