mirror of
https://github.com/paboyle/Grid.git
synced 2025-07-05 16:07:05 +01:00
Compare commits
280 Commits
feature/hw
...
feature/be
Author | SHA1 | Date | |
---|---|---|---|
d4290a7434 | |||
92def28bd3 | |||
ca10bfa1c7 | |||
0e27e3847d | |||
8cfc7342cd | |||
15ae317858 | |||
834f536b5f | |||
c332d9f08b | |||
cf2923d5dd | |||
0e4413ddde | |||
009ccd581e | |||
8cd4263974 | |||
d45c868656 | |||
955a8113de | |||
dbe210dd53 | |||
86e11743ca | |||
980e721f6e | |||
e2a0142d87 | |||
895244ecc3 | |||
addeb621a7 | |||
a7fb25adf6 | |||
e947992957 | |||
bb89a82a07 | |||
8bdadbadac | |||
15c50a7442 | |||
49b0af2c95 | |||
9c2b37218a | |||
3c67d626ba | |||
51f506553c | |||
226be84937 | |||
001814b442 | |||
db3ac67506 | |||
da91a884ef | |||
a71e6755e3 | |||
cd5891eecd | |||
5bb7336f27 | |||
ce1fc1f48a | |||
82402c6a7c | |||
d9c4afe5b7 | |||
f786ff8d69 | |||
a651caed5f | |||
0e21adb3f6 | |||
58bf9b9e6d | |||
2146eebb65 | |||
6a429ee6d3 | |||
4d1ea15c79 | |||
a76cb005e0 | |||
a9604367c1 | |||
d7065023cc | |||
89d299ceec | |||
e34eda66df | |||
b24181aa4f | |||
aa173e2998 | |||
7a19432e0b | |||
9b15704290 | |||
017f955b2d | |||
f252d69eef | |||
3b06e4655e | |||
d4b4de8f42 | |||
c90beee774 | |||
1eea9d73b9 | |||
679d1d22f7 | |||
b2b5e0b98c | |||
03e54722c1 | |||
442336bd96 | |||
9c9566b9c9 | |||
1059a81a3c | |||
2e61556389 | |||
f9b1f240f6 | |||
69f41469dd | |||
d620b303ff | |||
157fd1428d | |||
c791cb2214 | |||
d5ab571a89 | |||
0ed800f6e4 | |||
0a32183825 | |||
2cacfbde2a | |||
c073e62e0b | |||
e3d019bc2f | |||
7ae030f585 | |||
86b58d5aff | |||
26e8b9f4a5 | |||
35114c9e62 | |||
dfd28a85c9 | |||
a503332924 | |||
1ac13ec3a7 | |||
55de69a569 | |||
eda9ab487b | |||
cd99edcc5f | |||
4705aa541d | |||
3215d88a91 | |||
9b9a53f870 | |||
019ffe17d4 | |||
bc496dd844 | |||
a673b6a54d | |||
1bf2e4d187 | |||
96dd7a8fbd | |||
7905afa9f5 | |||
712bb40650 | |||
81d88d9f4d | |||
77063418da | |||
2983b6fdf6 | |||
69f1f04f74 | |||
11a5fd09d6 | |||
ff1fa98808 | |||
df16202865 | |||
3ff7c2c02a | |||
fc6d07897f | |||
f9c8e5c8ef | |||
8bfa0e74f8 | |||
9b73a937e7 | |||
b0339bc5a4 | |||
3c23a947cc | |||
56111bb823 | |||
99445673f6 | |||
97a59643f7 | |||
579595f547 | |||
281ac5fc12 | |||
d8fa903b02 | |||
eaff0f3aeb | |||
e8e20c01b2 | |||
a4afc3ea2a | |||
fa12b9a329 | |||
45fc7ded3a | |||
74de2d9742 | |||
e759367d42 | |||
299d0de066 | |||
3fe75bc7cb | |||
45d49d8648 | |||
6013183361 | |||
4b882e8056 | |||
3f9ae6e7e7 | |||
909acd55cd | |||
4dd9e39e0d | |||
b4c1317ab4 | |||
f36d6f3923 | |||
7adb253e25 | |||
808f1e0e8c | |||
873519e960 | |||
9aec4a3c26 | |||
c438118fd7 | |||
70510d151b | |||
9e7bacb5a4 | |||
2ef1fa66a8 | |||
cf76741ec6 | |||
497e7c1c40 | |||
888eacd3b8 | |||
321f0f51b5 | |||
17ec9c5545 | |||
30ad9578a2 | |||
9dce101586 | |||
97e264d0ff | |||
683a5e5bf5 | |||
d4861a362c | |||
5ff3eae027 | |||
147dc15d26 | |||
c61ea72949 | |||
86e8b9fe38 | |||
612e468889 | |||
4ea8d128c2 | |||
e49b7f2f88 | |||
aace3d47b9 | |||
d5049949a4 | |||
f1c7480e3c | |||
5adae5d6ff | |||
a8412ace05 | |||
9fd1c2ad4b | |||
4cf3575353 | |||
804a810d68 | |||
8fcb392e24 | |||
dd8d70eeff | |||
aa8aba6543 | |||
13df14f96e | |||
3aab983760 | |||
9c4dcc5ea3 | |||
a1063ddbb9 | |||
18ef8056ec | |||
1c673977fa | |||
e9bc748828 | |||
f48156529b | |||
d05ce01809 | |||
cf23eff60e | |||
6e313575be | |||
b13d1f7238 | |||
b5e7945dd9 | |||
7535566f54 | |||
50b808ab33 | |||
f16c2665f5 | |||
41e28015ae | |||
3594ce877b | |||
9bae6b889a | |||
4014dfd5b9 | |||
67023c334b | |||
a3de7026c8 | |||
ee11678b1f | |||
a0ccbb3bd6 | |||
5eeabaa2bb | |||
00d0d6d008 | |||
537a9f7030 | |||
cc9c993f74 | |||
d10422ded8 | |||
f313565a3c | |||
b3881d2636 | |||
61d5860b46 | |||
52d17987dc | |||
19d8bba97d | |||
463d72d322 | |||
d060341168 | |||
c772bcd514 | |||
3362f8dfa0 | |||
bf3c9857e0 | |||
a88b3ceca5 | |||
aa135412f5 | |||
9945399e60 | |||
5eeffa49e8 | |||
3f06209720 | |||
12e239dd9f | |||
af2301afbb | |||
f98856a26f | |||
d55cc5b380 | |||
c2b688abc9 | |||
b0d61b9687 | |||
5f893bf9af | |||
0e17bd6597 | |||
22caa158cc | |||
b24a504d7c | |||
992ef6e9fc | |||
f32a320bc3 | |||
5f0fe029d2 | |||
6b1486e89b | |||
3f9c427a3a | |||
d201277652 | |||
fdda7cf9cf | |||
e22d30f715 | |||
1ba25a0d8c | |||
9ba3647bdf | |||
5ee832f738 | |||
467deee46f | |||
35a69a5133 | |||
e9c5a271a8 | |||
acac2d6938 | |||
97db2b8d20 | |||
80fd6ab407 | |||
5534921bee | |||
ace9cd64bb | |||
a3e2aeb603 | |||
049dd25785 | |||
d43d372294 | |||
b71a081cba | |||
c48909590b | |||
446ef40570 | |||
81441e98f4 | |||
ecd3f890f5 | |||
1c881ce23c | |||
dacbbdd051 | |||
2859955a03 | |||
cc220abd1d | |||
d1c0c0197e | |||
fd9424ef27 | |||
a5c35c4024 | |||
e03b64dc06 | |||
4677c40195 | |||
288c615782 | |||
48e81cf6f8 | |||
5cffa05c7e | |||
d50a2164d7 | |||
32ff766dbd | |||
01652d8cfe | |||
4d2dc7ba03 | |||
51d1beb1f3 | |||
249e2db87d | |||
cf3535d16e | |||
d61ee817f4 | |||
2a75516330 | |||
b2087f14c4 | |||
dd1ba266b2 | |||
1292d59563 | |||
9877ed9bf8 | |||
f0dc0f3621 | |||
63b0a19f37 |
61
.travis.yml
61
.travis.yml
@ -1,61 +0,0 @@
|
|||||||
language: cpp
|
|
||||||
|
|
||||||
cache:
|
|
||||||
directories:
|
|
||||||
- clang
|
|
||||||
|
|
||||||
matrix:
|
|
||||||
include:
|
|
||||||
- os: osx
|
|
||||||
osx_image: xcode8.3
|
|
||||||
compiler: clang
|
|
||||||
env: PREC=single
|
|
||||||
- os: osx
|
|
||||||
osx_image: xcode8.3
|
|
||||||
compiler: clang
|
|
||||||
env: PREC=double
|
|
||||||
|
|
||||||
before_install:
|
|
||||||
- export GRIDDIR=`pwd`
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
|
|
||||||
|
|
||||||
install:
|
|
||||||
- export CWD=`pwd`
|
|
||||||
- echo $CWD
|
|
||||||
- export CC=$CC$VERSION
|
|
||||||
- export CXX=$CXX$VERSION
|
|
||||||
- echo $PATH
|
|
||||||
- which autoconf
|
|
||||||
- autoconf --version
|
|
||||||
- which automake
|
|
||||||
- automake --version
|
|
||||||
- which $CC
|
|
||||||
- $CC --version
|
|
||||||
- which $CXX
|
|
||||||
- $CXX --version
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
|
|
||||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
|
|
||||||
|
|
||||||
script:
|
|
||||||
- ./bootstrap.sh
|
|
||||||
- mkdir build
|
|
||||||
- cd build
|
|
||||||
- mkdir lime
|
|
||||||
- cd lime
|
|
||||||
- mkdir build
|
|
||||||
- cd build
|
|
||||||
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
|
|
||||||
- tar xf lime-1.3.2.tar.gz
|
|
||||||
- cd lime-1.3.2
|
|
||||||
- ./configure --prefix=$CWD/build/lime/install
|
|
||||||
- make -j4
|
|
||||||
- make install
|
|
||||||
- cd $CWD/build
|
|
||||||
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
|
|
||||||
- make -j4
|
|
||||||
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
|
||||||
- make check
|
|
@ -37,7 +37,9 @@ directory
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
//disables and intel compiler specific warning (in json.hpp)
|
//disables and intel compiler specific warning (in json.hpp)
|
||||||
|
#ifdef __ICC
|
||||||
#pragma warning disable 488
|
#pragma warning disable 488
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __NVCC__
|
#ifdef __NVCC__
|
||||||
//disables nvcc specific warning in json.hpp
|
//disables nvcc specific warning in json.hpp
|
||||||
|
@ -28,4 +28,7 @@
|
|||||||
///////////////////
|
///////////////////
|
||||||
#include "Config.h"
|
#include "Config.h"
|
||||||
|
|
||||||
|
#ifdef TOFU
|
||||||
|
#undef GRID_COMMS_THREADS
|
||||||
|
#endif
|
||||||
#endif /* GRID_STD_H */
|
#endif /* GRID_STD_H */
|
||||||
|
@ -34,6 +34,12 @@
|
|||||||
#define __SYCL__REDEFINE__
|
#define __SYCL__REDEFINE__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* HIP save and restore compile environment*/
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
#pragma push
|
||||||
|
#pragma push_macro("__HIP_DEVICE_COMPILE__")
|
||||||
|
#endif
|
||||||
|
#define EIGEN_NO_HIP
|
||||||
|
|
||||||
#include <Grid/Eigen/Dense>
|
#include <Grid/Eigen/Dense>
|
||||||
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
||||||
@ -52,6 +58,12 @@
|
|||||||
#pragma pop
|
#pragma pop
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*HIP restore*/
|
||||||
|
#ifdef __HIP__REDEFINE__
|
||||||
|
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
|
||||||
|
#pragma pop
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined __GNUC__
|
#if defined __GNUC__
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
#endif
|
#endif
|
||||||
|
@ -21,6 +21,7 @@ if BUILD_HDF5
|
|||||||
extra_headers+=serialisation/Hdf5Type.h
|
extra_headers+=serialisation/Hdf5Type.h
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
all: version-cache Version.h
|
all: version-cache Version.h
|
||||||
|
|
||||||
version-cache:
|
version-cache:
|
||||||
@ -53,6 +54,19 @@ Version.h: version-cache
|
|||||||
include Make.inc
|
include Make.inc
|
||||||
include Eigen.inc
|
include Eigen.inc
|
||||||
|
|
||||||
|
extra_sources+=$(WILS_FERMION_FILES)
|
||||||
|
extra_sources+=$(STAG_FERMION_FILES)
|
||||||
|
if BUILD_ZMOBIUS
|
||||||
|
extra_sources+=$(ZWILS_FERMION_FILES)
|
||||||
|
endif
|
||||||
|
if BUILD_GPARITY
|
||||||
|
extra_sources+=$(GP_FERMION_FILES)
|
||||||
|
endif
|
||||||
|
if BUILD_FERMION_REPS
|
||||||
|
extra_sources+=$(ADJ_FERMION_FILES)
|
||||||
|
extra_sources+=$(TWOIND_FERMION_FILES)
|
||||||
|
endif
|
||||||
|
|
||||||
lib_LIBRARIES = libGrid.a
|
lib_LIBRARIES = libGrid.a
|
||||||
|
|
||||||
CCFILES += $(extra_sources)
|
CCFILES += $(extra_sources)
|
||||||
|
@ -31,6 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
|
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
|
||||||
#define GRID_ALGORITHM_COARSENED_MATRIX_H
|
#define GRID_ALGORITHM_COARSENED_MATRIX_H
|
||||||
|
|
||||||
|
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
@ -49,20 +50,24 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
|
|||||||
Lattice<dotp> fine_inner_msk(fine);
|
Lattice<dotp> fine_inner_msk(fine);
|
||||||
|
|
||||||
// Multiply could be fused with innerProduct
|
// Multiply could be fused with innerProduct
|
||||||
|
// Single block sum kernel could do both masks.
|
||||||
fine_inner = localInnerProduct(fineX,fineY);
|
fine_inner = localInnerProduct(fineX,fineY);
|
||||||
mult(fine_inner_msk, fine_inner,FineMask);
|
mult(fine_inner_msk, fine_inner,FineMask);
|
||||||
blockSum(CoarseInner,fine_inner_msk);
|
blockSum(CoarseInner,fine_inner_msk);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class Geometry {
|
class Geometry {
|
||||||
public:
|
public:
|
||||||
int npoint;
|
int npoint;
|
||||||
|
int base;
|
||||||
std::vector<int> directions ;
|
std::vector<int> directions ;
|
||||||
std::vector<int> displacements;
|
std::vector<int> displacements;
|
||||||
|
std::vector<int> points_dagger;
|
||||||
|
|
||||||
Geometry(int _d) {
|
Geometry(int _d) {
|
||||||
|
|
||||||
int base = (_d==5) ? 1:0;
|
base = (_d==5) ? 1:0;
|
||||||
|
|
||||||
// make coarse grid stencil for 4d , not 5d
|
// make coarse grid stencil for 4d , not 5d
|
||||||
if ( _d==5 ) _d=4;
|
if ( _d==5 ) _d=4;
|
||||||
@ -70,19 +75,50 @@ public:
|
|||||||
npoint = 2*_d+1;
|
npoint = 2*_d+1;
|
||||||
directions.resize(npoint);
|
directions.resize(npoint);
|
||||||
displacements.resize(npoint);
|
displacements.resize(npoint);
|
||||||
|
points_dagger.resize(npoint);
|
||||||
for(int d=0;d<_d;d++){
|
for(int d=0;d<_d;d++){
|
||||||
directions[d ] = d+base;
|
directions[d ] = d+base;
|
||||||
directions[d+_d] = d+base;
|
directions[d+_d] = d+base;
|
||||||
displacements[d ] = +1;
|
displacements[d ] = +1;
|
||||||
displacements[d+_d]= -1;
|
displacements[d+_d]= -1;
|
||||||
|
points_dagger[d ] = d+_d;
|
||||||
|
points_dagger[d+_d] = d;
|
||||||
}
|
}
|
||||||
directions [2*_d]=0;
|
directions [2*_d]=0;
|
||||||
displacements[2*_d]=0;
|
displacements[2*_d]=0;
|
||||||
|
points_dagger[2*_d]=2*_d;
|
||||||
|
}
|
||||||
|
|
||||||
std::cout <<GridLogMessage << "Geometry "<<std::endl;
|
int point(int dir, int disp) {
|
||||||
for(int p=0;p<npoint;p++){
|
assert(disp == -1 || disp == 0 || disp == 1);
|
||||||
std::cout <<GridLogMessage << "point " <<p<<" dir "<<directions[p]<<" delta " <<displacements[p]<<std::endl;
|
assert(base+0 <= dir && dir < base+4);
|
||||||
}
|
|
||||||
|
// directions faster index = new indexing
|
||||||
|
// 4d (base = 0):
|
||||||
|
// point 0 1 2 3 4 5 6 7 8
|
||||||
|
// dir 0 1 2 3 0 1 2 3 0
|
||||||
|
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
|
||||||
|
// 5d (base = 1):
|
||||||
|
// point 0 1 2 3 4 5 6 7 8
|
||||||
|
// dir 1 2 3 4 1 2 3 4 0
|
||||||
|
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
|
||||||
|
|
||||||
|
// displacements faster index = old indexing
|
||||||
|
// 4d (base = 0):
|
||||||
|
// point 0 1 2 3 4 5 6 7 8
|
||||||
|
// dir 0 0 1 1 2 2 3 3 0
|
||||||
|
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
|
||||||
|
// 5d (base = 1):
|
||||||
|
// point 0 1 2 3 4 5 6 7 8
|
||||||
|
// dir 1 1 2 2 3 3 4 4 0
|
||||||
|
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
|
||||||
|
|
||||||
|
if(dir == 0 and disp == 0)
|
||||||
|
return 8;
|
||||||
|
else // New indexing
|
||||||
|
return (1 - disp) / 2 * 4 + dir - base;
|
||||||
|
// else // Old indexing
|
||||||
|
// return (4 * (dir - base) + 1 - disp) / 2;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -104,8 +140,8 @@ public:
|
|||||||
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
|
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
|
||||||
CoarseGrid(_CoarseGrid),
|
CoarseGrid(_CoarseGrid),
|
||||||
FineGrid(_FineGrid),
|
FineGrid(_FineGrid),
|
||||||
checkerboard(_checkerboard),
|
subspace(nbasis,_FineGrid),
|
||||||
subspace(nbasis,_FineGrid)
|
checkerboard(_checkerboard)
|
||||||
{
|
{
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -260,7 +296,7 @@ public:
|
|||||||
// Fine Object == (per site) type of fine field
|
// Fine Object == (per site) type of fine field
|
||||||
// nbasis == number of deflation vectors
|
// nbasis == number of deflation vectors
|
||||||
template<class Fobj,class CComplex,int nbasis>
|
template<class Fobj,class CComplex,int nbasis>
|
||||||
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
|
class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
typedef iVector<CComplex,nbasis > siteVector;
|
typedef iVector<CComplex,nbasis > siteVector;
|
||||||
@ -270,35 +306,59 @@ public:
|
|||||||
typedef iMatrix<CComplex,nbasis > Cobj;
|
typedef iMatrix<CComplex,nbasis > Cobj;
|
||||||
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
|
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
|
||||||
typedef Lattice<Fobj > FineField;
|
typedef Lattice<Fobj > FineField;
|
||||||
|
typedef CoarseVector FermionField;
|
||||||
|
|
||||||
|
// enrich interface, use default implementation as in FermionOperator ///////
|
||||||
|
void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
|
||||||
|
void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
|
||||||
|
void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
|
||||||
|
void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
|
||||||
|
void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
|
||||||
|
void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
|
||||||
|
|
||||||
////////////////////
|
////////////////////
|
||||||
// Data members
|
// Data members
|
||||||
////////////////////
|
////////////////////
|
||||||
Geometry geom;
|
Geometry geom;
|
||||||
GridBase * _grid;
|
GridBase * _grid;
|
||||||
|
GridBase* _cbgrid;
|
||||||
int hermitian;
|
int hermitian;
|
||||||
|
|
||||||
CartesianStencil<siteVector,siteVector,int> Stencil;
|
CartesianStencil<siteVector,siteVector,int> Stencil;
|
||||||
|
CartesianStencil<siteVector,siteVector,int> StencilEven;
|
||||||
|
CartesianStencil<siteVector,siteVector,int> StencilOdd;
|
||||||
|
|
||||||
std::vector<CoarseMatrix> A;
|
std::vector<CoarseMatrix> A;
|
||||||
|
std::vector<CoarseMatrix> Aeven;
|
||||||
|
std::vector<CoarseMatrix> Aodd;
|
||||||
|
|
||||||
|
CoarseMatrix AselfInv;
|
||||||
|
CoarseMatrix AselfInvEven;
|
||||||
|
CoarseMatrix AselfInvOdd;
|
||||||
|
|
||||||
|
Vector<RealD> dag_factor;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
///////////////////////
|
///////////////////////
|
||||||
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
|
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
|
||||||
|
GridBase * RedBlackGrid() { return _cbgrid; };
|
||||||
|
|
||||||
|
int ConstEE() { return 0; }
|
||||||
|
|
||||||
virtual std::vector<int> Directions(void) { return geom.directions; };
|
|
||||||
virtual std::vector<int> Displacements(void){ return geom.displacements; };
|
|
||||||
void M (const CoarseVector &in, CoarseVector &out)
|
void M (const CoarseVector &in, CoarseVector &out)
|
||||||
{
|
{
|
||||||
conformable(_grid,in.Grid());
|
conformable(_grid,in.Grid());
|
||||||
conformable(in.Grid(),out.Grid());
|
conformable(in.Grid(),out.Grid());
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
SimpleCompressor<siteVector> compressor;
|
SimpleCompressor<siteVector> compressor;
|
||||||
|
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
autoView( in_v , in, AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
|
auto& geom_v = geom;
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
@ -312,9 +372,6 @@ public:
|
|||||||
|
|
||||||
int osites=Grid()->oSites();
|
int osites=Grid()->oSites();
|
||||||
|
|
||||||
autoView(st,Stencil,AcceleratorRead);
|
|
||||||
siteVector *CBp=Stencil.CommBuf();
|
|
||||||
|
|
||||||
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
||||||
int ss = sss/nbasis;
|
int ss = sss/nbasis;
|
||||||
int b = sss%nbasis;
|
int b = sss%nbasis;
|
||||||
@ -323,14 +380,14 @@ public:
|
|||||||
int ptype;
|
int ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
for(int point=0;point<geom.npoint;point++){
|
for(int point=0;point<geom_v.npoint;point++){
|
||||||
|
|
||||||
SE=st.GetEntry(ptype,point,ss);
|
SE=Stencil_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
if(SE->_is_local) {
|
if(SE->_is_local) {
|
||||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
} else {
|
} else {
|
||||||
nbr = coalescedRead(CBp[SE->_offset]);
|
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
|
||||||
}
|
}
|
||||||
acceleratorSynchronise();
|
acceleratorSynchronise();
|
||||||
|
|
||||||
@ -339,7 +396,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
};
|
};
|
||||||
@ -351,12 +408,72 @@ public:
|
|||||||
return M(in,out);
|
return M(in,out);
|
||||||
} else {
|
} else {
|
||||||
// corresponds to Galerkin coarsening
|
// corresponds to Galerkin coarsening
|
||||||
CoarseVector tmp(Grid());
|
return MdagNonHermitian(in, out);
|
||||||
G5C(tmp, in);
|
|
||||||
M(tmp, out);
|
|
||||||
G5C(out, out);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void MdagNonHermitian(const CoarseVector &in, CoarseVector &out)
|
||||||
|
{
|
||||||
|
conformable(_grid,in.Grid());
|
||||||
|
conformable(in.Grid(),out.Grid());
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
|
SimpleCompressor<siteVector> compressor;
|
||||||
|
|
||||||
|
Stencil.HaloExchange(in,compressor);
|
||||||
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
|
auto& geom_v = geom;
|
||||||
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
|
|
||||||
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||||
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
|
||||||
|
int osites=Grid()->oSites();
|
||||||
|
|
||||||
|
Vector<int> points(geom.npoint, 0);
|
||||||
|
for(int p=0; p<geom.npoint; p++)
|
||||||
|
points[p] = geom.points_dagger[p];
|
||||||
|
|
||||||
|
RealD* dag_factor_p = &dag_factor[0];
|
||||||
|
|
||||||
|
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
for(int p=0;p<geom_v.npoint;p++){
|
||||||
|
int point = points[p];
|
||||||
|
|
||||||
|
SE=Stencil_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
|
||||||
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
|
}
|
||||||
|
|
||||||
void MdirComms(const CoarseVector &in)
|
void MdirComms(const CoarseVector &in)
|
||||||
{
|
{
|
||||||
SimpleCompressor<siteVector> compressor;
|
SimpleCompressor<siteVector> compressor;
|
||||||
@ -366,6 +483,7 @@ public:
|
|||||||
{
|
{
|
||||||
conformable(_grid,in.Grid());
|
conformable(_grid,in.Grid());
|
||||||
conformable(_grid,out.Grid());
|
conformable(_grid,out.Grid());
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
@ -374,6 +492,7 @@ public:
|
|||||||
|
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
autoView( in_v , in, AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
@ -387,12 +506,12 @@ public:
|
|||||||
int ptype;
|
int ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
SE=Stencil.GetEntry(ptype,point,ss);
|
SE=Stencil_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
if(SE->_is_local) {
|
if(SE->_is_local) {
|
||||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
} else {
|
} else {
|
||||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
|
||||||
}
|
}
|
||||||
acceleratorSynchronise();
|
acceleratorSynchronise();
|
||||||
|
|
||||||
@ -416,23 +535,11 @@ public:
|
|||||||
MdirCalc(in,out[p],p);
|
MdirCalc(in,out[p],p);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp)
|
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
|
||||||
{
|
|
||||||
this->MdirComms(in);
|
this->MdirComms(in);
|
||||||
|
|
||||||
int ndim = in.Grid()->Nd();
|
MdirCalc(in,out,geom.point(dir,disp));
|
||||||
|
|
||||||
int point=-1;
|
|
||||||
for(int p=0;p<geom.npoint;p++){
|
|
||||||
if( (dir==geom.directions[p])&&(disp==geom.displacements[p])) point=p;
|
|
||||||
}
|
|
||||||
assert(point!=-1);// Must find
|
|
||||||
|
|
||||||
std::cout <<GridLogMessage << "Mdir point "<<point<<" dir "<<dir<<" disp "<<disp <<std::endl;
|
|
||||||
for(int p=0;p<geom.npoint;p++){
|
|
||||||
std::cout <<GridLogMessage << "point " <<p<<" dir "<<geom.directions[p]<<" delta " <<geom.displacements[p]<<std::endl;
|
|
||||||
}
|
|
||||||
MdirCalc(in,out,point);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void Mdiag(const CoarseVector &in, CoarseVector &out)
|
void Mdiag(const CoarseVector &in, CoarseVector &out)
|
||||||
@ -441,63 +548,286 @@ public:
|
|||||||
MdirCalc(in, out, point); // No comms
|
MdirCalc(in, out, point); // No comms
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void Mooee(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
MooeeInternal(in, out, DaggerNo, InverseNo);
|
||||||
|
}
|
||||||
|
|
||||||
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
void MooeeInv(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
MooeeInternal(in, out, DaggerNo, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
void MooeeDag(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
MooeeInternal(in, out, DaggerYes, InverseNo);
|
||||||
|
}
|
||||||
|
|
||||||
|
void MooeeInvDag(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
MooeeInternal(in, out, DaggerYes, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Meooe(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
if(in.Checkerboard() == Odd) {
|
||||||
|
DhopEO(in, out, DaggerNo);
|
||||||
|
} else {
|
||||||
|
DhopOE(in, out, DaggerNo);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void MeooeDag(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
if(in.Checkerboard() == Odd) {
|
||||||
|
DhopEO(in, out, DaggerYes);
|
||||||
|
} else {
|
||||||
|
DhopOE(in, out, DaggerYes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Dhop(const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
conformable(in.Grid(), _grid); // verifies full grid
|
||||||
|
conformable(in.Grid(), out.Grid());
|
||||||
|
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
|
DhopInternal(Stencil, A, in, out, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
|
assert(in.Checkerboard() == Even);
|
||||||
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
|
DhopInternal(StencilEven, Aodd, in, out, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
|
assert(in.Checkerboard() == Odd);
|
||||||
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
|
DhopInternal(StencilOdd, Aeven, in, out, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
|
||||||
|
|
||||||
|
CoarseMatrix *Aself = nullptr;
|
||||||
|
if(in.Grid()->_isCheckerBoarded) {
|
||||||
|
if(in.Checkerboard() == Odd) {
|
||||||
|
Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1];
|
||||||
|
DselfInternal(StencilOdd, *Aself, in, out, dag);
|
||||||
|
} else {
|
||||||
|
Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1];
|
||||||
|
DselfInternal(StencilEven, *Aself, in, out, dag);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
|
||||||
|
DselfInternal(Stencil, *Aself, in, out, dag);
|
||||||
|
}
|
||||||
|
assert(Aself != nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
|
||||||
|
const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
int point = geom.npoint-1;
|
||||||
|
autoView( out_v, out, AcceleratorWrite);
|
||||||
|
autoView( in_v, in, AcceleratorRead);
|
||||||
|
autoView( st_v, st, AcceleratorRead);
|
||||||
|
autoView( a_v, a, AcceleratorRead);
|
||||||
|
|
||||||
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
|
||||||
|
RealD* dag_factor_p = &dag_factor[0];
|
||||||
|
|
||||||
|
if(dag) {
|
||||||
|
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
SE=st_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(a_v[ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
SE=st_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + coalescedRead(a_v[ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
|
||||||
|
const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
SimpleCompressor<siteVector> compressor;
|
||||||
|
|
||||||
|
st.HaloExchange(in,compressor);
|
||||||
|
autoView( in_v, in, AcceleratorRead);
|
||||||
|
autoView( out_v, out, AcceleratorWrite);
|
||||||
|
autoView( st_v , st, AcceleratorRead);
|
||||||
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
|
// determine in what order we need the points
|
||||||
|
int npoint = geom.npoint-1;
|
||||||
|
Vector<int> points(npoint, 0);
|
||||||
|
for(int p=0; p<npoint; p++)
|
||||||
|
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
||||||
|
|
||||||
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
|
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
|
||||||
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
|
||||||
|
RealD* dag_factor_p = &dag_factor[0];
|
||||||
|
|
||||||
|
if(dag) {
|
||||||
|
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
for(int p=0;p<npoint;p++){
|
||||||
|
int point = points[p];
|
||||||
|
SE=st_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
for(int p=0;p<npoint;p++){
|
||||||
|
int point = points[p];
|
||||||
|
SE=st_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
|
}
|
||||||
|
|
||||||
|
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
||||||
_grid(&CoarseGrid),
|
_grid(&CoarseGrid),
|
||||||
|
_cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
|
||||||
geom(CoarseGrid._ndimension),
|
geom(CoarseGrid._ndimension),
|
||||||
hermitian(hermitian_),
|
hermitian(hermitian_),
|
||||||
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
||||||
A(geom.npoint,&CoarseGrid)
|
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
||||||
|
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
|
||||||
|
A(geom.npoint,&CoarseGrid),
|
||||||
|
Aeven(geom.npoint,_cbgrid),
|
||||||
|
Aodd(geom.npoint,_cbgrid),
|
||||||
|
AselfInv(&CoarseGrid),
|
||||||
|
AselfInvEven(_cbgrid),
|
||||||
|
AselfInvOdd(_cbgrid),
|
||||||
|
dag_factor(nbasis*nbasis)
|
||||||
{
|
{
|
||||||
|
fillFactor();
|
||||||
};
|
};
|
||||||
|
|
||||||
void Test(Aggregation<Fobj,CComplex,nbasis> &_Aggregates,GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop)
|
CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) :
|
||||||
|
|
||||||
|
_grid(&CoarseGrid),
|
||||||
|
_cbgrid(&CoarseRBGrid),
|
||||||
|
geom(CoarseGrid._ndimension),
|
||||||
|
hermitian(hermitian_),
|
||||||
|
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
||||||
|
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
||||||
|
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
|
||||||
|
A(geom.npoint,&CoarseGrid),
|
||||||
|
Aeven(geom.npoint,&CoarseRBGrid),
|
||||||
|
Aodd(geom.npoint,&CoarseRBGrid),
|
||||||
|
AselfInv(&CoarseGrid),
|
||||||
|
AselfInvEven(&CoarseRBGrid),
|
||||||
|
AselfInvOdd(&CoarseRBGrid),
|
||||||
|
dag_factor(nbasis*nbasis)
|
||||||
{
|
{
|
||||||
typedef Lattice<Fobj> FineField;
|
fillFactor();
|
||||||
CoarseVector Cin(_grid);
|
};
|
||||||
CoarseVector Cout(_grid);
|
|
||||||
CoarseVector CFout(_grid);
|
|
||||||
|
|
||||||
FineField Fin(FineGrid);
|
void fillFactor() {
|
||||||
FineField Fout(FineGrid);
|
Eigen::MatrixXd dag_factor_eigen = Eigen::MatrixXd::Ones(nbasis, nbasis);
|
||||||
|
if(!hermitian) {
|
||||||
|
const int nb = nbasis/2;
|
||||||
|
dag_factor_eigen.block(0,nb,nb,nb) *= -1.0;
|
||||||
|
dag_factor_eigen.block(nb,0,nb,nb) *= -1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// GPU readable prefactor
|
||||||
std::vector<int> seeds({1,2,3,4,5});
|
thread_for(i, nbasis*nbasis, {
|
||||||
GridParallelRNG RNG(_grid); RNG.SeedFixedIntegers(seeds);
|
int j = i/nbasis;
|
||||||
gaussian(RNG,Cin);
|
int k = i%nbasis;
|
||||||
|
dag_factor[i] = dag_factor_eigen(j, k);
|
||||||
_Aggregates.PromoteFromSubspace(Cin,Fin);
|
});
|
||||||
_Aggregates.ProjectToSubspace(Cin,Fin);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "************ "<<std::endl;
|
|
||||||
std::cout << GridLogMessage<< " Testing M "<<std::endl;
|
|
||||||
std::cout << GridLogMessage<< "************ "<<std::endl;
|
|
||||||
// Coarse operator
|
|
||||||
this->M(Cin,Cout);
|
|
||||||
// Fine projected operator
|
|
||||||
_Aggregates.PromoteFromSubspace(Cin,Fin);
|
|
||||||
linop.Op(Fin,Fout);
|
|
||||||
_Aggregates.ProjectToSubspace(CFout,Fout);
|
|
||||||
|
|
||||||
CFout = CFout-Cout;
|
|
||||||
RealD diff = norm2(CFout);
|
|
||||||
std::cout << GridLogMessage<< " diff "<<diff<<std::endl;
|
|
||||||
assert(diff<1.0e-5);
|
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "************ "<<std::endl;
|
|
||||||
std::cout << GridLogMessage<< " Testing Mdag "<<std::endl;
|
|
||||||
std::cout << GridLogMessage<< "************ "<<std::endl;
|
|
||||||
// Coarse operator
|
|
||||||
Mdag(Cin,Cout);
|
|
||||||
// Fine operator
|
|
||||||
linop.AdjOp(Fin,Fout);
|
|
||||||
_Aggregates.ProjectToSubspace(CFout,Fout);
|
|
||||||
|
|
||||||
CFout = CFout-Cout;
|
|
||||||
diff = norm2(CFout);
|
|
||||||
std::cout << GridLogMessage<< " diff "<<diff<<std::endl;
|
|
||||||
assert(diff<1.0e-5);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||||
@ -506,6 +836,8 @@ public:
|
|||||||
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
|
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
|
||||||
typedef typename Fobj::scalar_type scalar_type;
|
typedef typename Fobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl;
|
||||||
|
|
||||||
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
|
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
|
||||||
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
|
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
|
||||||
|
|
||||||
@ -536,22 +868,13 @@ public:
|
|||||||
|
|
||||||
CoarseScalar InnerProd(Grid());
|
CoarseScalar InnerProd(Grid());
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "CoarsenMatrix Orthog " << std::endl;
|
std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl;
|
||||||
// Orthogonalise the subblocks over the basis
|
// Orthogonalise the subblocks over the basis
|
||||||
blockOrthogonalise(InnerProd,Subspace.subspace);
|
blockOrthogonalise(InnerProd,Subspace.subspace);
|
||||||
std::cout << GridLogMessage<< "CoarsenMatrix Orthog done " << std::endl;
|
|
||||||
|
|
||||||
auto OpDirections = linop.Directions();
|
|
||||||
auto OpDisplacements = linop.Displacements();
|
|
||||||
|
|
||||||
std::cout<<" Coarsening an operator with "<< OpDirections.size()<<" terms "<<std::endl;
|
|
||||||
for(int p=0;p<OpDirections.size();p++) {
|
|
||||||
assert(OpDirections[p]==geom.directions[p]);
|
|
||||||
assert(OpDisplacements[p]==geom.displacements[p]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute the matrix elements of linop between this orthonormal
|
// Compute the matrix elements of linop between this orthonormal
|
||||||
// set of vectors.
|
// set of vectors.
|
||||||
|
std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl;
|
||||||
int self_stencil=-1;
|
int self_stencil=-1;
|
||||||
for(int p=0;p<geom.npoint;p++)
|
for(int p=0;p<geom.npoint;p++)
|
||||||
{
|
{
|
||||||
@ -584,21 +907,7 @@ public:
|
|||||||
evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
|
evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
|
||||||
oddmask = one-evenmask;
|
oddmask = one-evenmask;
|
||||||
|
|
||||||
/*
|
|
||||||
{
|
|
||||||
phi=Subspace.subspace[0];
|
|
||||||
linop.OpDirAll(phi,Mphi_p);
|
|
||||||
for(int p=0;p<geom.npoint-1;p++){
|
|
||||||
int dir=geom.directions[p];
|
|
||||||
int disp=geom.displacements[p];
|
|
||||||
linop.OpDir(phi,Mphi,dir,disp);
|
|
||||||
Mphi=Mphi-Mphi_p[p];
|
|
||||||
std::cout << GridLogMessage <<" Direction mapping check " <<norm2(Mphi)<<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
assert(self_stencil!=-1);
|
assert(self_stencil!=-1);
|
||||||
int lhermitian=hermitian;
|
|
||||||
|
|
||||||
for(int i=0;i<nbasis;i++){
|
for(int i=0;i<nbasis;i++){
|
||||||
|
|
||||||
@ -615,7 +924,7 @@ public:
|
|||||||
int dir = geom.directions[p];
|
int dir = geom.directions[p];
|
||||||
int disp = geom.displacements[p];
|
int disp = geom.displacements[p];
|
||||||
|
|
||||||
if ( (disp==-1) || (!lhermitian ) ) {
|
if ( (disp==-1) || (!hermitian ) ) {
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Pick out contributions coming from this cell and neighbour cell
|
// Pick out contributions coming from this cell and neighbour cell
|
||||||
@ -633,7 +942,7 @@ public:
|
|||||||
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
||||||
|
|
||||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
||||||
if ( lhermitian && (disp==-1) ) {
|
if ( hermitian && (disp==-1) ) {
|
||||||
for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
|
for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
|
||||||
int dirp = geom.directions[pp];
|
int dirp = geom.directions[pp];
|
||||||
int dispp = geom.displacements[pp];
|
int dispp = geom.displacements[pp];
|
||||||
@ -645,11 +954,11 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "CoarsenMatrix Diag "<<std::endl;
|
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
// Faster alternate self coupling.. use hermiticity to save 2x
|
// Faster alternate self coupling.. use hermiticity to save 2x
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
@ -681,35 +990,57 @@ public:
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if(hermitian) {
|
||||||
MemoryManager::PrintBytes();
|
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
|
||||||
|
|
||||||
// Auto self test
|
|
||||||
Test( Subspace,FineGrid,linop);
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
///////////////////////////
|
|
||||||
// test code worth preserving in if block
|
|
||||||
///////////////////////////
|
|
||||||
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
|
|
||||||
for(int p=0;p<geom.npoint;p++){
|
|
||||||
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
|
|
||||||
std::cout<<GridLogMessage<< "\n"<<A[p] << std::endl;
|
|
||||||
}
|
}
|
||||||
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
|
|
||||||
|
|
||||||
phi=Subspace.subspace[0];
|
|
||||||
std::vector<int> bc(FineGrid->_ndimension,0);
|
|
||||||
blockPick(Grid(),phi,tmp,bc); // Pick out a block
|
|
||||||
linop.Op(tmp,Mphi); // Apply big dop
|
|
||||||
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
|
|
||||||
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<< iProj <<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
|
||||||
|
FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void InvertSelfStencilLink() {
|
||||||
|
std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
|
||||||
|
int localVolume = Grid()->lSites();
|
||||||
|
|
||||||
|
typedef typename Cobj::scalar_object scalar_object;
|
||||||
|
|
||||||
|
autoView(Aself_v, A[geom.npoint-1], CpuRead);
|
||||||
|
autoView(AselfInv_v, AselfInv, CpuWrite);
|
||||||
|
thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke
|
||||||
|
Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
|
||||||
|
Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
|
||||||
|
|
||||||
|
scalar_object selfLink = Zero();
|
||||||
|
scalar_object selfLinkInv = Zero();
|
||||||
|
|
||||||
|
Coordinate lcoor;
|
||||||
|
|
||||||
|
Grid()->LocalIndexToLocalCoor(site, lcoor);
|
||||||
|
peekLocalSite(selfLink, Aself_v, lcoor);
|
||||||
|
|
||||||
|
for (int i = 0; i < nbasis; ++i)
|
||||||
|
for (int j = 0; j < nbasis; ++j)
|
||||||
|
selfLinkEigen(i, j) = static_cast<ComplexD>(TensorRemove(selfLink(i, j)));
|
||||||
|
|
||||||
|
selfLinkInvEigen = selfLinkEigen.inverse();
|
||||||
|
|
||||||
|
for(int i = 0; i < nbasis; ++i)
|
||||||
|
for(int j = 0; j < nbasis; ++j)
|
||||||
|
selfLinkInv(i, j) = selfLinkInvEigen(i, j);
|
||||||
|
|
||||||
|
pokeLocalSite(selfLinkInv, AselfInv_v, lcoor);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void FillHalfCbs() {
|
||||||
|
std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl;
|
||||||
|
for(int p = 0; p < geom.npoint; ++p) {
|
||||||
|
pickCheckerboard(Even, Aeven[p], A[p]);
|
||||||
|
pickCheckerboard(Odd, Aodd[p], A[p]);
|
||||||
|
}
|
||||||
|
pickCheckerboard(Even, AselfInvEven, AselfInv);
|
||||||
|
pickCheckerboard(Odd, AselfInvOdd, AselfInv);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -52,9 +52,6 @@ public:
|
|||||||
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
|
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
|
||||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
|
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
|
||||||
virtual void HermOp(const Field &in, Field &out)=0;
|
virtual void HermOp(const Field &in, Field &out)=0;
|
||||||
|
|
||||||
virtual std::vector<int> Directions(void) =0;
|
|
||||||
virtual std::vector<int> Displacements(void)=0;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -79,9 +76,6 @@ class MdagMLinearOperator : public LinearOperatorBase<Field> {
|
|||||||
public:
|
public:
|
||||||
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
|
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
|
||||||
|
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
|
|
||||||
// Support for coarsening to a multigrid
|
// Support for coarsening to a multigrid
|
||||||
void OpDiag (const Field &in, Field &out) {
|
void OpDiag (const Field &in, Field &out) {
|
||||||
_Mat.Mdiag(in,out);
|
_Mat.Mdiag(in,out);
|
||||||
@ -117,8 +111,6 @@ class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
|
|||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
RealD _shift;
|
RealD _shift;
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
|
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
|
||||||
// Support for coarsening to a multigrid
|
// Support for coarsening to a multigrid
|
||||||
void OpDiag (const Field &in, Field &out) {
|
void OpDiag (const Field &in, Field &out) {
|
||||||
@ -159,8 +151,6 @@ template<class Matrix,class Field>
|
|||||||
class HermitianLinearOperator : public LinearOperatorBase<Field> {
|
class HermitianLinearOperator : public LinearOperatorBase<Field> {
|
||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
|
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
|
||||||
// Support for coarsening to a multigrid
|
// Support for coarsening to a multigrid
|
||||||
void OpDiag (const Field &in, Field &out) {
|
void OpDiag (const Field &in, Field &out) {
|
||||||
@ -192,8 +182,6 @@ template<class Matrix,class Field>
|
|||||||
class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
|
class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
|
||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
|
NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
|
||||||
// Support for coarsening to a multigrid
|
// Support for coarsening to a multigrid
|
||||||
void OpDiag (const Field &in, Field &out) {
|
void OpDiag (const Field &in, Field &out) {
|
||||||
@ -267,8 +255,6 @@ template<class Matrix,class Field>
|
|||||||
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
|
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
|
||||||
public:
|
public:
|
||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
|
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
|
||||||
virtual void Mpc (const Field &in, Field &out) {
|
virtual void Mpc (const Field &in, Field &out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
@ -295,8 +281,6 @@ template<class Matrix,class Field>
|
|||||||
protected:
|
protected:
|
||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
|
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
|
||||||
|
|
||||||
virtual void Mpc (const Field &in, Field &out) {
|
virtual void Mpc (const Field &in, Field &out) {
|
||||||
@ -323,8 +307,6 @@ template<class Matrix,class Field>
|
|||||||
protected:
|
protected:
|
||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
|
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
|
||||||
|
|
||||||
virtual void Mpc (const Field &in, Field &out) {
|
virtual void Mpc (const Field &in, Field &out) {
|
||||||
@ -390,8 +372,6 @@ class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Matrix& _Mat;
|
Matrix& _Mat;
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
|
NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
|
||||||
virtual void Mpc(const Field& in, Field& out) {
|
virtual void Mpc(const Field& in, Field& out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
@ -425,8 +405,6 @@ class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Fi
|
|||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
|
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
|
||||||
virtual void Mpc(const Field& in, Field& out) {
|
virtual void Mpc(const Field& in, Field& out) {
|
||||||
Field tmp(in.Grid());
|
Field tmp(in.Grid());
|
||||||
@ -457,8 +435,6 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi
|
|||||||
Matrix& _Mat;
|
Matrix& _Mat;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
|
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
|
||||||
|
|
||||||
virtual void Mpc(const Field& in, Field& out) {
|
virtual void Mpc(const Field& in, Field& out) {
|
||||||
@ -499,8 +475,6 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
|
|||||||
Field tmp;
|
Field tmp;
|
||||||
RealD mass;
|
RealD mass;
|
||||||
public:
|
public:
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
|
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
|
||||||
{
|
{
|
||||||
assert( _Mat.isTrivialEE() );
|
assert( _Mat.isTrivialEE() );
|
||||||
|
@ -48,8 +48,6 @@ public:
|
|||||||
virtual void Mdiag (const Field &in, Field &out)=0;
|
virtual void Mdiag (const Field &in, Field &out)=0;
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
|
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
|
||||||
virtual std::vector<int> Directions(void) =0;
|
|
||||||
virtual std::vector<int> Displacements(void)=0;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -75,8 +73,6 @@ public:
|
|||||||
virtual void MooeeDag (const Field &in, Field &out)=0;
|
virtual void MooeeDag (const Field &in, Field &out)=0;
|
||||||
virtual void MooeeInvDag (const Field &in, Field &out)=0;
|
virtual void MooeeInvDag (const Field &in, Field &out)=0;
|
||||||
|
|
||||||
virtual std::vector<int> Directions(void) =0;
|
|
||||||
virtual std::vector<int> Displacements(void)=0;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
|
#ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
|
||||||
#define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
|
#define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
/*
|
/*
|
||||||
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
|
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
|
||||||
* Script A = SolverMatrix
|
* Script A = SolverMatrix
|
||||||
@ -51,54 +50,53 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
* Vout = x
|
* Vout = x
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// abstract base
|
||||||
template<class Field, class CoarseField, class Aggregates>
|
template<class Field, class CoarseField>
|
||||||
class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
|
||||||
int verbose;
|
int verbose;
|
||||||
|
|
||||||
RealD Tolerance;
|
RealD Tolerance;
|
||||||
Integer MaxIterations;
|
Integer MaxIterations;
|
||||||
const int mmax = 4;
|
const int mmax = 5;
|
||||||
GridBase *FineGrid;
|
GridBase *grid;
|
||||||
GridBase *CoarseGrid;
|
GridBase *coarsegrid;
|
||||||
|
|
||||||
LinearOperatorBase<Field> &_Linop;
|
LinearOperatorBase<Field> *_Linop
|
||||||
LinearFunction<Field> &_Smoother;
|
OperatorFunction<Field> *_Smoother,
|
||||||
LinearFunction<CoarseField> &_CoarseSolver;
|
LinearFunction<CoarseField> *_CoarseSolver;
|
||||||
Aggregates &_Aggregates;
|
|
||||||
|
// Need somthing that knows how to get from Coarse to fine and back again
|
||||||
|
|
||||||
// more most opertor functions
|
// more most opertor functions
|
||||||
TwoLevelFlexiblePcg(RealD tol,
|
TwoLevelFlexiblePcg(RealD tol,
|
||||||
Integer maxit,
|
Integer maxit,
|
||||||
LinearOperatorBase<Field> *Linop,
|
LinearOperatorBase<Field> *Linop,
|
||||||
LinearFunction<Field> *Smoother,
|
LinearOperatorBase<Field> *SmootherLinop,
|
||||||
LinearFunction<CoarseField> *CoarseSolver,
|
OperatorFunction<Field> *Smoother,
|
||||||
Aggregates *AggP
|
OperatorFunction<CoarseField> CoarseLinop
|
||||||
) :
|
) :
|
||||||
Tolerance(tol),
|
Tolerance(tol),
|
||||||
MaxIterations(maxit),
|
MaxIterations(maxit),
|
||||||
_Linop(*Linop),
|
_Linop(Linop),
|
||||||
_Smoother(*Smoother),
|
_PreconditionerLinop(PrecLinop),
|
||||||
_CoarseSolver(*CoarseSolver),
|
_Preconditioner(Preconditioner)
|
||||||
_Aggregates(*AggP)
|
|
||||||
{
|
{
|
||||||
CoarseGrid=_Aggregates.CoarseGrid;
|
|
||||||
FineGrid=_Aggregates.FineGrid;
|
|
||||||
verbose=0;
|
verbose=0;
|
||||||
};
|
};
|
||||||
|
|
||||||
// The Pcg routine is common to all, but the various matrices differ from derived
|
// The Pcg routine is common to all, but the various matrices differ from derived
|
||||||
// implementation to derived implmentation
|
// implementation to derived implmentation
|
||||||
|
void operator() (const Field &src, Field &psi){
|
||||||
void operator() (const Field &src, Field &psi){
|
void operator() (const Field &src, Field &psi){
|
||||||
|
|
||||||
psi.Checkerboard() = src.Checkerboard();
|
psi.Checkerboard() = src.Checkerboard();
|
||||||
|
grid = src.Grid();
|
||||||
|
|
||||||
|
RealD f;
|
||||||
RealD rtzp,rtz,a,d,b;
|
RealD rtzp,rtz,a,d,b;
|
||||||
// RealD rptzp;
|
RealD rptzp;
|
||||||
// RealD tn;
|
RealD tn;
|
||||||
RealD guess = norm2(psi);
|
RealD guess = norm2(psi);
|
||||||
RealD ssq = norm2(src);
|
RealD ssq = norm2(src);
|
||||||
RealD rsq = ssq*Tolerance*Tolerance;
|
RealD rsq = ssq*Tolerance*Tolerance;
|
||||||
@ -106,15 +104,15 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Set up history vectors
|
// Set up history vectors
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
std::vector<Field> p (mmax,FineGrid);
|
std::vector<Field> p (mmax,grid);
|
||||||
std::vector<Field> mmp(mmax,FineGrid);
|
std::vector<Field> mmp(mmax,grid);
|
||||||
std::vector<RealD> pAp(mmax);
|
std::vector<RealD> pAp(mmax);
|
||||||
|
|
||||||
Field x (FineGrid); x = psi;
|
Field x (grid); x = psi;
|
||||||
Field z (FineGrid);
|
Field z (grid);
|
||||||
Field tmp(FineGrid);
|
Field tmp(grid);
|
||||||
Field r (FineGrid);
|
Field r (grid);
|
||||||
Field mu (FineGrid);
|
Field mu (grid);
|
||||||
|
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
// x0 = Vstart -- possibly modify guess
|
// x0 = Vstart -- possibly modify guess
|
||||||
@ -123,13 +121,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
|||||||
Vstart(x,src);
|
Vstart(x,src);
|
||||||
|
|
||||||
// r0 = b -A x0
|
// r0 = b -A x0
|
||||||
_Linop.HermOp(x,mmp[0]); // Shouldn't this be something else?
|
HermOp(x,mmp); // Shouldn't this be something else?
|
||||||
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
|
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
|
||||||
|
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
// Compute z = M1 x
|
// Compute z = M1 x
|
||||||
//////////////////////////////////
|
//////////////////////////////////
|
||||||
M1(r,z);
|
M1(r,z,tmp,mp,SmootherMirs);
|
||||||
rtzp =real(innerProduct(r,z));
|
rtzp =real(innerProduct(r,z));
|
||||||
|
|
||||||
///////////////////////////////////////
|
///////////////////////////////////////
|
||||||
@ -145,7 +143,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
|||||||
int peri_kp = (k+1) % mmax;
|
int peri_kp = (k+1) % mmax;
|
||||||
|
|
||||||
rtz=rtzp;
|
rtz=rtzp;
|
||||||
d= M3(p[peri_k],mmp[peri_k]);
|
d= M3(p[peri_k],mp,mmp[peri_k],tmp);
|
||||||
a = rtz/d;
|
a = rtz/d;
|
||||||
|
|
||||||
// Memorise this
|
// Memorise this
|
||||||
@ -155,13 +153,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
|||||||
RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
|
RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
|
||||||
|
|
||||||
// Compute z = M x
|
// Compute z = M x
|
||||||
M1(r,z);
|
M1(r,z,tmp,mp);
|
||||||
|
|
||||||
rtzp =real(innerProduct(r,z));
|
rtzp =real(innerProduct(r,z));
|
||||||
|
|
||||||
M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
|
M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
|
||||||
|
|
||||||
p[peri_kp]=mu;
|
p[peri_kp]=p[peri_k];
|
||||||
|
|
||||||
// Standard search direction p -> z + b p ; b =
|
// Standard search direction p -> z + b p ; b =
|
||||||
b = (rtzp)/rtz;
|
b = (rtzp)/rtz;
|
||||||
@ -183,7 +181,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
|||||||
// Stopping condition
|
// Stopping condition
|
||||||
if ( rn <= rsq ) {
|
if ( rn <= rsq ) {
|
||||||
|
|
||||||
_Linop.HermOp(x,mmp[0]); // Shouldn't this be something else?
|
HermOp(x,mmp); // Shouldn't this be something else?
|
||||||
axpy(tmp,-1.0,src,mmp[0]);
|
axpy(tmp,-1.0,src,mmp[0]);
|
||||||
|
|
||||||
RealD psinorm = sqrt(norm2(x));
|
RealD psinorm = sqrt(norm2(x));
|
||||||
@ -192,8 +190,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
|||||||
RealD true_residual = tmpnorm/srcnorm;
|
RealD true_residual = tmpnorm/srcnorm;
|
||||||
std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
|
std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
|
std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
|
||||||
|
return k;
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Non-convergence
|
// Non-convergence
|
||||||
@ -202,40 +199,48 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
virtual void M1(Field & in, Field & out)
|
virtual void M(Field & in,Field & out,Field & tmp) {
|
||||||
{// the smoother
|
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual void M1(Field & in, Field & out) {// the smoother
|
||||||
|
|
||||||
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
|
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
|
||||||
Field tmp(FineGrid);
|
Field tmp(grid);
|
||||||
Field Min(FineGrid);
|
Field Min(grid);
|
||||||
|
|
||||||
CoarseField PleftProj(CoarseGrid);
|
PcgM(in,Min); // Smoother call
|
||||||
CoarseField PleftMss_proj(CoarseGrid);
|
|
||||||
|
|
||||||
_Smoother(in,Min); // Smoother call
|
HermOp(Min,out);
|
||||||
|
|
||||||
_Linop.HermOp(Min,out);
|
|
||||||
axpy(tmp,-1.0,out,in); // tmp = in - A Min
|
axpy(tmp,-1.0,out,in); // tmp = in - A Min
|
||||||
|
|
||||||
_Aggregates.ProjectToSubspace(PleftProj,tmp);
|
ProjectToSubspace(tmp,PleftProj);
|
||||||
_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
|
ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
|
||||||
_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
|
PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
|
||||||
axpy(out,1.0,Min,tmp); // Min+tmp
|
axpy(out,1.0,Min,tmp); // Min+tmp
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void M2(const Field & in, Field & out)
|
virtual void M2(const Field & in, Field & out) {
|
||||||
{
|
|
||||||
out=in;
|
out=in;
|
||||||
|
// Must override for Def2 only
|
||||||
|
// case PcgDef2:
|
||||||
|
// Pright(in,out);
|
||||||
|
// break;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual RealD M3(const Field & p, Field & mmp)
|
virtual RealD M3(const Field & p, Field & mmp){
|
||||||
{
|
|
||||||
double d,dd;
|
double d,dd;
|
||||||
_Linop.HermOpAndNorm(p,mmp,d,dd);
|
HermOpAndNorm(p,mmp,d,dd);
|
||||||
return dd;
|
return dd;
|
||||||
|
// Must override for Def1 only
|
||||||
|
// case PcgDef1:
|
||||||
|
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
|
||||||
|
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
|
||||||
|
// Pleft(mp,mmp);
|
||||||
|
// d=real(linop_d->inner(p,mmp));
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void Vstart(Field & x,const Field & src)
|
virtual void VstartDef2(Field & xconst Field & src){
|
||||||
{
|
|
||||||
//case PcgDef2:
|
//case PcgDef2:
|
||||||
//case PcgAdef2:
|
//case PcgAdef2:
|
||||||
//case PcgAdef2f:
|
//case PcgAdef2f:
|
||||||
@ -251,79 +256,142 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
|
|||||||
// = src_s - (A guess)_s - src_s + (A guess)_s
|
// = src_s - (A guess)_s - src_s + (A guess)_s
|
||||||
// = 0
|
// = 0
|
||||||
///////////////////////////////////
|
///////////////////////////////////
|
||||||
Field r(FineGrid);
|
Field r(grid);
|
||||||
Field mmp(FineGrid);
|
Field mmp(grid);
|
||||||
|
|
||||||
CoarseField PleftProj(CoarseGrid);
|
HermOp(x,mmp);
|
||||||
CoarseField PleftMss_proj(CoarseGrid);
|
|
||||||
|
|
||||||
_Linop.HermOp(x,mmp);
|
|
||||||
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
|
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
|
||||||
_Aggregates.ProjectToSubspace(PleftProj,r);
|
ProjectToSubspace(r,PleftProj);
|
||||||
_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} r_s
|
ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
|
||||||
_Aggregates.PromoteFromSubspace(PleftMss_proj,mmp);
|
PromoteFromSubspace(PleftMss_proj,mmp);
|
||||||
x=x+mmp;
|
x=x+mmp;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual void Vstart(Field & x,const Field & src){
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
// Only Def1 has non-trivial Vout. Override in Def1
|
// Only Def1 has non-trivial Vout. Override in Def1
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
virtual void Vout (Field & in, Field & out,Field & src){
|
virtual void Vout (Field & in, Field & out,Field & src){
|
||||||
out = in;
|
out = in;
|
||||||
|
//case PcgDef1:
|
||||||
|
// //Qb + PT x
|
||||||
|
// ProjectToSubspace(src,PleftProj);
|
||||||
|
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
|
||||||
|
// PromoteFromSubspace(PleftMss_proj,tmp);
|
||||||
|
//
|
||||||
|
// Pright(in,out);
|
||||||
|
//
|
||||||
|
// linop_d->axpy(out,tmp,out,1.0);
|
||||||
|
// break;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Pright and Pleft are common to all implementations
|
// Pright and Pleft are common to all implementations
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
virtual void Pright(Field & in,Field & out)
|
virtual void Pright(Field & in,Field & out){
|
||||||
{
|
|
||||||
// P_R = [ 1 0 ]
|
// P_R = [ 1 0 ]
|
||||||
// [ -Mss^-1 Msb 0 ]
|
// [ -Mss^-1 Msb 0 ]
|
||||||
Field in_sbar(FineGrid);
|
Field in_sbar(grid);
|
||||||
|
|
||||||
CoarseField PleftProj(CoarseGrid);
|
ProjectToSubspace(in,PleftProj);
|
||||||
CoarseField PleftMss_proj(CoarseGrid);
|
PromoteFromSubspace(PleftProj,out);
|
||||||
|
|
||||||
_Aggregates.ProjectToSubspace(PleftProj,in);
|
|
||||||
_Aggregates.PromoteFromSubspace(PleftProj,out);
|
|
||||||
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
|
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
|
||||||
|
|
||||||
_Linop.HermOp(in_sbar,out);
|
HermOp(in_sbar,out);
|
||||||
_Aggregates.ProjectToSubspace(PleftProj,out); // Mssbar in_sbar (project)
|
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
|
||||||
|
|
||||||
_CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
|
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
|
||||||
_Aggregates.PromoteFromSubspace(PleftMss_proj,out); //
|
PromoteFromSubspace(PleftMss_proj,out); //
|
||||||
|
|
||||||
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
|
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
|
||||||
}
|
}
|
||||||
virtual void Pleft (Field & in,Field & out)
|
virtual void Pleft (Field & in,Field & out){
|
||||||
{
|
|
||||||
// P_L = [ 1 -Mbs Mss^-1]
|
// P_L = [ 1 -Mbs Mss^-1]
|
||||||
// [ 0 0 ]
|
// [ 0 0 ]
|
||||||
Field in_sbar(FineGrid);
|
Field in_sbar(grid);
|
||||||
Field tmp2(FineGrid);
|
Field tmp2(grid);
|
||||||
Field Mtmp(FineGrid);
|
Field Mtmp(grid);
|
||||||
|
|
||||||
CoarseField PleftProj(CoarseGrid);
|
ProjectToSubspace(in,PleftProj);
|
||||||
CoarseField PleftMss_proj(CoarseGrid);
|
PromoteFromSubspace(PleftProj,out);
|
||||||
|
|
||||||
_Aggregates.ProjectToSubspace(PleftProj,in);
|
|
||||||
_Aggregates.PromoteFromSubspace(PleftProj,out);
|
|
||||||
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
|
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
|
||||||
|
|
||||||
_CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} in_s
|
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
|
||||||
_Aggregates.PromoteFromSubspace(PleftMss_proj,out);
|
PromoteFromSubspace(PleftMss_proj,out);
|
||||||
|
|
||||||
_Linop.HermOp(out,Mtmp);
|
HermOp(out,Mtmp);
|
||||||
|
|
||||||
_Aggregates.ProjectToSubspace(PleftProj,Mtmp); // Msbar s Mss^{-1}
|
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
|
||||||
_Aggregates.PromoteFromSubspace(PleftProj,tmp2);
|
PromoteFromSubspace(PleftProj,tmp2);
|
||||||
|
|
||||||
axpy(out,-1.0,tmp2,Mtmp);
|
axpy(out,-1.0,tmp2,Mtmp);
|
||||||
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
|
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
|
||||||
|
public:
|
||||||
|
virtual void M(Field & in,Field & out,Field & tmp){
|
||||||
|
|
||||||
|
}
|
||||||
|
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
|
||||||
|
|
||||||
|
}
|
||||||
|
virtual void M2(Field & in, Field & out){
|
||||||
|
|
||||||
|
}
|
||||||
|
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
|
||||||
|
|
||||||
|
}
|
||||||
|
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
template<class Field>
|
||||||
|
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
|
||||||
|
public:
|
||||||
|
virtual void M(Field & in,Field & out,Field & tmp);
|
||||||
|
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
|
||||||
|
virtual void M2(Field & in, Field & out);
|
||||||
|
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
|
||||||
|
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
|
||||||
|
public:
|
||||||
|
virtual void M(Field & in,Field & out,Field & tmp);
|
||||||
|
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
|
||||||
|
virtual void M2(Field & in, Field & out);
|
||||||
|
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
|
||||||
|
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
|
||||||
|
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
|
||||||
|
public:
|
||||||
|
virtual void M(Field & in,Field & out,Field & tmp);
|
||||||
|
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
|
||||||
|
virtual void M2(Field & in, Field & out);
|
||||||
|
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
|
||||||
|
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
|
||||||
|
public:
|
||||||
|
virtual void M(Field & in,Field & out,Field & tmp);
|
||||||
|
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
|
||||||
|
virtual void M2(Field & in, Field & out);
|
||||||
|
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
|
||||||
|
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
|
||||||
|
}
|
||||||
|
*/
|
||||||
#endif
|
#endif
|
||||||
|
@ -60,8 +60,6 @@ public:
|
|||||||
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
|
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
|
||||||
|
|
||||||
virtual void operator()(const Field &src,Field &guess) {
|
virtual void operator()(const Field &src,Field &guess) {
|
||||||
RealD t=-usecond();
|
|
||||||
|
|
||||||
guess = Zero();
|
guess = Zero();
|
||||||
assert(evec.size()==eval.size());
|
assert(evec.size()==eval.size());
|
||||||
auto N = evec.size();
|
auto N = evec.size();
|
||||||
@ -70,8 +68,6 @@ public:
|
|||||||
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
|
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
|
||||||
}
|
}
|
||||||
guess.Checkerboard() = src.Checkerboard();
|
guess.Checkerboard() = src.Checkerboard();
|
||||||
t+=usecond();
|
|
||||||
std::cout<<GridLogMessage<<"\t\t\t" << "Deflated guess took "<< t/1000.0<< "ms" <<std::endl;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -59,7 +59,7 @@ public:
|
|||||||
GridBase *grid = src.Grid();
|
GridBase *grid = src.Grid();
|
||||||
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
|
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
|
||||||
|
|
||||||
psi=Zero();
|
psi=zero;
|
||||||
r = src;
|
r = src;
|
||||||
Preconditioner(r,p);
|
Preconditioner(r,p);
|
||||||
|
|
||||||
|
@ -1,67 +0,0 @@
|
|||||||
#include <Grid/GridCore.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
|
||||||
bool MemoryProfiler::debug = false;
|
|
||||||
|
|
||||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
|
||||||
{
|
|
||||||
#ifdef __linux__
|
|
||||||
int fd = open("/proc/self/pagemap", O_RDONLY);
|
|
||||||
assert(fd >= 0);
|
|
||||||
const int page_size = 4096;
|
|
||||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
|
||||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
|
||||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
|
||||||
uint64_t pagedata[npages];
|
|
||||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
|
||||||
assert(ret == offset);
|
|
||||||
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
|
||||||
assert(ret == sizeof(uint64_t) * npages);
|
|
||||||
int nhugepages = npages / 512;
|
|
||||||
int n4ktotal, nnothuge;
|
|
||||||
n4ktotal = 0;
|
|
||||||
nnothuge = 0;
|
|
||||||
for (int i = 0; i < nhugepages; ++i) {
|
|
||||||
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
|
|
||||||
for (int j = 0; j < 512; ++j) {
|
|
||||||
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
|
|
||||||
++n4ktotal;
|
|
||||||
if (pageaddr != baseaddr + j * page_size)
|
|
||||||
++nnothuge;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
int rank = CartesianCommunicator::RankWorld();
|
|
||||||
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
// Formats a byte count as a short human-readable string using binary
// (1024-based) steps and the suffixes "", K, M, G, T, P, E followed by "B".
//
//   bytes : number of bytes to describe
//   returns e.g. "0 B", "1023 B", "1 KB", "1.5 KB", "16 EB"
//
// Values that scale to a whole number are printed as integers, all others
// with one decimal place.
////////////////////////////////////////////////////////////////////////////////
std::string sizeString(const size_t bytes)
{
  constexpr unsigned int bufSize  = 256;
  constexpr size_t       nSuffix  = 7;
  const char *suffixes[nSuffix]   = {"", "K", "M", "G", "T", "P", "E"};
  char   buf[bufSize];             // sized from bufSize so the snprintf limit always matches
  size_t s     = 0;
  double count = bytes;

  // Stop at the last suffix: the previous bound (s < 7) allowed s to reach 7
  // and index one past the end of suffixes[].
  while (count >= 1024 && s + 1 < nSuffix)
  {
    s++;
    count /= 1024;
  }
  if (count - floor(count) == 0.0)
  {
    // Whole number after scaling: print without a fractional part
    snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
  }
  else
  {
    snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
  }

  return std::string(buf);
}
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
@ -53,11 +53,7 @@ public:
|
|||||||
{
|
{
|
||||||
size_type bytes = __n*sizeof(_Tp);
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
profilerAllocate(bytes);
|
profilerAllocate(bytes);
|
||||||
#ifdef GRID_UVM
|
|
||||||
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
|
|
||||||
#else
|
|
||||||
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
|
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
|
||||||
#endif
|
|
||||||
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -66,11 +62,7 @@ public:
|
|||||||
{
|
{
|
||||||
size_type bytes = __n * sizeof(_Tp);
|
size_type bytes = __n * sizeof(_Tp);
|
||||||
profilerFree(bytes);
|
profilerFree(bytes);
|
||||||
#ifdef GRID_UVM
|
|
||||||
MemoryManager::SharedFree((void *)__p,bytes);
|
|
||||||
#else
|
|
||||||
MemoryManager::CpuFree((void *)__p,bytes);
|
MemoryManager::CpuFree((void *)__p,bytes);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||||
@ -173,9 +165,18 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Template typedefs
|
// Template typedefs
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//template<class T> using commAllocator = devAllocator<T>;
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
// Cshift on device
|
||||||
|
template<class T> using cshiftAllocator = devAllocator<T>;
|
||||||
|
#else
|
||||||
|
// Cshift on host
|
||||||
|
template<class T> using cshiftAllocator = std::allocator<T>;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
||||||
|
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
|
||||||
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
||||||
|
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -9,13 +9,11 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
#define AccSmall (3)
|
#define AccSmall (3)
|
||||||
#define Shared (4)
|
#define Shared (4)
|
||||||
#define SharedSmall (5)
|
#define SharedSmall (5)
|
||||||
uint64_t total_cache;
|
|
||||||
uint64_t total_shared;
|
uint64_t total_shared;
|
||||||
uint64_t total_device;
|
uint64_t total_device;
|
||||||
uint64_t total_host;;
|
uint64_t total_host;;
|
||||||
void MemoryManager::PrintBytes(void)
|
void MemoryManager::PrintBytes(void)
|
||||||
{
|
{
|
||||||
std::cout << " MemoryManager : "<<total_cache <<" cache bytes "<<std::endl;
|
|
||||||
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
|
std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
|
||||||
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
|
std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
|
||||||
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
|
std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
|
||||||
@ -37,8 +35,6 @@ void *MemoryManager::AcceleratorAllocate(size_t bytes)
|
|||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocDevice(bytes);
|
ptr = (void *) acceleratorAllocDevice(bytes);
|
||||||
total_device+=bytes;
|
total_device+=bytes;
|
||||||
} else {
|
|
||||||
// std::cout <<"AcceleratorAllocate: cache hit Device pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
|
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -57,10 +53,8 @@ void *MemoryManager::SharedAllocate(size_t bytes)
|
|||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocShared(bytes);
|
ptr = (void *) acceleratorAllocShared(bytes);
|
||||||
total_shared+=bytes;
|
total_shared+=bytes;
|
||||||
// std::cout <<"SharedAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
||||||
// PrintBytes();
|
// PrintBytes();
|
||||||
} else {
|
|
||||||
// std::cout <<"SharedAllocate: cache hit Shared pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
|
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -80,9 +74,6 @@ void *MemoryManager::CpuAllocate(size_t bytes)
|
|||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocShared(bytes);
|
ptr = (void *) acceleratorAllocShared(bytes);
|
||||||
total_host+=bytes;
|
total_host+=bytes;
|
||||||
// std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
|
||||||
} else {
|
|
||||||
// std::cout <<"CpufAllocate: cache hit Cpu pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
|
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -129,7 +120,7 @@ void MemoryManager::Init(void)
|
|||||||
str= getenv("GRID_ALLOC_NCACHE_LARGE");
|
str= getenv("GRID_ALLOC_NCACHE_LARGE");
|
||||||
if ( str ) {
|
if ( str ) {
|
||||||
Nc = atoi(str);
|
Nc = atoi(str);
|
||||||
if ( (Nc>=0) && (Nc <= NallocCacheMax)) {
|
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
|
||||||
Ncache[Cpu]=Nc;
|
Ncache[Cpu]=Nc;
|
||||||
Ncache[Acc]=Nc;
|
Ncache[Acc]=Nc;
|
||||||
Ncache[Shared]=Nc;
|
Ncache[Shared]=Nc;
|
||||||
@ -139,7 +130,7 @@ void MemoryManager::Init(void)
|
|||||||
str= getenv("GRID_ALLOC_NCACHE_SMALL");
|
str= getenv("GRID_ALLOC_NCACHE_SMALL");
|
||||||
if ( str ) {
|
if ( str ) {
|
||||||
Nc = atoi(str);
|
Nc = atoi(str);
|
||||||
if ( (Nc>=0) && (Nc <= NallocCacheMax)) {
|
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
|
||||||
Ncache[CpuSmall]=Nc;
|
Ncache[CpuSmall]=Nc;
|
||||||
Ncache[AccSmall]=Nc;
|
Ncache[AccSmall]=Nc;
|
||||||
Ncache[SharedSmall]=Nc;
|
Ncache[SharedSmall]=Nc;
|
||||||
@ -220,7 +211,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
|
|||||||
|
|
||||||
if ( entries[v].valid ) {
|
if ( entries[v].valid ) {
|
||||||
ret = entries[v].address;
|
ret = entries[v].address;
|
||||||
total_cache-=entries[v].bytes;
|
|
||||||
entries[v].valid = 0;
|
entries[v].valid = 0;
|
||||||
entries[v].address = NULL;
|
entries[v].address = NULL;
|
||||||
entries[v].bytes = 0;
|
entries[v].bytes = 0;
|
||||||
@ -229,7 +219,6 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
|
|||||||
entries[v].address=ptr;
|
entries[v].address=ptr;
|
||||||
entries[v].bytes =bytes;
|
entries[v].bytes =bytes;
|
||||||
entries[v].valid =1;
|
entries[v].valid =1;
|
||||||
total_cache+=entries[v].bytes;
|
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -254,7 +243,6 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
|
|||||||
for(int e=0;e<ncache;e++){
|
for(int e=0;e<ncache;e++){
|
||||||
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
|
||||||
entries[e].valid = 0;
|
entries[e].valid = 0;
|
||||||
total_cache-=bytes;
|
|
||||||
return entries[e].address;
|
return entries[e].address;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
// Move control to configure.ac and Config.h?
|
// Move control to configure.ac and Config.h?
|
||||||
|
|
||||||
#define ALLOCATION_CACHE
|
|
||||||
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
|
||||||
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
||||||
|
|
||||||
/*Pinning pages is costly*/
|
/*Pinning pages is costly*/
|
||||||
@ -93,8 +91,8 @@ private:
|
|||||||
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
|
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
|
||||||
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
|
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
|
||||||
|
|
||||||
public:
|
|
||||||
static void PrintBytes(void);
|
static void PrintBytes(void);
|
||||||
|
public:
|
||||||
static void Init(void);
|
static void Init(void);
|
||||||
static void InitMessage(void);
|
static void InitMessage(void);
|
||||||
static void *AcceleratorAllocate(size_t bytes);
|
static void *AcceleratorAllocate(size_t bytes);
|
||||||
|
@ -1,11 +1,12 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
#ifndef GRID_UVM
|
#ifndef GRID_UVM
|
||||||
|
|
||||||
#warning "Using explicit device memory copies"
|
#warning "Using explicit device memory copies"
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
|
||||||
#define dprintf(...)
|
#define dprintf(...)
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// For caching copies of data on device
|
// For caching copies of data on device
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
assert(AccCache.state!=Empty);
|
assert(AccCache.state!=Empty);
|
||||||
|
|
||||||
// dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||||
assert(AccCache.accLock==0);
|
assert(AccCache.accLock==0);
|
||||||
assert(AccCache.cpuLock==0);
|
assert(AccCache.cpuLock==0);
|
||||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||||
@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
||||||
DeviceBytes -=AccCache.bytes;
|
DeviceBytes -=AccCache.bytes;
|
||||||
LRUremove(AccCache);
|
LRUremove(AccCache);
|
||||||
// dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||||
}
|
}
|
||||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||||
EntryErase(CpuPtr);
|
EntryErase(CpuPtr);
|
||||||
@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
assert(AccCache.state!=Empty);
|
assert(AccCache.state!=Empty);
|
||||||
|
|
||||||
// dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||||
assert(AccCache.accLock==0);
|
assert(AccCache.accLock==0);
|
||||||
assert(AccCache.cpuLock==0);
|
assert(AccCache.cpuLock==0);
|
||||||
if(AccCache.state==AccDirty) {
|
if(AccCache.state==AccDirty) {
|
||||||
@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
|||||||
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
||||||
DeviceBytes -=AccCache.bytes;
|
DeviceBytes -=AccCache.bytes;
|
||||||
LRUremove(AccCache);
|
LRUremove(AccCache);
|
||||||
// dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||||
}
|
}
|
||||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||||
EntryErase(CpuPtr);
|
EntryErase(CpuPtr);
|
||||||
@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
|||||||
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
||||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||||
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
||||||
// dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||||
DeviceToHostBytes+=AccCache.bytes;
|
DeviceToHostBytes+=AccCache.bytes;
|
||||||
DeviceToHostXfer++;
|
DeviceToHostXfer++;
|
||||||
AccCache.state=Consistent;
|
AccCache.state=Consistent;
|
||||||
@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
|
|||||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||||
DeviceBytes+=AccCache.bytes;
|
DeviceBytes+=AccCache.bytes;
|
||||||
}
|
}
|
||||||
// dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||||
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
||||||
HostToDeviceBytes+=AccCache.bytes;
|
HostToDeviceBytes+=AccCache.bytes;
|
||||||
HostToDeviceXfer++;
|
HostToDeviceXfer++;
|
||||||
@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
// Find if present, otherwise get or force an empty
|
// Find if present, otherwise get or force an empty
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
if ( EntryPresent(CpuPtr)==0 ){
|
if ( EntryPresent(CpuPtr)==0 ){
|
||||||
EvictVictims(bytes);
|
|
||||||
EntryCreate(CpuPtr,bytes,mode,hint);
|
EntryCreate(CpuPtr,bytes,mode,hint);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
auto & AccCache = AccCacheIterator->second;
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
if (!AccCache.AccPtr) {
|
||||||
|
EvictVictims(bytes);
|
||||||
|
}
|
||||||
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
|
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
|
||||||
|
|
||||||
assert(AccCache.cpuLock==0); // Programming error
|
assert(AccCache.cpuLock==0); // Programming error
|
||||||
|
|
||||||
if(AccCache.state!=Empty) {
|
if(AccCache.state!=Empty) {
|
||||||
|
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
|
||||||
|
(uint64_t)AccCache.CpuPtr,
|
||||||
|
(uint64_t)CpuPtr,
|
||||||
|
(uint64_t)AccCache.bytes,
|
||||||
|
(uint64_t)bytes);
|
||||||
assert(AccCache.CpuPtr == CpuPtr);
|
assert(AccCache.CpuPtr == CpuPtr);
|
||||||
assert(AccCache.bytes ==bytes);
|
assert(AccCache.bytes ==bytes);
|
||||||
}
|
}
|
||||||
@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||||
}
|
}
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
// printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
|
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
|
||||||
} else if(AccCache.state==Consistent) {
|
} else if(AccCache.state==Consistent) {
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||||
else
|
else
|
||||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
// printf("Consistent entry into device accLock %d\n",AccCache.accLock);
|
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
|
||||||
} else if(AccCache.state==AccDirty) {
|
} else if(AccCache.state==AccDirty) {
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||||
else
|
else
|
||||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
// printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
|
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@ -361,13 +368,16 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
|
|||||||
// Find if present, otherwise get or force an empty
|
// Find if present, otherwise get or force an empty
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
if ( EntryPresent(CpuPtr)==0 ){
|
if ( EntryPresent(CpuPtr)==0 ){
|
||||||
EvictVictims(bytes);
|
|
||||||
EntryCreate(CpuPtr,bytes,mode,transient);
|
EntryCreate(CpuPtr,bytes,mode,transient);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
auto & AccCache = AccCacheIterator->second;
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
|
||||||
|
if (!AccCache.AccPtr) {
|
||||||
|
EvictVictims(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
assert((mode==CpuRead)||(mode==CpuWrite));
|
assert((mode==CpuRead)||(mode==CpuWrite));
|
||||||
assert(AccCache.accLock==0); // Programming error
|
assert(AccCache.accLock==0); // Programming error
|
||||||
|
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
#ifdef GRID_UVM
|
#ifdef GRID_UVM
|
||||||
|
|
||||||
#warning "Grid is assuming unified virtual memory address space"
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
// View management is 1:1 address space mapping
|
// View management is 1:1 address space mapping
|
||||||
|
@ -36,7 +36,7 @@ static const int CbBlack=1;
|
|||||||
static const int Even =CbRed;
|
static const int Even =CbRed;
|
||||||
static const int Odd =CbBlack;
|
static const int Odd =CbBlack;
|
||||||
|
|
||||||
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
|
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk)
|
||||||
{
|
{
|
||||||
int nd=rdim.size();
|
int nd=rdim.size();
|
||||||
Coordinate coor(nd);
|
Coordinate coor(nd);
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -108,6 +107,8 @@ public:
|
|||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// Reduction
|
// Reduction
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
|
void GlobalMax(RealD &);
|
||||||
|
void GlobalMax(RealF &);
|
||||||
void GlobalSum(RealF &);
|
void GlobalSum(RealF &);
|
||||||
void GlobalSumVector(RealF *,int N);
|
void GlobalSumVector(RealF *,int N);
|
||||||
void GlobalSum(RealD &);
|
void GlobalSum(RealD &);
|
||||||
@ -138,21 +139,6 @@ public:
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes);
|
int bytes);
|
||||||
|
|
||||||
void SendRecvPacket(void *xmit,
|
|
||||||
void *recv,
|
|
||||||
int xmit_to_rank,
|
|
||||||
int recv_from_rank,
|
|
||||||
int bytes);
|
|
||||||
|
|
||||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
||||||
void *xmit,
|
|
||||||
int xmit_to_rank,
|
|
||||||
void *recv,
|
|
||||||
int recv_from_rank,
|
|
||||||
int bytes);
|
|
||||||
|
|
||||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
|
||||||
|
|
||||||
double StencilSendToRecvFrom(void *xmit,
|
double StencilSendToRecvFrom(void *xmit,
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
void *recv,
|
void *recv,
|
||||||
|
@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
|
|||||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||||
if ( !flag ) {
|
if ( !flag ) {
|
||||||
|
|
||||||
#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
|
#ifndef GRID_COMMS_THREADS
|
||||||
nCommThreads=1;
|
nCommThreads=1;
|
||||||
// wrong results here too
|
// wrong results here too
|
||||||
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
|
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
|
||||||
@ -275,6 +275,16 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
|
|||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
|
void CartesianCommunicator::GlobalMax(float &f)
|
||||||
|
{
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalMax(double &d)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
void CartesianCommunicator::GlobalSum(float &f){
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
@ -358,16 +368,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
assert(from != _processor);
|
assert(from != _processor);
|
||||||
assert(gme == ShmRank);
|
assert(gme == ShmRank);
|
||||||
double off_node_bytes=0.0;
|
double off_node_bytes=0.0;
|
||||||
|
int tag;
|
||||||
|
|
||||||
if ( gfrom ==MPI_UNDEFINED) {
|
if ( gfrom ==MPI_UNDEFINED) {
|
||||||
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
|
tag= dir+from*32;
|
||||||
|
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.push_back(rrq);
|
list.push_back(rrq);
|
||||||
off_node_bytes+=bytes;
|
off_node_bytes+=bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( gdest == MPI_UNDEFINED ) {
|
if ( gdest == MPI_UNDEFINED ) {
|
||||||
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
|
tag= dir+_processor*32;
|
||||||
|
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.push_back(xrq);
|
list.push_back(xrq);
|
||||||
off_node_bytes+=bytes;
|
off_node_bytes+=bytes;
|
||||||
|
@ -67,6 +67,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
|
|||||||
|
|
||||||
CartesianCommunicator::~CartesianCommunicator(){}
|
CartesianCommunicator::~CartesianCommunicator(){}
|
||||||
|
|
||||||
|
void CartesianCommunicator::GlobalMax(float &){}
|
||||||
|
void CartesianCommunicator::GlobalMax(double &){}
|
||||||
void CartesianCommunicator::GlobalSum(float &){}
|
void CartesianCommunicator::GlobalSum(float &){}
|
||||||
void CartesianCommunicator::GlobalSumVector(float *,int N){}
|
void CartesianCommunicator::GlobalSumVector(float *,int N){}
|
||||||
void CartesianCommunicator::GlobalSum(double &){}
|
void CartesianCommunicator::GlobalSum(double &){}
|
||||||
@ -77,15 +79,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
|
|||||||
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
||||||
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
||||||
|
|
||||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
|
||||||
void *recv,
|
|
||||||
int xmit_to_rank,
|
|
||||||
int recv_from_rank,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Basic Halo comms primitive -- should never call in single node
|
// Basic Halo comms primitive -- should never call in single node
|
||||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
@ -96,20 +89,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
||||||
void *xmit,
|
|
||||||
int dest,
|
|
||||||
void *recv,
|
|
||||||
int from,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
||||||
{
|
{
|
||||||
bcopy(in,out,bytes*words);
|
bcopy(in,out,bytes*words);
|
||||||
@ -137,10 +116,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes, int dir)
|
int bytes, int dir)
|
||||||
{
|
{
|
||||||
std::vector<CommsRequest_t> list;
|
|
||||||
// Discard the "dir"
|
|
||||||
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
|
||||||
SendToRecvFromComplete(list);
|
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
@ -150,13 +125,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes, int dir)
|
int bytes, int dir)
|
||||||
{
|
{
|
||||||
// Discard the "dir"
|
|
||||||
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
{
|
{
|
||||||
SendToRecvFromComplete(waitall);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::StencilBarrier(void){};
|
void CartesianCommunicator::StencilBarrier(void){};
|
||||||
|
@ -102,7 +102,7 @@ public:
|
|||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
||||||
static void SharedMemoryFree(void);
|
static void SharedMemoryFree(void);
|
||||||
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
|
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||||
static void SharedMemoryZero(void *dest,size_t bytes);
|
static void SharedMemoryZero(void *dest,size_t bytes);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
Copyright (C) 2015
|
Copyright (C) 2015
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Christoph Lehner <christoph@lhnr.de>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -32,6 +33,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
#include <hip/hip_runtime_api.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
#define header "SharedMemoryMpi: "
|
#define header "SharedMemoryMpi: "
|
||||||
@ -166,6 +170,23 @@ static inline int divides(int a,int b)
|
|||||||
}
|
}
|
||||||
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
|
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
|
||||||
{
|
{
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Allow user to configure through environment variable
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
|
||||||
|
if ( str ) {
|
||||||
|
std::vector<int> IntShmDims;
|
||||||
|
GridCmdOptionIntVector(std::string(str),IntShmDims);
|
||||||
|
assert(IntShmDims.size() == WorldDims.size());
|
||||||
|
long ShmSize = 1;
|
||||||
|
for (int dim=0;dim<WorldDims.size();dim++) {
|
||||||
|
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
|
||||||
|
assert(divides(ShmDims[dim],WorldDims[dim]));
|
||||||
|
}
|
||||||
|
assert(ShmSize == WorldShmSize);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
// Powers of 2,3,5 only in prime decomposition for now
|
// Powers of 2,3,5 only in prime decomposition for now
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
@ -425,7 +446,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbfs mapping intended
|
// Hugetlbfs mapping intended
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifdef GRID_CUDA
|
#if defined(GRID_CUDA) ||defined(GRID_HIP)
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
{
|
{
|
||||||
void * ShmCommBuf ;
|
void * ShmCommBuf ;
|
||||||
@ -448,21 +469,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifndef GRID_MPI3_SHM_NONE
|
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||||
auto err = cudaMalloc(&ShmCommBuf, bytes);
|
|
||||||
#else
|
|
||||||
auto err = cudaMallocManaged(&ShmCommBuf, bytes);
|
|
||||||
#endif
|
|
||||||
if ( err != cudaSuccess) {
|
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
if (ShmCommBuf == (void *)NULL ) {
|
if (ShmCommBuf == (void *)NULL ) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
if ( WorldRank == 0 ){
|
// if ( WorldRank == 0 ){
|
||||||
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
if ( 1 ){
|
||||||
|
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||||
|
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||||
}
|
}
|
||||||
SharedMemoryZero(ShmCommBuf,bytes);
|
SharedMemoryZero(ShmCommBuf,bytes);
|
||||||
|
|
||||||
@ -475,15 +491,26 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// If it is me, pass around the IPC access key
|
// If it is me, pass around the IPC access key
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
|
#ifdef GRID_CUDA
|
||||||
cudaIpcMemHandle_t handle;
|
cudaIpcMemHandle_t handle;
|
||||||
|
|
||||||
if ( r==WorldShmRank ) {
|
if ( r==WorldShmRank ) {
|
||||||
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
||||||
if ( err != cudaSuccess) {
|
if ( err != cudaSuccess) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
hipIpcMemHandle_t handle;
|
||||||
|
if ( r==WorldShmRank ) {
|
||||||
|
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
|
||||||
|
if ( err != hipSuccess) {
|
||||||
|
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// Share this IPC handle across the Shm Comm
|
// Share this IPC handle across the Shm Comm
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
@ -500,13 +527,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// If I am not the source, overwrite thisBuf with remote buffer
|
// If I am not the source, overwrite thisBuf with remote buffer
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void * thisBuf = ShmCommBuf;
|
void * thisBuf = ShmCommBuf;
|
||||||
|
#ifdef GRID_CUDA
|
||||||
if ( r!=WorldShmRank ) {
|
if ( r!=WorldShmRank ) {
|
||||||
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
||||||
if ( err != cudaSuccess) {
|
if ( err != cudaSuccess) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
if ( r!=WorldShmRank ) {
|
||||||
|
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
|
||||||
|
if ( err != hipSuccess) {
|
||||||
|
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Save a copy of the device buffers
|
// Save a copy of the device buffers
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
@ -646,7 +684,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
#endif
|
#endif
|
||||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
|
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
|
||||||
|
|
||||||
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
|
|
||||||
if ( ptr == (void * )MAP_FAILED ) {
|
if ( ptr == (void * )MAP_FAILED ) {
|
||||||
perror("failed mmap");
|
perror("failed mmap");
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -696,7 +733,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
|||||||
bzero(dest,bytes);
|
bzero(dest,bytes);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
|
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||||
{
|
{
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
||||||
@ -752,19 +789,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
|||||||
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
|
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
|
||||||
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
|
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
|
||||||
|
|
||||||
#ifdef GRID_IBM_SUMMIT
|
#ifdef GRID_SHM_FORCE_MPI
|
||||||
// Hide the shared memory path between sockets
|
// Hide the shared memory path between ranks
|
||||||
// if even number of nodes
|
{
|
||||||
if ( (ShmSize & 0x1)==0 ) {
|
|
||||||
int SocketSize = ShmSize/2;
|
|
||||||
int mySocket = ShmRank/SocketSize;
|
|
||||||
for(int r=0;r<size;r++){
|
for(int r=0;r<size;r++){
|
||||||
int hisRank=ShmRanks[r];
|
if ( r!=rank ) {
|
||||||
if ( hisRank!= MPI_UNDEFINED ) {
|
ShmRanks[r] = MPI_UNDEFINED;
|
||||||
int hisSocket=hisRank/SocketSize;
|
|
||||||
if ( hisSocket != mySocket ) {
|
|
||||||
ShmRanks[r] = MPI_UNDEFINED;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -29,6 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
#define header "SharedMemoryNone: "
|
||||||
|
|
||||||
/*Construct from an MPI communicator*/
|
/*Construct from an MPI communicator*/
|
||||||
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||||
@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbfs mapping intended, use anonymous mmap
|
// Hugetlbfs mapping intended, use anonymous mmap
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#if 1
|
||||||
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
|
{
|
||||||
|
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
|
||||||
|
void * ShmCommBuf ;
|
||||||
|
assert(_ShmSetup==1);
|
||||||
|
assert(_ShmAlloc==0);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Each MPI rank should allocate our own buffer
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||||
|
|
||||||
|
if (ShmCommBuf == (void *)NULL ) {
|
||||||
|
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
if ( WorldRank == 0 ){
|
||||||
|
std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes
|
||||||
|
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||||
|
}
|
||||||
|
SharedMemoryZero(ShmCommBuf,bytes);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Loop over ranks/gpu's on our node
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
WorldShmCommBufs[0] = ShmCommBuf;
|
||||||
|
|
||||||
|
_ShmAllocBytes=bytes;
|
||||||
|
_ShmAlloc=1;
|
||||||
|
}
|
||||||
|
#else
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
{
|
{
|
||||||
void * ShmCommBuf ;
|
void * ShmCommBuf ;
|
||||||
@ -83,7 +116,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
_ShmAllocBytes=bytes;
|
_ShmAllocBytes=bytes;
|
||||||
_ShmAlloc=1;
|
_ShmAlloc=1;
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||||
|
{
|
||||||
|
acceleratorMemSet(dest,0,bytes);
|
||||||
|
}
|
||||||
|
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||||
|
{
|
||||||
|
acceleratorCopyToDevice(src,dest,bytes);
|
||||||
|
}
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
// Global shared functionality finished
|
// Global shared functionality finished
|
||||||
// Now move to per communicator functionality
|
// Now move to per communicator functionality
|
||||||
|
@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
|
|||||||
// Gather for when there is no need to SIMD split
|
// Gather for when there is no need to SIMD split
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> void
|
template<class vobj> void
|
||||||
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = &Cshift_table[0];
|
auto table = &Cshift_table[0];
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
buffer_p[table[i].first]=rhs_v[table[i].second];
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,21 +110,36 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
accelerator_for(nn,e1*e2,1,{
|
||||||
|
int n = nn%e1;
|
||||||
|
int b = nn/e1;
|
||||||
int o = n*n1;
|
int o = n*n1;
|
||||||
int offset = b+n*e2;
|
int offset = b+n*e2;
|
||||||
|
|
||||||
vobj temp =rhs_v[so+o+b];
|
vobj temp =rhs_v[so+o+b];
|
||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
});
|
});
|
||||||
} else {
|
#else
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
thread_for2d(n,e1,b,e2,{
|
||||||
|
int o = n*n1;
|
||||||
|
int offset = b+n*e2;
|
||||||
|
|
||||||
|
vobj temp =rhs_v[so+o+b];
|
||||||
|
extract<vobj>(temp,pointers,offset);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
Coordinate rdim=rhs.Grid()->_rdimensions;
|
Coordinate rdim=rhs.Grid()->_rdimensions;
|
||||||
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
||||||
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
|
accelerator_for(nn,e1*e2,1,{
|
||||||
|
int n = nn%e1;
|
||||||
|
int b = nn/e1;
|
||||||
|
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
|
|
||||||
@ -134,13 +156,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
thread_for2d(n,e1,b,e2,{
|
||||||
|
|
||||||
|
Coordinate coor;
|
||||||
|
|
||||||
|
int o=n*n1;
|
||||||
|
int oindex = o+b;
|
||||||
|
|
||||||
|
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
|
||||||
|
|
||||||
|
int ocb=1<<cb;
|
||||||
|
int offset = b+n*e2;
|
||||||
|
|
||||||
|
if ( ocb & cbmask ) {
|
||||||
|
vobj temp =rhs_v[so+o+b];
|
||||||
|
extract<vobj>(temp,pointers,offset);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Scatter for when there is no need to SIMD split
|
// Scatter for when there is no need to SIMD split
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
@ -182,12 +224,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
autoView( rhs_v, rhs, AcceleratorWrite);
|
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = &Cshift_table[0];
|
auto table = &Cshift_table[0];
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
|
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView( rhs_v, rhs, CpuWrite);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
rhs_v[table[i].first]=buffer_p[table[i].second];
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -208,19 +257,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
|
|
||||||
if(cbmask ==0x3 ) {
|
if(cbmask ==0x3 ) {
|
||||||
autoView( rhs_v , rhs, AcceleratorWrite);
|
|
||||||
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
|
autoView( rhs_v , rhs, AcceleratorWrite);
|
||||||
|
accelerator_for(nn,e1*e2,1,{
|
||||||
|
int n = nn%e1;
|
||||||
|
int b = nn/e1;
|
||||||
int o = n*_slice_stride;
|
int o = n*_slice_stride;
|
||||||
int offset = b+n*_slice_block;
|
int offset = b+n*_slice_block;
|
||||||
merge(rhs_v[so+o+b],pointers,offset);
|
merge(rhs_v[so+o+b],pointers,offset);
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView( rhs_v , rhs, CpuWrite);
|
||||||
|
thread_for2d(n,e1,b,e2,{
|
||||||
|
int o = n*_slice_stride;
|
||||||
|
int offset = b+n*_slice_block;
|
||||||
|
merge(rhs_v[so+o+b],pointers,offset);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||||
// Test_cshift_red_black code.
|
// Test_cshift_red_black code.
|
||||||
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
||||||
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
||||||
assert(0); // This will fail if hit on GPU
|
assert(0); // This will fail if hit on GPU
|
||||||
autoView( rhs_v, rhs, CpuWrite);
|
autoView( rhs_v, rhs, CpuWrite);
|
||||||
@ -280,12 +340,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
auto table = &Cshift_table[0];
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
autoView(lhs_v , lhs, AcceleratorWrite);
|
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||||
auto table = &Cshift_table[0];
|
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
autoView(lhs_v , lhs, CpuWrite);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
lhs_v[table[i].first]=rhs_v[table[i].second];
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -324,12 +392,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
auto table = &Cshift_table[0];
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView( rhs_v, rhs, AcceleratorRead);
|
autoView( rhs_v, rhs, AcceleratorRead);
|
||||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||||
auto table = &Cshift_table[0];
|
|
||||||
accelerator_for(i,ent,1,{
|
accelerator_for(i,ent,1,{
|
||||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
|
autoView( lhs_v, lhs, CpuWrite);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -101,7 +101,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
|
|||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#define ACCELERATOR_CSHIFT_NO_COPY
|
||||||
|
#ifdef ACCELERATOR_CSHIFT_NO_COPY
|
||||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
{
|
{
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
@ -121,8 +122,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
|
||||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
commVector<vobj> send_buf(buffer_size);
|
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||||
commVector<vobj> recv_buf(buffer_size);
|
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
@ -138,7 +139,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
int words = send_buf.size();
|
int words = buffer_size;
|
||||||
if (cbmask != 0x3) words=words>>1;
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
int bytes = words * sizeof(vobj);
|
int bytes = words * sizeof(vobj);
|
||||||
@ -150,12 +151,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf[0],
|
(void *)&recv_buf[0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
|
|
||||||
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
|
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
|
||||||
@ -195,8 +198,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
|
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||||
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
|
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||||
|
scalar_object * recv_buf_extract_mpi;
|
||||||
|
scalar_object * send_buf_extract_mpi;
|
||||||
|
|
||||||
|
for(int s=0;s<Nsimd;s++){
|
||||||
|
send_buf_extract[s].resize(buffer_size);
|
||||||
|
recv_buf_extract[s].resize(buffer_size);
|
||||||
|
}
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
@ -242,11 +252,204 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
if(nbr_proc){
|
if(nbr_proc){
|
||||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
|
grid->Barrier();
|
||||||
|
|
||||||
|
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||||
|
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||||
|
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf_extract[i][0],
|
(void *)recv_buf_extract_mpi,
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
|
} else {
|
||||||
|
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
|
{
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
GridBase *grid=rhs.Grid();
|
||||||
|
Lattice<vobj> temp(rhs.Grid());
|
||||||
|
|
||||||
|
int fd = rhs.Grid()->_fdimensions[dimension];
|
||||||
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
int pd = rhs.Grid()->_processors[dimension];
|
||||||
|
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
||||||
|
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
||||||
|
assert(simd_layout==1);
|
||||||
|
assert(comm_dim==1);
|
||||||
|
assert(shift>=0);
|
||||||
|
assert(shift<fd);
|
||||||
|
|
||||||
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
|
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
||||||
|
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
|
||||||
|
vobj *send_buf;
|
||||||
|
vobj *recv_buf;
|
||||||
|
{
|
||||||
|
grid->ShmBufferFreeAll();
|
||||||
|
size_t bytes = buffer_size*sizeof(vobj);
|
||||||
|
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||||
|
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int comm_proc = ((x+sshift)/rd)%pd;
|
||||||
|
|
||||||
|
if (comm_proc==0) {
|
||||||
|
|
||||||
|
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
int words = buffer_size;
|
||||||
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
|
int bytes = words * sizeof(vobj);
|
||||||
|
|
||||||
|
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
||||||
|
|
||||||
|
// int rank = grid->_processor;
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
||||||
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)&recv_buf[0],
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
|
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
|
{
|
||||||
|
GridBase *grid=rhs.Grid();
|
||||||
|
const int Nsimd = grid->Nsimd();
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_object scalar_object;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
int fd = grid->_fdimensions[dimension];
|
||||||
|
int rd = grid->_rdimensions[dimension];
|
||||||
|
int ld = grid->_ldimensions[dimension];
|
||||||
|
int pd = grid->_processors[dimension];
|
||||||
|
int simd_layout = grid->_simd_layout[dimension];
|
||||||
|
int comm_dim = grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
|
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||||
|
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||||
|
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||||
|
|
||||||
|
assert(comm_dim==1);
|
||||||
|
assert(simd_layout==2);
|
||||||
|
assert(shift>=0);
|
||||||
|
assert(shift<fd);
|
||||||
|
|
||||||
|
int permute_type=grid->PermuteType(dimension);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
// Simd direction uses an extract/merge pair
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
|
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
|
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||||
|
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||||
|
scalar_object * recv_buf_extract_mpi;
|
||||||
|
scalar_object * send_buf_extract_mpi;
|
||||||
|
{
|
||||||
|
size_t bytes = sizeof(scalar_object)*buffer_size;
|
||||||
|
grid->ShmBufferFreeAll();
|
||||||
|
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||||
|
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||||
|
}
|
||||||
|
for(int s=0;s<Nsimd;s++){
|
||||||
|
send_buf_extract[s].resize(buffer_size);
|
||||||
|
recv_buf_extract[s].resize(buffer_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
|
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||||
|
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
||||||
|
|
||||||
|
///////////////////////////////////////////
|
||||||
|
// Work out what to send where
|
||||||
|
///////////////////////////////////////////
|
||||||
|
int cb = (cbmask==0x2)? Odd : Even;
|
||||||
|
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
|
|
||||||
|
// loop over outer coord planes orthog to dim
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
// FIXME call local permute copy if none are offnode.
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
pointers[i] = &send_buf_extract[i][0];
|
||||||
|
}
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
|
int inner_bit = (Nsimd>>(permute_type+1));
|
||||||
|
int ic= (i&inner_bit)? 1:0;
|
||||||
|
|
||||||
|
int my_coor = rd*ic + x;
|
||||||
|
int nbr_coor = my_coor+sshift;
|
||||||
|
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||||
|
|
||||||
|
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
||||||
|
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
||||||
|
int nbr_lane = (i&(~inner_bit));
|
||||||
|
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
|
||||||
|
if (nbr_ic) nbr_lane|=inner_bit;
|
||||||
|
|
||||||
|
assert (sx == nbr_ox);
|
||||||
|
|
||||||
|
if(nbr_proc){
|
||||||
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
||||||
|
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)recv_buf_extract_mpi,
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
||||||
|
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
} else {
|
} else {
|
||||||
@ -258,7 +461,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -342,19 +342,14 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
|||||||
|
|
||||||
GridUnopClass(UnarySub, -a);
|
GridUnopClass(UnarySub, -a);
|
||||||
GridUnopClass(UnaryNot, Not(a));
|
GridUnopClass(UnaryNot, Not(a));
|
||||||
GridUnopClass(UnaryAdj, adj(a));
|
|
||||||
GridUnopClass(UnaryConj, conjugate(a));
|
|
||||||
GridUnopClass(UnaryTrace, trace(a));
|
GridUnopClass(UnaryTrace, trace(a));
|
||||||
GridUnopClass(UnaryTranspose, transpose(a));
|
GridUnopClass(UnaryTranspose, transpose(a));
|
||||||
GridUnopClass(UnaryTa, Ta(a));
|
GridUnopClass(UnaryTa, Ta(a));
|
||||||
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
|
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
|
||||||
GridUnopClass(UnaryToReal, toReal(a));
|
|
||||||
GridUnopClass(UnaryToComplex, toComplex(a));
|
|
||||||
GridUnopClass(UnaryTimesI, timesI(a));
|
GridUnopClass(UnaryTimesI, timesI(a));
|
||||||
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
|
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
|
||||||
GridUnopClass(UnaryAbs, abs(a));
|
GridUnopClass(UnaryAbs, abs(a));
|
||||||
GridUnopClass(UnarySqrt, sqrt(a));
|
GridUnopClass(UnarySqrt, sqrt(a));
|
||||||
GridUnopClass(UnaryRsqrt, rsqrt(a));
|
|
||||||
GridUnopClass(UnarySin, sin(a));
|
GridUnopClass(UnarySin, sin(a));
|
||||||
GridUnopClass(UnaryCos, cos(a));
|
GridUnopClass(UnaryCos, cos(a));
|
||||||
GridUnopClass(UnaryAsin, asin(a));
|
GridUnopClass(UnaryAsin, asin(a));
|
||||||
@ -456,20 +451,17 @@ GridTrinOpClass(TrinaryWhere,
|
|||||||
GRID_DEF_UNOP(operator-, UnarySub);
|
GRID_DEF_UNOP(operator-, UnarySub);
|
||||||
GRID_DEF_UNOP(Not, UnaryNot);
|
GRID_DEF_UNOP(Not, UnaryNot);
|
||||||
GRID_DEF_UNOP(operator!, UnaryNot);
|
GRID_DEF_UNOP(operator!, UnaryNot);
|
||||||
GRID_DEF_UNOP(adj, UnaryAdj);
|
//GRID_DEF_UNOP(adj, UnaryAdj);
|
||||||
GRID_DEF_UNOP(conjugate, UnaryConj);
|
//GRID_DEF_UNOP(conjugate, UnaryConj);
|
||||||
GRID_DEF_UNOP(trace, UnaryTrace);
|
GRID_DEF_UNOP(trace, UnaryTrace);
|
||||||
GRID_DEF_UNOP(transpose, UnaryTranspose);
|
GRID_DEF_UNOP(transpose, UnaryTranspose);
|
||||||
GRID_DEF_UNOP(Ta, UnaryTa);
|
GRID_DEF_UNOP(Ta, UnaryTa);
|
||||||
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
|
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
|
||||||
GRID_DEF_UNOP(toReal, UnaryToReal);
|
|
||||||
GRID_DEF_UNOP(toComplex, UnaryToComplex);
|
|
||||||
GRID_DEF_UNOP(timesI, UnaryTimesI);
|
GRID_DEF_UNOP(timesI, UnaryTimesI);
|
||||||
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
|
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
|
||||||
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
|
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
|
||||||
// abs-fabs-dabs-labs thing
|
// abs-fabs-dabs-labs thing
|
||||||
GRID_DEF_UNOP(sqrt, UnarySqrt);
|
GRID_DEF_UNOP(sqrt, UnarySqrt);
|
||||||
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
|
|
||||||
GRID_DEF_UNOP(sin, UnarySin);
|
GRID_DEF_UNOP(sin, UnarySin);
|
||||||
GRID_DEF_UNOP(cos, UnaryCos);
|
GRID_DEF_UNOP(cos, UnaryCos);
|
||||||
GRID_DEF_UNOP(asin, UnaryAsin);
|
GRID_DEF_UNOP(asin, UnaryAsin);
|
||||||
@ -494,27 +486,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
|
|||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
template <class Op, class T1>
|
template <class Op, class T1>
|
||||||
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
|
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
|
||||||
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))>
|
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
|
||||||
{
|
{
|
||||||
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
|
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
template <class Op, class T1, class T2>
|
template <class Op, class T1, class T2>
|
||||||
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
|
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>
|
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
|
||||||
{
|
{
|
||||||
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
|
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
template <class Op, class T1, class T2, class T3>
|
template <class Op, class T1, class T2, class T3>
|
||||||
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||||
-> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
|
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
|
||||||
vecEval(0, expr.arg2),
|
vecEval(0, expr.arg2),
|
||||||
vecEval(0, expr.arg3)))>
|
vecEval(0, expr.arg3)))>::type >
|
||||||
{
|
{
|
||||||
Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
|
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
|
||||||
vecEval(0, expr.arg2),
|
vecEval(0, expr.arg2),
|
||||||
vecEval(0, expr.arg3)))> ret(expr);
|
vecEval(0, expr.arg3)))>::type > ret(expr);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
#define EXPRESSION_CLOSURE(function) \
|
#define EXPRESSION_CLOSURE(function) \
|
||||||
|
@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
|
auto tmp =ret_v(ss);
|
||||||
mac(&tmp,&lhs_t,&rhs_t);
|
mac(&tmp,&lhs_t,&rhs_t);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
|||||||
autoView( ret_v , ret, AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
auto tmp =ret_v(ss);
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
mac(&tmp,&lhs_t,&rhs);
|
mac(&tmp,&lhs_t,&rhs);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
|||||||
autoView( ret_v , ret, AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
autoView( rhs_v , lhs, AcceleratorRead);
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
auto tmp =ret_v(ss);
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
mac(&tmp,&lhs,&rhs_t);
|
mac(&tmp,&lhs,&rhs_t);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
|
@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||||
}
|
}
|
||||||
|
|
||||||
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
|
#if ( (!defined(GRID_CUDA)) )
|
||||||
int max_threads = thread_max();
|
int max_threads = thread_max();
|
||||||
Vector < vobj > Bt(Nm * max_threads);
|
Vector < vobj > Bt(Nm * max_threads);
|
||||||
thread_region
|
thread_region
|
||||||
@ -161,11 +161,13 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
|||||||
double * Qt_j = & Qt_jv[0];
|
double * Qt_j = & Qt_jv[0];
|
||||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
||||||
|
|
||||||
|
auto basis_vp=& basis_v[0];
|
||||||
autoView(result_v,result,AcceleratorWrite);
|
autoView(result_v,result,AcceleratorWrite);
|
||||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||||
auto B=coalescedRead(zz);
|
vobj zzz=Zero();
|
||||||
|
auto B=coalescedRead(zzz);
|
||||||
for(int k=k0; k<k1; ++k){
|
for(int k=k0; k<k1; ++k){
|
||||||
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
|
B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
|
||||||
}
|
}
|
||||||
coalescedWrite(result_v[ss], B);
|
coalescedWrite(result_v[ss], B);
|
||||||
});
|
});
|
||||||
|
@ -45,8 +45,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
|||||||
autoView( ret_v, ret, AcceleratorWrite);
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
|
ret_v[ss] = adj(lhs_v[ss]);
|
||||||
});
|
});
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
@ -64,6 +64,53 @@ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
|||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
|
||||||
|
Lattice<typename vobj::Complexified> ret(lhs.Grid());
|
||||||
|
|
||||||
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
|
ret_v[ss] = toComplex(lhs_v[ss]);
|
||||||
|
});
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
|
||||||
|
Lattice<typename vobj::Realified> ret(lhs.Grid());
|
||||||
|
|
||||||
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
|
ret_v[ss] = toReal(lhs_v[ss]);
|
||||||
|
});
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto toComplex(const Expression &expr) -> decltype(closure(expr))
|
||||||
|
{
|
||||||
|
return toComplex(closure(expr));
|
||||||
|
}
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto toReal(const Expression &expr) -> decltype(closure(expr))
|
||||||
|
{
|
||||||
|
return toReal(closure(expr));
|
||||||
|
}
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto adj(const Expression &expr) -> decltype(closure(expr))
|
||||||
|
{
|
||||||
|
return adj(closure(expr));
|
||||||
|
}
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto conjugate(const Expression &expr) -> decltype(closure(expr))
|
||||||
|
{
|
||||||
|
return conjugate(closure(expr));
|
||||||
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -96,8 +96,34 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
|||||||
ssobj ret = ssum;
|
ssobj ret = ssum;
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
Threaded max, don't use for now
|
||||||
|
template<class Double>
|
||||||
|
inline Double max(const Double *arg, Integer osites)
|
||||||
|
{
|
||||||
|
// const int Nsimd = vobj::Nsimd();
|
||||||
|
const int nthread = GridThread::GetThreads();
|
||||||
|
|
||||||
|
std::vector<Double> maxarray(nthread);
|
||||||
|
|
||||||
|
thread_for(thr,nthread, {
|
||||||
|
int nwork, mywork, myoff;
|
||||||
|
nwork = osites;
|
||||||
|
GridThread::GetWork(nwork,thr,mywork,myoff);
|
||||||
|
Double max=arg[0];
|
||||||
|
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||||
|
if( arg[ss] > max ) max = arg[ss];
|
||||||
|
}
|
||||||
|
maxarray[thr]=max;
|
||||||
|
});
|
||||||
|
|
||||||
|
Double tmax=maxarray[0];
|
||||||
|
for(int i=0;i<nthread;i++){
|
||||||
|
if (maxarray[i]>tmax) tmax = maxarray[i];
|
||||||
|
}
|
||||||
|
return tmax;
|
||||||
|
}
|
||||||
|
*/
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
||||||
{
|
{
|
||||||
@ -141,6 +167,32 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
|||||||
return real(nrm);
|
return real(nrm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//The global maximum of the site norm2
|
||||||
|
template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
|
||||||
|
{
|
||||||
|
typedef typename vobj::tensor_reduced vscalar; //iScalar<iScalar<.... <vPODtype> > >
|
||||||
|
typedef typename vscalar::scalar_object scalar; //iScalar<iScalar<.... <PODtype> > >
|
||||||
|
|
||||||
|
Lattice<vscalar> inner = localNorm2(arg);
|
||||||
|
|
||||||
|
auto grid = arg.Grid();
|
||||||
|
|
||||||
|
RealD max;
|
||||||
|
for(int l=0;l<grid->lSites();l++){
|
||||||
|
Coordinate coor;
|
||||||
|
scalar val;
|
||||||
|
RealD r;
|
||||||
|
grid->LocalIndexToLocalCoor(l,coor);
|
||||||
|
peekLocalSite(val,inner,coor);
|
||||||
|
r=real(TensorRemove(val));
|
||||||
|
if( (l==0) || (r>max)){
|
||||||
|
max=r;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
grid->GlobalMax(max);
|
||||||
|
return max;
|
||||||
|
}
|
||||||
|
|
||||||
// Double inner product
|
// Double inner product
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
|
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
|
||||||
|
@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
#ifdef GRID_HIP
|
#ifdef GRID_HIP
|
||||||
extern hipDeviceProp_t *gpu_props;
|
extern hipDeviceProp_t *gpu_props;
|
||||||
|
#define WARP_SIZE 64
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
extern cudaDeviceProp *gpu_props;
|
extern cudaDeviceProp *gpu_props;
|
||||||
|
#define WARP_SIZE 32
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define WARP_SIZE 32
|
|
||||||
__device__ unsigned int retirementCount = 0;
|
__device__ unsigned int retirementCount = 0;
|
||||||
|
|
||||||
template <class Iterator>
|
template <class Iterator>
|
||||||
@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
|
|
||||||
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
||||||
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
||||||
__syncwarp();
|
acceleratorSynchronise();
|
||||||
|
|
||||||
const Iterator VEC = WARP_SIZE;
|
const Iterator VEC = WARP_SIZE;
|
||||||
const Iterator vid = tid & (VEC-1);
|
const Iterator vid = tid & (VEC-1);
|
||||||
@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
beta += temp;
|
beta += temp;
|
||||||
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
||||||
}
|
}
|
||||||
__syncwarp();
|
acceleratorSynchronise();
|
||||||
}
|
}
|
||||||
__syncthreads();
|
acceleratorSynchroniseAll();
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
beta = Zero();
|
beta = Zero();
|
||||||
@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
}
|
}
|
||||||
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
||||||
}
|
}
|
||||||
__syncthreads();
|
acceleratorSynchroniseAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,6 +97,20 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
|
|||||||
out = in;
|
out = in;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
accelerator_inline EnableIf<isGridFundamental<T>> convertType(T & out, const T & in) {
|
||||||
|
out = in;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This would allow for conversions between GridFundamental types, but is not strictly needed as yet
|
||||||
|
/*template<typename T1, typename T2>
|
||||||
|
accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
|
||||||
|
// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
|
||||||
|
//accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
|
||||||
|
convertType(T1 & out, const T2 & in) {
|
||||||
|
out = in;
|
||||||
|
}*/
|
||||||
|
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
|
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
|
||||||
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
|
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
|
||||||
@ -117,18 +131,18 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
|
|||||||
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
|
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T1,typename T2,int N>
|
template<typename T1,typename T2>
|
||||||
accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
|
accelerator_inline void convertType(iScalar<T1> & out, const iScalar<T2> & in) {
|
||||||
template<typename T1,typename T2,int N>
|
convertType(out._internal,in._internal);
|
||||||
accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
|
}
|
||||||
|
|
||||||
template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
|
template<typename T1,typename T2>
|
||||||
accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
|
accelerator_inline NotEnableIf<isGridScalar<T1>> convertType(T1 & out, const iScalar<T2> & in) {
|
||||||
convertType(out,in._internal);
|
convertType(out,in._internal);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T1,typename T2>
|
template<typename T1,typename T2>
|
||||||
accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
|
accelerator_inline NotEnableIf<isGridScalar<T2>> convertType(iScalar<T1> & out, const T2 & in) {
|
||||||
convertType(out._internal,in);
|
convertType(out._internal,in);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -145,11 +159,6 @@ accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & i
|
|||||||
convertType(out._internal[i],in._internal[i]);
|
convertType(out._internal[i],in._internal[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
|
|
||||||
accelerator_inline void convertType(T & out, const T & in) {
|
|
||||||
out = in;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename T1,typename T2>
|
template<typename T1,typename T2>
|
||||||
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
|
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
|
||||||
autoView( out_v , out,AcceleratorWrite);
|
autoView( out_v , out,AcceleratorWrite);
|
||||||
|
@ -52,7 +52,6 @@ public:
|
|||||||
// This will be safe to call from accelerator_for and is trivially copy constructible
|
// This will be safe to call from accelerator_for and is trivially copy constructible
|
||||||
// The copy constructor for this will need to be used by device lambda functions
|
// The copy constructor for this will need to be used by device lambda functions
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#undef LATTICE_BOUNDS_CHECK
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
class LatticeView : public LatticeAccelerator<vobj>
|
class LatticeView : public LatticeAccelerator<vobj>
|
||||||
{
|
{
|
||||||
@ -62,36 +61,19 @@ public:
|
|||||||
void * cpu_ptr;
|
void * cpu_ptr;
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
|
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
|
||||||
#ifdef LATTICE_BOUNDS_CHECK
|
|
||||||
assert(i<this->_odata_size);
|
|
||||||
assert(i>=0);
|
|
||||||
#endif
|
|
||||||
return coalescedRead(this->_odata[i]);
|
return coalescedRead(this->_odata[i]);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
accelerator_inline const vobj & operator()(size_t i) const {
|
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
|
||||||
#ifdef LATTICE_BOUNDS_CHECK
|
|
||||||
assert(i<this->_odata_size);
|
|
||||||
assert(i>=0);
|
|
||||||
#endif
|
|
||||||
return this->_odata[i];
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
accelerator_inline const vobj & operator[](size_t i) const {
|
#if 1
|
||||||
#ifdef LATTICE_BOUNDS_CHECK
|
// accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
|
||||||
assert(i<this->_odata_size);
|
accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; };
|
||||||
assert(i>=0);
|
#else
|
||||||
|
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
|
||||||
|
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
|
||||||
#endif
|
#endif
|
||||||
return this->_odata[i];
|
|
||||||
};
|
|
||||||
accelerator_inline vobj & operator[](size_t i) {
|
|
||||||
#ifdef LATTICE_BOUNDS_CHECK
|
|
||||||
assert(i<this->_odata_size);
|
|
||||||
assert(i>=0);
|
|
||||||
#endif
|
|
||||||
return this->_odata[i];
|
|
||||||
};
|
|
||||||
|
|
||||||
accelerator_inline uint64_t begin(void) const { return 0;};
|
accelerator_inline uint64_t begin(void) const { return 0;};
|
||||||
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
||||||
|
@ -43,7 +43,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
|
|||||||
conformable(iftrue,predicate);
|
conformable(iftrue,predicate);
|
||||||
conformable(iftrue,ret);
|
conformable(iftrue,ret);
|
||||||
|
|
||||||
GridBase *grid=iftrue._grid;
|
GridBase *grid=iftrue.Grid();
|
||||||
|
|
||||||
typedef typename vobj::scalar_object scalar_object;
|
typedef typename vobj::scalar_object scalar_object;
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
@ -52,22 +52,23 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
|
|||||||
|
|
||||||
const int Nsimd = grid->Nsimd();
|
const int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
std::vector<Integer> mask(Nsimd);
|
autoView(iftrue_v,iftrue,CpuRead);
|
||||||
std::vector<scalar_object> truevals (Nsimd);
|
autoView(iffalse_v,iffalse,CpuRead);
|
||||||
std::vector<scalar_object> falsevals(Nsimd);
|
autoView(predicate_v,predicate,CpuRead);
|
||||||
|
autoView(ret_v,ret,CpuWrite);
|
||||||
parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
|
Integer NN= grid->oSites();
|
||||||
|
thread_for(ss,NN,{
|
||||||
extract(iftrue._odata[ss] ,truevals);
|
Integer mask;
|
||||||
extract(iffalse._odata[ss] ,falsevals);
|
scalar_object trueval;
|
||||||
extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
|
scalar_object falseval;
|
||||||
|
for(int l=0;l<Nsimd;l++){
|
||||||
for(int s=0;s<Nsimd;s++){
|
trueval =extractLane(l,iftrue_v[ss]);
|
||||||
if (mask[s]) falsevals[s]=truevals[s];
|
falseval=extractLane(l,iffalse_v[ss]);
|
||||||
|
mask =extractLane(l,predicate_v[ss]);
|
||||||
|
if (mask) falseval=trueval;
|
||||||
|
insertLane(l,ret_v[ss],falseval);
|
||||||
}
|
}
|
||||||
|
});
|
||||||
merge(ret._odata[ss],falsevals);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class iobj>
|
template<class vobj,class iobj>
|
||||||
@ -76,9 +77,9 @@ inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &ift
|
|||||||
conformable(iftrue,iffalse);
|
conformable(iftrue,iffalse);
|
||||||
conformable(iftrue,predicate);
|
conformable(iftrue,predicate);
|
||||||
|
|
||||||
Lattice<vobj> ret(iftrue._grid);
|
Lattice<vobj> ret(iftrue.Grid());
|
||||||
|
|
||||||
where(ret,predicate,iftrue,iffalse);
|
whereWolf(ret,predicate,iftrue,iffalse);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -130,6 +130,8 @@ public:
|
|||||||
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
|
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
|
||||||
|
|
||||||
if ( log.active ) {
|
if ( log.active ) {
|
||||||
|
std::ios_base::fmtflags f(stream.flags());
|
||||||
|
|
||||||
stream << log.background()<< std::left;
|
stream << log.background()<< std::left;
|
||||||
if (log.topWidth > 0)
|
if (log.topWidth > 0)
|
||||||
{
|
{
|
||||||
@ -152,6 +154,8 @@ public:
|
|||||||
<< now << log.background() << " : " ;
|
<< now << log.background() << " : " ;
|
||||||
}
|
}
|
||||||
stream << log.colour();
|
stream << log.colour();
|
||||||
|
stream.flags(f);
|
||||||
|
|
||||||
return stream;
|
return stream;
|
||||||
} else {
|
} else {
|
||||||
return devnull;
|
return devnull;
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
int Grid::BinaryIO::latticeWriteMaxRetry = -1;
|
int Grid::BinaryIO::latticeWriteMaxRetry = -1;
|
||||||
|
Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;
|
||||||
|
@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key)
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
class BinaryIO {
|
class BinaryIO {
|
||||||
public:
|
public:
|
||||||
|
struct IoPerf
|
||||||
|
{
|
||||||
|
uint64_t size{0},time{0};
|
||||||
|
double mbytesPerSecond{0.};
|
||||||
|
};
|
||||||
|
|
||||||
|
static IoPerf lastPerf;
|
||||||
static int latticeWriteMaxRetry;
|
static int latticeWriteMaxRetry;
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
@ -264,7 +271,7 @@ class BinaryIO {
|
|||||||
uint32_t &scidac_csumb)
|
uint32_t &scidac_csumb)
|
||||||
{
|
{
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
GridStopWatch timer;
|
GridStopWatch timer, insideTimer;
|
||||||
GridStopWatch bstimer;
|
GridStopWatch bstimer;
|
||||||
|
|
||||||
nersc_csum=0;
|
nersc_csum=0;
|
||||||
@ -356,7 +363,10 @@ class BinaryIO {
|
|||||||
std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
|
std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
|
||||||
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
|
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
|
||||||
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
|
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
|
||||||
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
|
insideTimer.Start();
|
||||||
|
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status);
|
||||||
|
insideTimer.Stop();
|
||||||
|
assert(ierr==0);
|
||||||
MPI_File_close(&fh);
|
MPI_File_close(&fh);
|
||||||
MPI_Type_free(&fileArray);
|
MPI_Type_free(&fileArray);
|
||||||
MPI_Type_free(&localArray);
|
MPI_Type_free(&localArray);
|
||||||
@ -431,7 +441,9 @@ class BinaryIO {
|
|||||||
assert(ierr == 0);
|
assert(ierr == 0);
|
||||||
|
|
||||||
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
|
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
|
||||||
|
insideTimer.Start();
|
||||||
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
|
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
|
||||||
|
insideTimer.Stop();
|
||||||
assert(ierr == 0);
|
assert(ierr == 0);
|
||||||
|
|
||||||
MPI_Offset os;
|
MPI_Offset os;
|
||||||
@ -502,12 +514,20 @@ class BinaryIO {
|
|||||||
timer.Stop();
|
timer.Stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lastPerf.size = sizeof(fobj)*iodata.size()*nrank;
|
||||||
|
lastPerf.time = timer.useconds();
|
||||||
|
lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
|
||||||
std::cout<<GridLogMessage<<"IOobject: ";
|
std::cout<<GridLogMessage<<"IOobject: ";
|
||||||
if ( control & BINARYIO_READ) std::cout << " read ";
|
if ( control & BINARYIO_READ) std::cout << " read ";
|
||||||
else std::cout << " write ";
|
else std::cout << " write ";
|
||||||
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
|
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
|
||||||
std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
std::cout<< lastPerf.size <<"bytes in "<< timer.Elapsed() <<" "
|
||||||
<< (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
|
<< lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "IOobject: pure MPI IO call "
|
||||||
|
<< lastPerf.size <<" bytes in "
|
||||||
|
<< insideTimer.Elapsed() << " "
|
||||||
|
<< lastPerf.size/1024./1024./(insideTimer.useconds()/1.0e6)
|
||||||
|
<<" MB/s "<<std::endl;
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
|
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
|
||||||
|
|
||||||
@ -663,10 +683,15 @@ class BinaryIO {
|
|||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
|
||||||
timer.Start();
|
timer.Start();
|
||||||
thread_for(lidx,lsites,{
|
thread_for(lidx,lsites,{ // FIX ME, suboptimal implementation
|
||||||
std::vector<RngStateType> tmp(RngStateCount);
|
std::vector<RngStateType> tmp(RngStateCount);
|
||||||
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
|
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
|
||||||
parallel_rng.SetState(tmp,lidx);
|
Coordinate lcoor;
|
||||||
|
grid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||||
|
int o_idx=grid->oIndex(lcoor);
|
||||||
|
int i_idx=grid->iIndex(lcoor);
|
||||||
|
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
|
||||||
|
parallel_rng.SetState(tmp,gidx);
|
||||||
});
|
});
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
|
|
||||||
@ -723,7 +748,12 @@ class BinaryIO {
|
|||||||
std::vector<RNGstate> iodata(lsites);
|
std::vector<RNGstate> iodata(lsites);
|
||||||
thread_for(lidx,lsites,{
|
thread_for(lidx,lsites,{
|
||||||
std::vector<RngStateType> tmp(RngStateCount);
|
std::vector<RngStateType> tmp(RngStateCount);
|
||||||
parallel_rng.GetState(tmp,lidx);
|
Coordinate lcoor;
|
||||||
|
grid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||||
|
int o_idx=grid->oIndex(lcoor);
|
||||||
|
int i_idx=grid->iIndex(lcoor);
|
||||||
|
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
|
||||||
|
parallel_rng.GetState(tmp,gidx);
|
||||||
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
|
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
|
||||||
});
|
});
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
|
@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
|
|||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// Helper to fill out metadata
|
// Helper to fill out metadata
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
|
template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
|
||||||
FieldMetaData &header,
|
FieldMetaData &header,
|
||||||
scidacRecord & _scidacRecord,
|
scidacRecord & _scidacRecord,
|
||||||
scidacFile & _scidacFile)
|
scidacFile & _scidacFile)
|
||||||
@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter {
|
|||||||
// Don't require scidac records EXCEPT checksum
|
// Don't require scidac records EXCEPT checksum
|
||||||
// Use Grid MetaData object if present.
|
// Use Grid MetaData object if present.
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
template <class vsimd>
|
template <class stats = PeriodicGaugeStatistics>
|
||||||
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
|
void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description)
|
||||||
{
|
{
|
||||||
GridBase * grid = Umu.Grid();
|
GridBase * grid = Umu.Grid();
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
typedef Lattice<vLorentzColourMatrixD> GaugeField;
|
||||||
typedef iLorentzColourMatrix<vsimd> vobj;
|
typedef vLorentzColourMatrixD vobj;
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
@ -636,6 +636,9 @@ class IldgWriter : public ScidacWriter {
|
|||||||
|
|
||||||
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
|
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
|
||||||
|
|
||||||
|
stats Stats;
|
||||||
|
Stats(Umu,header);
|
||||||
|
|
||||||
std::string format = header.floating_point;
|
std::string format = header.floating_point;
|
||||||
header.ensemble_id = description;
|
header.ensemble_id = description;
|
||||||
header.ensemble_label = description;
|
header.ensemble_label = description;
|
||||||
@ -705,10 +708,10 @@ class IldgReader : public GridLimeReader {
|
|||||||
// Else use ILDG MetaData object if present.
|
// Else use ILDG MetaData object if present.
|
||||||
// Else use SciDAC MetaData object if present.
|
// Else use SciDAC MetaData object if present.
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
template <class vsimd>
|
template <class stats = PeriodicGaugeStatistics>
|
||||||
void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
|
void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) {
|
||||||
|
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
typedef Lattice<vLorentzColourMatrixD > GaugeField;
|
||||||
typedef typename GaugeField::vector_object vobj;
|
typedef typename GaugeField::vector_object vobj;
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
@ -921,7 +924,8 @@ class IldgReader : public GridLimeReader {
|
|||||||
|
|
||||||
if ( found_FieldMetaData || found_usqcdInfo ) {
|
if ( found_FieldMetaData || found_usqcdInfo ) {
|
||||||
FieldMetaData checker;
|
FieldMetaData checker;
|
||||||
GaugeStatistics(Umu,checker);
|
stats Stats;
|
||||||
|
Stats(Umu,checker);
|
||||||
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
|
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
|
||||||
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
|
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
|
||||||
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
|
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
|
||||||
|
@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
|
|||||||
std::time_t t = std::time(nullptr);
|
std::time_t t = std::time(nullptr);
|
||||||
std::tm tm_ = *std::localtime(&t);
|
std::tm tm_ = *std::localtime(&t);
|
||||||
std::ostringstream oss;
|
std::ostringstream oss;
|
||||||
// oss << std::put_time(&tm_, "%c %Z");
|
oss << std::put_time(&tm_, "%c %Z");
|
||||||
header.creation_date = oss.str();
|
header.creation_date = oss.str();
|
||||||
header.archive_date = header.creation_date;
|
header.archive_date = header.creation_date;
|
||||||
|
|
||||||
@ -176,29 +176,18 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
|
|||||||
GridMetaData(grid,header);
|
GridMetaData(grid,header);
|
||||||
MachineCharacteristics(header);
|
MachineCharacteristics(header);
|
||||||
}
|
}
|
||||||
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
|
template<class Impl>
|
||||||
|
class GaugeStatistics
|
||||||
{
|
{
|
||||||
// How to convert data precision etc...
|
public:
|
||||||
header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
|
void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
|
||||||
header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
|
{
|
||||||
}
|
header.link_trace=WilsonLoops<Impl>::linkTrace(data);
|
||||||
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
|
header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
|
||||||
{
|
}
|
||||||
// How to convert data precision etc...
|
};
|
||||||
header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
|
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
|
||||||
header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
|
typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics;
|
||||||
}
|
|
||||||
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
|
|
||||||
{
|
|
||||||
|
|
||||||
GridBase *grid = field.Grid();
|
|
||||||
std::string format = getFormatString<vLorentzColourMatrixF>();
|
|
||||||
header.floating_point = format;
|
|
||||||
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
|
|
||||||
GridMetaData(grid,header);
|
|
||||||
GaugeStatistics(field,header);
|
|
||||||
MachineCharacteristics(header);
|
|
||||||
}
|
|
||||||
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
|
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
|
||||||
{
|
{
|
||||||
GridBase *grid = field.Grid();
|
GridBase *grid = field.Grid();
|
||||||
@ -206,7 +195,6 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
|
|||||||
header.floating_point = format;
|
header.floating_point = format;
|
||||||
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
|
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
|
||||||
GridMetaData(grid,header);
|
GridMetaData(grid,header);
|
||||||
GaugeStatistics(field,header);
|
|
||||||
MachineCharacteristics(header);
|
MachineCharacteristics(header);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -40,6 +40,8 @@ using namespace Grid;
|
|||||||
class NerscIO : public BinaryIO {
|
class NerscIO : public BinaryIO {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
typedef Lattice<vLorentzColourMatrixD> GaugeField;
|
||||||
|
|
||||||
static inline void truncate(std::string file){
|
static inline void truncate(std::string file){
|
||||||
std::ofstream fout(file,std::ios::out);
|
std::ofstream fout(file,std::ios::out);
|
||||||
}
|
}
|
||||||
@ -129,12 +131,12 @@ public:
|
|||||||
// Now the meat: the object readers
|
// Now the meat: the object readers
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<class vsimd>
|
template<class GaugeStats=PeriodicGaugeStatistics>
|
||||||
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
|
static inline void readConfiguration(GaugeField &Umu,
|
||||||
FieldMetaData& header,
|
FieldMetaData& header,
|
||||||
std::string file)
|
std::string file,
|
||||||
|
GaugeStats GaugeStatisticsCalculator=GaugeStats())
|
||||||
{
|
{
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
|
||||||
|
|
||||||
GridBase *grid = Umu.Grid();
|
GridBase *grid = Umu.Grid();
|
||||||
uint64_t offset = readHeader(file,Umu.Grid(),header);
|
uint64_t offset = readHeader(file,Umu.Grid(),header);
|
||||||
@ -153,23 +155,23 @@ public:
|
|||||||
// munger is a function of <floating point, Real, data_type>
|
// munger is a function of <floating point, Real, data_type>
|
||||||
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
|
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
|
||||||
if ( ieee32 || ieee32big ) {
|
if ( ieee32 || ieee32big ) {
|
||||||
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
|
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
|
||||||
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
|
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
}
|
}
|
||||||
if ( ieee64 || ieee64big ) {
|
if ( ieee64 || ieee64big ) {
|
||||||
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
|
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D>
|
||||||
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
|
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
}
|
}
|
||||||
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
|
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
|
||||||
if ( ieee32 || ieee32big ) {
|
if ( ieee32 || ieee32big ) {
|
||||||
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
|
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
|
||||||
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
|
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
}
|
}
|
||||||
if ( ieee64 || ieee64big ) {
|
if ( ieee64 || ieee64big ) {
|
||||||
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
|
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
|
||||||
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
|
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
}
|
}
|
||||||
@ -177,7 +179,7 @@ public:
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
GaugeStatistics(Umu,clone);
|
GaugeStats Stats; Stats(Umu,clone);
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
|
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
|
||||||
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
|
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
|
||||||
@ -203,15 +205,22 @@ public:
|
|||||||
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
|
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vsimd>
|
// Preferred interface
|
||||||
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
|
template<class GaugeStats=PeriodicGaugeStatistics>
|
||||||
|
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
|
||||||
|
std::string file,
|
||||||
|
std::string ens_label = std::string("DWF"))
|
||||||
|
{
|
||||||
|
writeConfiguration(Umu,file,0,1,ens_label);
|
||||||
|
}
|
||||||
|
template<class GaugeStats=PeriodicGaugeStatistics>
|
||||||
|
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
|
||||||
std::string file,
|
std::string file,
|
||||||
int two_row,
|
int two_row,
|
||||||
int bits32)
|
int bits32,
|
||||||
|
std::string ens_label = std::string("DWF"))
|
||||||
{
|
{
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
typedef vLorentzColourMatrixD vobj;
|
||||||
|
|
||||||
typedef iLorentzColourMatrix<vsimd> vobj;
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
FieldMetaData header;
|
FieldMetaData header;
|
||||||
@ -219,8 +228,8 @@ public:
|
|||||||
// Following should become arguments
|
// Following should become arguments
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
header.sequence_number = 1;
|
header.sequence_number = 1;
|
||||||
header.ensemble_id = "UKQCD";
|
header.ensemble_id = std::string("UKQCD");
|
||||||
header.ensemble_label = "DWF";
|
header.ensemble_label = ens_label;
|
||||||
|
|
||||||
typedef LorentzColourMatrixD fobj3D;
|
typedef LorentzColourMatrixD fobj3D;
|
||||||
typedef LorentzColour2x3D fobj2D;
|
typedef LorentzColour2x3D fobj2D;
|
||||||
@ -229,28 +238,28 @@ public:
|
|||||||
|
|
||||||
GridMetaData(grid,header);
|
GridMetaData(grid,header);
|
||||||
assert(header.nd==4);
|
assert(header.nd==4);
|
||||||
GaugeStatistics(Umu,header);
|
GaugeStats Stats; Stats(Umu,header);
|
||||||
MachineCharacteristics(header);
|
MachineCharacteristics(header);
|
||||||
|
|
||||||
uint64_t offset;
|
uint64_t offset;
|
||||||
|
|
||||||
// Sod it -- always write 3x3 double
|
// Sod it -- always write 3x3 double
|
||||||
header.floating_point = std::string("IEEE64BIG");
|
header.floating_point = std::string("IEEE64BIG");
|
||||||
header.data_type = std::string("4D_SU3_GAUGE_3x3");
|
header.data_type = std::string("4D_SU3_GAUGE_3x3");
|
||||||
GaugeSimpleUnmunger<fobj3D,sobj> munge;
|
GaugeSimpleUnmunger<fobj3D,sobj> munge;
|
||||||
if ( grid->IsBoss() ) {
|
if ( grid->IsBoss() ) {
|
||||||
truncate(file);
|
truncate(file);
|
||||||
offset = writeHeader(header,file);
|
offset = writeHeader(header,file);
|
||||||
}
|
}
|
||||||
grid->Broadcast(0,(void *)&offset,sizeof(offset));
|
grid->Broadcast(0,(void *)&offset,sizeof(offset));
|
||||||
|
|
||||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
|
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
header.checksum = nersc_csum;
|
header.checksum = nersc_csum;
|
||||||
if ( grid->IsBoss() ) {
|
if ( grid->IsBoss() ) {
|
||||||
writeHeader(header,file);
|
writeHeader(header,file);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
|
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
|
||||||
<<std::hex<<header.checksum
|
<<std::hex<<header.checksum
|
||||||
|
@ -154,7 +154,7 @@ public:
|
|||||||
grid->Barrier(); timer.Stop();
|
grid->Barrier(); timer.Stop();
|
||||||
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
|
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
|
||||||
|
|
||||||
GaugeStatistics(Umu, clone);
|
PeriodicGaugeStatistics Stats; Stats(Umu, clone);
|
||||||
|
|
||||||
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
||||||
|
|
||||||
|
@ -208,7 +208,7 @@ public:
|
|||||||
|
|
||||||
FieldMetaData clone(header);
|
FieldMetaData clone(header);
|
||||||
|
|
||||||
GaugeStatistics(Umu, clone);
|
PeriodicGaugeStatistics Stats; Stats(Umu, clone);
|
||||||
|
|
||||||
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ static constexpr int Ym = 5;
|
|||||||
static constexpr int Zm = 6;
|
static constexpr int Zm = 6;
|
||||||
static constexpr int Tm = 7;
|
static constexpr int Tm = 7;
|
||||||
|
|
||||||
static constexpr int Nc=3;
|
static constexpr int Nc=Config_Nc;
|
||||||
static constexpr int Ns=4;
|
static constexpr int Ns=4;
|
||||||
static constexpr int Nd=4;
|
static constexpr int Nd=4;
|
||||||
static constexpr int Nhs=2; // half spinor
|
static constexpr int Nhs=2; // half spinor
|
||||||
@ -77,13 +77,13 @@ const int SpinorIndex = 2;
|
|||||||
template<typename T> struct isSpinor {
|
template<typename T> struct isSpinor {
|
||||||
static constexpr bool value = (SpinorIndex==T::TensorLevel);
|
static constexpr bool value = (SpinorIndex==T::TensorLevel);
|
||||||
};
|
};
|
||||||
const int CoarseIndex = 4;
|
|
||||||
template<typename T> struct isCoarsened {
|
|
||||||
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
|
|
||||||
};
|
|
||||||
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
|
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
|
||||||
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
|
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
|
||||||
|
|
||||||
|
const int CoarseIndex = 4;
|
||||||
|
template<typename T> struct isCoarsened {
|
||||||
|
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
|
||||||
|
};
|
||||||
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
|
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
|
||||||
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
|
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@ class Action
|
|||||||
public:
|
public:
|
||||||
bool is_smeared = false;
|
bool is_smeared = false;
|
||||||
// Heatbath?
|
// Heatbath?
|
||||||
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
|
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
|
||||||
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
|
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
|
||||||
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
|
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
|
||||||
virtual std::string action_name() = 0; // return the action name
|
virtual std::string action_name() = 0; // return the action name
|
||||||
|
@ -291,12 +291,6 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
|
|||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
|
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
|
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
|
||||||
|
|
||||||
#ifndef GRID_CUDA
|
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
|
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
|
|
||||||
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
////////////////////
|
////////////////////
|
||||||
|
@ -89,8 +89,7 @@ public:
|
|||||||
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
|
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
|
||||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||||
virtual std::vector<int> Directions(void) =0;
|
|
||||||
virtual std::vector<int> Displacements(void)=0;
|
|
||||||
|
|
||||||
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
|
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
|
||||||
|
|
||||||
|
@ -153,8 +153,8 @@ public:
|
|||||||
typedef typename Impl::StencilImpl StencilImpl; \
|
typedef typename Impl::StencilImpl StencilImpl; \
|
||||||
typedef typename Impl::ImplParams ImplParams; \
|
typedef typename Impl::ImplParams ImplParams; \
|
||||||
typedef typename Impl::StencilImpl::View_type StencilView; \
|
typedef typename Impl::StencilImpl::View_type StencilView; \
|
||||||
typedef typename ViewMap<FermionField>::Type FermionFieldView; \
|
typedef const typename ViewMap<FermionField>::Type FermionFieldView; \
|
||||||
typedef typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
|
typedef const typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
|
||||||
|
|
||||||
#define INHERIT_IMPL_TYPES(Base) \
|
#define INHERIT_IMPL_TYPES(Base) \
|
||||||
INHERIT_GIMPL_TYPES(Base) \
|
INHERIT_GIMPL_TYPES(Base) \
|
||||||
@ -183,7 +183,8 @@ NAMESPACE_CHECK(ImplStaggered);
|
|||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
// Single flavour one component spinors with colour index. 5d vec
|
// Single flavour one component spinors with colour index. 5d vec
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
|
// Deprecate Vec5d
|
||||||
NAMESPACE_CHECK(ImplStaggered5dVec);
|
//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
|
||||||
|
//NAMESPACE_CHECK(ImplStaggered5dVec);
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,42 +97,30 @@ public:
|
|||||||
Coordinate icoor;
|
Coordinate icoor;
|
||||||
|
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
_Spinor tmp;
|
|
||||||
|
|
||||||
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
||||||
int s = acceleratorSIMTlane(Nsimd);
|
int s = acceleratorSIMTlane(Nsimd);
|
||||||
St.iCoorFromIindex(icoor,s);
|
St.iCoorFromIindex(icoor,s);
|
||||||
|
|
||||||
int mmu = mu % Nd;
|
int mmu = mu % Nd;
|
||||||
if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
|
|
||||||
|
|
||||||
int permute_lane = (sl==1)
|
auto UU0=coalescedRead(U(0)(mu));
|
||||||
|| ((distance== 1)&&(icoor[direction]==1))
|
auto UU1=coalescedRead(U(1)(mu));
|
||||||
|| ((distance==-1)&&(icoor[direction]==0));
|
|
||||||
|
|
||||||
if ( permute_lane ) {
|
//Decide whether we do a G-parity flavor twist
|
||||||
tmp(0) = chi(1);
|
//Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
|
||||||
tmp(1) = chi(0);
|
//It also assumes (but does not check) that abs(distance) == 1
|
||||||
} else {
|
int permute_lane = (sl==1)
|
||||||
tmp(0) = chi(0);
|
|| ((distance== 1)&&(icoor[direction]==1))
|
||||||
tmp(1) = chi(1);
|
|| ((distance==-1)&&(icoor[direction]==0));
|
||||||
}
|
|
||||||
|
|
||||||
auto UU0=coalescedRead(U(0)(mu));
|
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
|
||||||
auto UU1=coalescedRead(U(1)(mu));
|
|
||||||
|
|
||||||
mult(&phi(0),&UU0,&tmp(0));
|
//Apply the links
|
||||||
mult(&phi(1),&UU1,&tmp(1));
|
int f_upper = permute_lane ? 1 : 0;
|
||||||
|
int f_lower = !f_upper;
|
||||||
|
|
||||||
} else {
|
mult(&phi(0),&UU0,&chi(f_upper));
|
||||||
|
mult(&phi(1),&UU1,&chi(f_lower));
|
||||||
auto UU0=coalescedRead(U(0)(mu));
|
|
||||||
auto UU1=coalescedRead(U(1)(mu));
|
|
||||||
|
|
||||||
mult(&phi(0),&UU0,&chi(0));
|
|
||||||
mult(&phi(1),&UU1,&chi(1));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
typedef _Spinor vobj;
|
typedef _Spinor vobj;
|
||||||
|
@ -44,9 +44,6 @@ public:
|
|||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef StaggeredKernels<Impl> Kernels;
|
typedef StaggeredKernels<Impl> Kernels;
|
||||||
|
|
||||||
virtual std::vector<int> Directions(void) { return this->directions; };
|
|
||||||
virtual std::vector<int> Displacements(void){ return this->displacements;};
|
|
||||||
|
|
||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
|
@ -49,9 +49,6 @@ public:
|
|||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef StaggeredKernels<Impl> Kernels;
|
typedef StaggeredKernels<Impl> Kernels;
|
||||||
|
|
||||||
virtual std::vector<int> Directions(void) { return this->directions; };
|
|
||||||
virtual std::vector<int> Displacements(void){ return this->displacements;};
|
|
||||||
|
|
||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
|
@ -85,7 +85,7 @@ class MADWF
|
|||||||
maxiter =_maxiter;
|
maxiter =_maxiter;
|
||||||
};
|
};
|
||||||
|
|
||||||
void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
|
void operator() (const FermionFieldo &src,FermionFieldo &sol5)
|
||||||
{
|
{
|
||||||
std::cout << GridLogMessage<< " ************************************************" << std::endl;
|
std::cout << GridLogMessage<< " ************************************************" << std::endl;
|
||||||
std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl;
|
std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl;
|
||||||
@ -114,8 +114,16 @@ class MADWF
|
|||||||
///////////////////////////////////////
|
///////////////////////////////////////
|
||||||
//Import source, include Dminus factors
|
//Import source, include Dminus factors
|
||||||
///////////////////////////////////////
|
///////////////////////////////////////
|
||||||
Mato.ImportPhysicalFermionSource(src4,b);
|
GridBase *src_grid = src.Grid();
|
||||||
std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
|
|
||||||
|
assert( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
|
||||||
|
|
||||||
|
if ( src_grid == Mato.GaugeGrid() ) {
|
||||||
|
Mato.ImportPhysicalFermionSource(src,b);
|
||||||
|
} else {
|
||||||
|
b=src;
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage << " src " <<norm2(src)<<std::endl;
|
||||||
std::cout << GridLogMessage << " b " <<norm2(b)<<std::endl;
|
std::cout << GridLogMessage << " b " <<norm2(b)<<std::endl;
|
||||||
|
|
||||||
defect = b;
|
defect = b;
|
||||||
|
@ -47,9 +47,6 @@ public:
|
|||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
virtual std::vector<int> Directions(void) { return this->directions; };
|
|
||||||
virtual std::vector<int> Displacements(void){ return this->displacements;};
|
|
||||||
|
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
// Performance monitoring
|
// Performance monitoring
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
|
@ -72,19 +72,23 @@ public:
|
|||||||
|
|
||||||
StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
|
StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
|
||||||
|
|
||||||
static accelerator_inline void multLink(SiteSpinor &phi,
|
template<class _Spinor>
|
||||||
|
static accelerator_inline void multLink(_Spinor &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
const SiteSpinor &chi,
|
const _Spinor &chi,
|
||||||
int mu)
|
int mu)
|
||||||
{
|
{
|
||||||
mult(&phi(), &U(mu), &chi());
|
auto UU = coalescedRead(U(mu));
|
||||||
|
mult(&phi(), &UU, &chi());
|
||||||
}
|
}
|
||||||
static accelerator_inline void multLinkAdd(SiteSpinor &phi,
|
template<class _Spinor>
|
||||||
|
static accelerator_inline void multLinkAdd(_Spinor &phi,
|
||||||
const SiteDoubledGaugeField &U,
|
const SiteDoubledGaugeField &U,
|
||||||
const SiteSpinor &chi,
|
const _Spinor &chi,
|
||||||
int mu)
|
int mu)
|
||||||
{
|
{
|
||||||
mac(&phi(), &U(mu), &chi());
|
auto UU = coalescedRead(U(mu));
|
||||||
|
mac(&phi(), &UU, &chi());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class ref>
|
template <class ref>
|
||||||
|
@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Generic Nc kernels
|
// Generic Nc kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<int Naik> accelerator_inline
|
template<int Naik>
|
||||||
|
static accelerator_inline
|
||||||
void DhopSiteGeneric(StencilView &st,
|
void DhopSiteGeneric(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteGenericInt(StencilView &st,
|
void DhopSiteGenericInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteGenericExt(StencilView &st,
|
void DhopSiteGenericExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Nc=3 specific kernels
|
// Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteHand(StencilView &st,
|
void DhopSiteHand(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteHandInt(StencilView &st,
|
void DhopSiteHandInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteHandExt(StencilView &st,
|
void DhopSiteHandExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Asm Nc=3 specific kernels
|
// Asm Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void DhopSiteAsm(StencilView &st,
|
void DhopSiteAsm(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
@ -245,7 +245,7 @@ public:
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
protected:
|
||||||
// here fixing the 4 dimensions, make it more general?
|
// here fixing the 4 dimensions, make it more general?
|
||||||
|
|
||||||
RealD csw_r; // Clover coefficient - spatial
|
RealD csw_r; // Clover coefficient - spatial
|
||||||
|
@ -61,7 +61,7 @@ public:
|
|||||||
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
|
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
|
||||||
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
|
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
|
||||||
|
|
||||||
accelerator_inline int CommDatumSize(void) {
|
accelerator_inline int CommDatumSize(void) const {
|
||||||
return sizeof(SiteHalfCommSpinor);
|
return sizeof(SiteHalfCommSpinor);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -69,7 +69,7 @@ public:
|
|||||||
/* Compress includes precision change if mpi data is not same */
|
/* Compress includes precision change if mpi data is not same */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
template<class _SiteHalfSpinor, class _SiteSpinor>
|
template<class _SiteHalfSpinor, class _SiteSpinor>
|
||||||
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
|
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
|
||||||
_SiteHalfSpinor tmp;
|
_SiteHalfSpinor tmp;
|
||||||
projector::Proj(tmp,in,mu,dag);
|
projector::Proj(tmp,in,mu,dag);
|
||||||
vstream(buf[o],tmp);
|
vstream(buf[o],tmp);
|
||||||
@ -81,7 +81,7 @@ public:
|
|||||||
accelerator_inline void Exchange(SiteHalfSpinor *mp,
|
accelerator_inline void Exchange(SiteHalfSpinor *mp,
|
||||||
const SiteHalfSpinor * __restrict__ vp0,
|
const SiteHalfSpinor * __restrict__ vp0,
|
||||||
const SiteHalfSpinor * __restrict__ vp1,
|
const SiteHalfSpinor * __restrict__ vp1,
|
||||||
Integer type,Integer o){
|
Integer type,Integer o) const {
|
||||||
SiteHalfSpinor tmp1;
|
SiteHalfSpinor tmp1;
|
||||||
SiteHalfSpinor tmp2;
|
SiteHalfSpinor tmp2;
|
||||||
exchange(tmp1,tmp2,vp0[o],vp1[o],type);
|
exchange(tmp1,tmp2,vp0[o],vp1[o],type);
|
||||||
@ -93,7 +93,7 @@ public:
|
|||||||
/* Have a decompression step if mpi data is not same */
|
/* Have a decompression step if mpi data is not same */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
|
accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
|
||||||
SiteHalfSpinor * __restrict__ in, Integer o) {
|
SiteHalfSpinor * __restrict__ in, Integer o) const {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,7 +103,7 @@ public:
|
|||||||
accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
|
accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
|
||||||
SiteHalfSpinor * __restrict__ out1,
|
SiteHalfSpinor * __restrict__ out1,
|
||||||
const SiteSpinor * __restrict__ in,
|
const SiteSpinor * __restrict__ in,
|
||||||
Integer j,Integer k, Integer m,Integer type)
|
Integer j,Integer k, Integer m,Integer type) const
|
||||||
{
|
{
|
||||||
SiteHalfSpinor temp1, temp2;
|
SiteHalfSpinor temp1, temp2;
|
||||||
SiteHalfSpinor temp3, temp4;
|
SiteHalfSpinor temp3, temp4;
|
||||||
@ -117,7 +117,7 @@ public:
|
|||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Pass the info to the stencil */
|
/* Pass the info to the stencil */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
accelerator_inline bool DecompressionStep(void) { return false; }
|
accelerator_inline bool DecompressionStep(void) const { return false; }
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -142,7 +142,7 @@ public:
|
|||||||
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
|
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
|
||||||
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
|
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
|
||||||
|
|
||||||
accelerator_inline int CommDatumSize(void) {
|
accelerator_inline int CommDatumSize(void) const {
|
||||||
return sizeof(SiteHalfCommSpinor);
|
return sizeof(SiteHalfCommSpinor);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -150,7 +150,7 @@ public:
|
|||||||
/* Compress includes precision change if mpi data is not same */
|
/* Compress includes precision change if mpi data is not same */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
template<class _SiteHalfSpinor, class _SiteSpinor>
|
template<class _SiteHalfSpinor, class _SiteSpinor>
|
||||||
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
|
accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) const {
|
||||||
_SiteHalfSpinor hsp;
|
_SiteHalfSpinor hsp;
|
||||||
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
|
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
|
||||||
projector::Proj(hsp,in,mu,dag);
|
projector::Proj(hsp,in,mu,dag);
|
||||||
@ -163,7 +163,7 @@ public:
|
|||||||
accelerator_inline void Exchange(SiteHalfSpinor *mp,
|
accelerator_inline void Exchange(SiteHalfSpinor *mp,
|
||||||
SiteHalfSpinor *vp0,
|
SiteHalfSpinor *vp0,
|
||||||
SiteHalfSpinor *vp1,
|
SiteHalfSpinor *vp1,
|
||||||
Integer type,Integer o){
|
Integer type,Integer o) const {
|
||||||
SiteHalfSpinor vt0,vt1;
|
SiteHalfSpinor vt0,vt1;
|
||||||
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
|
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
|
||||||
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
|
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
|
||||||
@ -175,7 +175,7 @@ public:
|
|||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Have a decompression step if mpi data is not same */
|
/* Have a decompression step if mpi data is not same */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){
|
accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
|
||||||
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
|
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
|
||||||
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
|
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
|
||||||
}
|
}
|
||||||
@ -186,7 +186,7 @@ public:
|
|||||||
accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
|
accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
|
||||||
SiteHalfSpinor *out1,
|
SiteHalfSpinor *out1,
|
||||||
const SiteSpinor *in,
|
const SiteSpinor *in,
|
||||||
Integer j,Integer k, Integer m,Integer type){
|
Integer j,Integer k, Integer m,Integer type) const {
|
||||||
SiteHalfSpinor temp1, temp2,temp3,temp4;
|
SiteHalfSpinor temp1, temp2,temp3,temp4;
|
||||||
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
|
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
|
||||||
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
|
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
|
||||||
@ -200,7 +200,7 @@ public:
|
|||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Pass the info to the stencil */
|
/* Pass the info to the stencil */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
accelerator_inline bool DecompressionStep(void) { return true; }
|
accelerator_inline bool DecompressionStep(void) const { return true; }
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -63,9 +63,6 @@ public:
|
|||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef WilsonKernels<Impl> Kernels;
|
typedef WilsonKernels<Impl> Kernels;
|
||||||
|
|
||||||
virtual std::vector<int> Directions(void) { return this->directions; };
|
|
||||||
virtual std::vector<int> Displacements(void){ return this->displacements;};
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
@ -72,9 +72,6 @@ public:
|
|||||||
typedef WilsonKernels<Impl> Kernels;
|
typedef WilsonKernels<Impl> Kernels;
|
||||||
PmuStat stat;
|
PmuStat stat;
|
||||||
|
|
||||||
virtual std::vector<int> Directions(void) { return this->directions; };
|
|
||||||
virtual std::vector<int> Displacements(void){ return this->displacements;};
|
|
||||||
|
|
||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
|
@ -72,7 +72,7 @@ public:
|
|||||||
typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
|
typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
|
||||||
typedef WilsonImplParams ImplParams;
|
typedef WilsonImplParams ImplParams;
|
||||||
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
|
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
|
||||||
typedef typename StencilImpl::View_type StencilView;
|
typedef const typename StencilImpl::View_type StencilView;
|
||||||
|
|
||||||
ImplParams Params;
|
ImplParams Params;
|
||||||
|
|
||||||
@ -106,11 +106,15 @@ public:
|
|||||||
const _SpinorField & phi,
|
const _SpinorField & phi,
|
||||||
int mu)
|
int mu)
|
||||||
{
|
{
|
||||||
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
autoView( out_v, out, AcceleratorWrite);
|
autoView( out_v, out, AcceleratorWrite);
|
||||||
autoView( phi_v, phi, AcceleratorRead);
|
autoView( phi_v, phi, AcceleratorRead);
|
||||||
autoView( Umu_v, Umu, AcceleratorRead);
|
autoView( Umu_v, Umu, AcceleratorRead);
|
||||||
accelerator_for(sss,out.Grid()->oSites(),1,{
|
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
|
||||||
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
|
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
|
||||||
|
calcSpinor tmp;
|
||||||
|
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
|
||||||
|
coalescedWrite(out_v[sss],tmp);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -180,18 +184,22 @@ public:
|
|||||||
mat = TraceIndex<SpinIndex>(P);
|
mat = TraceIndex<SpinIndex>(P);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
|
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
|
||||||
|
{
|
||||||
for (int mu = 0; mu < Nd; mu++)
|
for (int mu = 0; mu < Nd; mu++)
|
||||||
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
|
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
|
||||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
|
{
|
||||||
|
#undef USE_OLD_INSERT_FORCE
|
||||||
int Ls=Btilde.Grid()->_fdimensions[0];
|
int Ls=Btilde.Grid()->_fdimensions[0];
|
||||||
|
autoView( mat_v , mat, AcceleratorWrite);
|
||||||
|
#ifdef USE_OLD_INSERT_FORCE
|
||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
{
|
{
|
||||||
|
const int Nsimd = SiteSpinor::Nsimd();
|
||||||
autoView( tmp_v , tmp, AcceleratorWrite);
|
autoView( tmp_v , tmp, AcceleratorWrite);
|
||||||
autoView( Btilde_v , Btilde, AcceleratorRead);
|
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||||
autoView( Atilde_v , Atilde, AcceleratorRead);
|
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||||
@ -204,6 +212,29 @@ public:
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
||||||
|
#else
|
||||||
|
{
|
||||||
|
const int Nsimd = SiteSpinor::Nsimd();
|
||||||
|
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||||
|
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||||
|
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
|
||||||
|
int sU=sss;
|
||||||
|
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
|
||||||
|
ColorMatrixType sum;
|
||||||
|
zeroit(sum);
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
int sF = s+Ls*sU;
|
||||||
|
for(int spn=0;spn<Ns;spn++){ //sum over spin
|
||||||
|
auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
|
||||||
|
auto aa = coalescedRead(Atilde_v[sF]()(spn) );
|
||||||
|
auto op = outerProduct(bb,aa);
|
||||||
|
sum = sum + op;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
coalescedWrite(mat_v[sU](mu)(), sum);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -49,9 +49,17 @@ public:
|
|||||||
|
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
typedef FermionOperator<Impl> Base;
|
typedef FermionOperator<Impl> Base;
|
||||||
|
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
#define SYCL_HACK
|
||||||
|
#endif
|
||||||
|
#ifdef SYCL_HACK
|
||||||
|
static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
|
||||||
|
#endif
|
||||||
|
|
||||||
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
int interior=1,int exterior=1) ;
|
int interior=1,int exterior=1) ;
|
||||||
|
@ -79,8 +79,6 @@ public:
|
|||||||
_Mat.M(in,tmp);
|
_Mat.M(in,tmp);
|
||||||
G5R5(out,tmp);
|
G5R5(out,tmp);
|
||||||
}
|
}
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -129,8 +127,6 @@ public:
|
|||||||
_Mat.M(in,tmp);
|
_Mat.M(in,tmp);
|
||||||
out=g5*tmp;
|
out=g5*tmp;
|
||||||
}
|
}
|
||||||
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
|
|
||||||
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
|
|||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu)
|
unsigned int mu)
|
||||||
{
|
{
|
||||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
#if (!defined(GRID_HIP))
|
||||||
Gamma::Algebra Gmu [] = {
|
Gamma::Algebra Gmu [] = {
|
||||||
Gamma::Algebra::GammaX,
|
Gamma::Algebra::GammaX,
|
||||||
Gamma::Algebra::GammaY,
|
Gamma::Algebra::GammaY,
|
||||||
@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
|
|
||||||
PropagatorField tmp(UGrid);
|
PropagatorField tmp(UGrid);
|
||||||
PropagatorField Utmp(UGrid);
|
PropagatorField Utmp(UGrid);
|
||||||
LatticeInteger zz (UGrid); zz=0.0;
|
PropagatorField zz (UGrid); zz=0.0;
|
||||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
for (int s=0;s<Ls;s++) {
|
for (int s=0;s<Ls;s++) {
|
||||||
|
|
||||||
@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
#if (!defined(GRID_HIP))
|
||||||
int tshift = (mu == Nd-1) ? 1 : 0;
|
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// GENERAL CAYLEY CASE
|
// GENERAL CAYLEY CASE
|
||||||
@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
PropagatorField tmp(UGrid);
|
PropagatorField tmp(UGrid);
|
||||||
PropagatorField Utmp(UGrid);
|
PropagatorField Utmp(UGrid);
|
||||||
|
|
||||||
LatticeInteger zz (UGrid); zz=0.0;
|
PropagatorField zz (UGrid); zz=0.0;
|
||||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
|
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
@ -880,11 +880,23 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<RealD> G_s(Ls,1.0);
|
std::vector<RealD> G_s(Ls,1.0);
|
||||||
|
Integer sign = 1; // sign flip for vector/tadpole
|
||||||
if ( curr_type == Current::Axial ) {
|
if ( curr_type == Current::Axial ) {
|
||||||
for(int s=0;s<Ls/2;s++){
|
for(int s=0;s<Ls/2;s++){
|
||||||
G_s[s] = -1.0;
|
G_s[s] = -1.0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if ( curr_type == Current::Tadpole ) {
|
||||||
|
auto b=this->_b;
|
||||||
|
auto c=this->_c;
|
||||||
|
if ( b == 1 && c == 0 ) {
|
||||||
|
sign = -1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
|
||||||
|
assert(b==1 && c==0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
|
|
||||||
@ -907,7 +919,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
|
|
||||||
tmp = Cshift(tmp,mu,1);
|
tmp = Cshift(tmp,mu,1);
|
||||||
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
|
Impl::multLinkField(Utmp,this->Umu,tmp,mu);
|
||||||
tmp = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
|
tmp = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
|
||||||
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
|
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
|
||||||
L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated
|
L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated
|
||||||
|
|
||||||
|
@ -680,7 +680,8 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
|
|||||||
gauge2 =(uint64_t)&UU[sU]( Z ); \
|
gauge2 =(uint64_t)&UU[sU]( Z ); \
|
||||||
gauge3 =(uint64_t)&UU[sU]( T );
|
gauge3 =(uint64_t)&UU[sU]( T );
|
||||||
|
|
||||||
|
#undef STAG_VEC5D
|
||||||
|
#ifdef STAG_VEC5D
|
||||||
// This is the single precision 5th direction vectorised kernel
|
// This is the single precision 5th direction vectorised kernel
|
||||||
#include <Grid/simd/Intel512single.h>
|
#include <Grid/simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
|
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
|
||||||
@ -790,7 +791,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR3 __asm__ ( \
|
#define PERMUTE_DIR3 __asm__ ( \
|
||||||
|
@ -32,25 +32,50 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
#define LOAD_CHI(b) \
|
#ifdef GRID_SIMT
|
||||||
|
|
||||||
|
#define LOAD_CHI(ptype,b) \
|
||||||
|
const SiteSpinor & ref (b[offset]); \
|
||||||
|
Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane); \
|
||||||
|
Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane); \
|
||||||
|
Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
|
||||||
|
|
||||||
|
#define LOAD_CHI_COMMS(b) \
|
||||||
const SiteSpinor & ref (b[offset]); \
|
const SiteSpinor & ref (b[offset]); \
|
||||||
Chi_0=ref()()(0);\
|
Chi_0=coalescedRead(ref()()(0),lane); \
|
||||||
Chi_1=ref()()(1);\
|
Chi_1=coalescedRead(ref()()(1),lane); \
|
||||||
Chi_2=ref()()(2);
|
Chi_2=coalescedRead(ref()()(2),lane);
|
||||||
|
|
||||||
|
#define PERMUTE_DIR(dir) ;
|
||||||
|
#else
|
||||||
|
#define LOAD_CHI(ptype,b) LOAD_CHI_COMMS(b)
|
||||||
|
|
||||||
|
#define LOAD_CHI_COMMS(b) \
|
||||||
|
const SiteSpinor & ref (b[offset]); \
|
||||||
|
Chi_0=ref()()(0); \
|
||||||
|
Chi_1=ref()()(1); \
|
||||||
|
Chi_2=ref()()(2);
|
||||||
|
|
||||||
|
#define PERMUTE_DIR(dir) \
|
||||||
|
permute##dir(Chi_0,Chi_0); \
|
||||||
|
permute##dir(Chi_1,Chi_1); \
|
||||||
|
permute##dir(Chi_2,Chi_2);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// To splat or not to splat depends on the implementation
|
// To splat or not to splat depends on the implementation
|
||||||
#define MULT(A,UChi) \
|
#define MULT(A,UChi) \
|
||||||
auto & ref(U[sU](A)); \
|
auto & ref(U[sU](A)); \
|
||||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
U_00=coalescedRead(ref()(0,0),lane); \
|
||||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
U_10=coalescedRead(ref()(1,0),lane); \
|
||||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
U_20=coalescedRead(ref()(2,0),lane); \
|
||||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
U_01=coalescedRead(ref()(0,1),lane); \
|
||||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
U_11=coalescedRead(ref()(1,1),lane); \
|
||||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
U_21=coalescedRead(ref()(2,1),lane); \
|
||||||
Impl::loadLinkElement(U_02,ref()(0,2)); \
|
U_02=coalescedRead(ref()(0,2),lane); \
|
||||||
Impl::loadLinkElement(U_12,ref()(1,2)); \
|
U_12=coalescedRead(ref()(1,2),lane); \
|
||||||
Impl::loadLinkElement(U_22,ref()(2,2)); \
|
U_22=coalescedRead(ref()(2,2),lane); \
|
||||||
UChi ## _0 = U_00*Chi_0; \
|
UChi ## _0 = U_00*Chi_0; \
|
||||||
UChi ## _1 = U_10*Chi_0;\
|
UChi ## _1 = U_10*Chi_0;\
|
||||||
UChi ## _2 = U_20*Chi_0;\
|
UChi ## _2 = U_20*Chi_0;\
|
||||||
@ -63,15 +88,15 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
#define MULT_ADD(U,A,UChi) \
|
#define MULT_ADD(U,A,UChi) \
|
||||||
auto & ref(U[sU](A)); \
|
auto & ref(U[sU](A)); \
|
||||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
U_00=coalescedRead(ref()(0,0),lane); \
|
||||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
U_10=coalescedRead(ref()(1,0),lane); \
|
||||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
U_20=coalescedRead(ref()(2,0),lane); \
|
||||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
U_01=coalescedRead(ref()(0,1),lane); \
|
||||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
U_11=coalescedRead(ref()(1,1),lane); \
|
||||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
U_21=coalescedRead(ref()(2,1),lane); \
|
||||||
Impl::loadLinkElement(U_02,ref()(0,2)); \
|
U_02=coalescedRead(ref()(0,2),lane); \
|
||||||
Impl::loadLinkElement(U_12,ref()(1,2)); \
|
U_12=coalescedRead(ref()(1,2),lane); \
|
||||||
Impl::loadLinkElement(U_22,ref()(2,2)); \
|
U_22=coalescedRead(ref()(2,2),lane); \
|
||||||
UChi ## _0 += U_00*Chi_0; \
|
UChi ## _0 += U_00*Chi_0; \
|
||||||
UChi ## _1 += U_10*Chi_0;\
|
UChi ## _1 += U_10*Chi_0;\
|
||||||
UChi ## _2 += U_20*Chi_0;\
|
UChi ## _2 += U_20*Chi_0;\
|
||||||
@ -83,24 +108,18 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
UChi ## _2 += U_22*Chi_2;
|
UChi ## _2 += U_22*Chi_2;
|
||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
|
||||||
permute##dir(Chi_0,Chi_0); \
|
|
||||||
permute##dir(Chi_1,Chi_1); \
|
|
||||||
permute##dir(Chi_2,Chi_2);
|
|
||||||
|
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
offset = SE->_offset; \
|
offset = SE->_offset; \
|
||||||
local = SE->_is_local; \
|
local = SE->_is_local; \
|
||||||
perm = SE->_permute; \
|
perm = SE->_permute; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
LOAD_CHI(in); \
|
LOAD_CHI(Perm,in); \
|
||||||
if ( perm) { \
|
if ( perm) { \
|
||||||
PERMUTE_DIR(Perm); \
|
PERMUTE_DIR(Perm); \
|
||||||
} \
|
} \
|
||||||
} else { \
|
} else { \
|
||||||
LOAD_CHI(buf); \
|
LOAD_CHI_COMMS(buf); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
|
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
|
||||||
@ -116,19 +135,18 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
|
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
|
||||||
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
offset = SE->_offset; \
|
offset = SE->_offset; \
|
||||||
local = SE->_is_local; \
|
local = SE->_is_local; \
|
||||||
perm = SE->_permute; \
|
perm = SE->_permute; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
LOAD_CHI(in); \
|
LOAD_CHI(Perm,in); \
|
||||||
if ( perm) { \
|
if ( perm) { \
|
||||||
PERMUTE_DIR(Perm); \
|
PERMUTE_DIR(Perm); \
|
||||||
} \
|
} \
|
||||||
} else if ( st.same_node[Dir] ) { \
|
} else if ( st.same_node[Dir] ) { \
|
||||||
LOAD_CHI(buf); \
|
LOAD_CHI_COMMS(buf); \
|
||||||
} \
|
} \
|
||||||
if (local || st.same_node[Dir] ) { \
|
if (local || st.same_node[Dir] ) { \
|
||||||
MULT_ADD(U,Dir,even); \
|
MULT_ADD(U,Dir,even); \
|
||||||
@ -140,13 +158,35 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
local = SE->_is_local; \
|
local = SE->_is_local; \
|
||||||
if ((!local) && (!st.same_node[Dir]) ) { \
|
if ((!local) && (!st.same_node[Dir]) ) { \
|
||||||
nmu++; \
|
nmu++; \
|
||||||
{ LOAD_CHI(buf); } \
|
{ LOAD_CHI_COMMS(buf); } \
|
||||||
{ MULT_ADD(U,Dir,even); } \
|
{ MULT_ADD(U,Dir,even); } \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define HAND_DECLARATIONS(Simd) \
|
||||||
|
Simd even_0; \
|
||||||
|
Simd even_1; \
|
||||||
|
Simd even_2; \
|
||||||
|
Simd odd_0; \
|
||||||
|
Simd odd_1; \
|
||||||
|
Simd odd_2; \
|
||||||
|
\
|
||||||
|
Simd Chi_0; \
|
||||||
|
Simd Chi_1; \
|
||||||
|
Simd Chi_2; \
|
||||||
|
\
|
||||||
|
Simd U_00; \
|
||||||
|
Simd U_10; \
|
||||||
|
Simd U_20; \
|
||||||
|
Simd U_01; \
|
||||||
|
Simd U_11; \
|
||||||
|
Simd U_21; \
|
||||||
|
Simd U_02; \
|
||||||
|
Simd U_12; \
|
||||||
|
Simd U_22;
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -155,28 +195,14 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
|||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
Simd even_0; // 12 regs on knc
|
|
||||||
Simd even_1;
|
|
||||||
Simd even_2;
|
|
||||||
Simd odd_0; // 12 regs on knc
|
|
||||||
Simd odd_1;
|
|
||||||
Simd odd_2;
|
|
||||||
|
|
||||||
Simd Chi_0; // two spinor; 6 regs
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
Simd Chi_1;
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
Simd Chi_2;
|
typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
|
||||||
|
HAND_DECLARATIONS(Simt);
|
||||||
|
|
||||||
Simd U_00; // two rows of U matrix
|
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
|
||||||
Simd U_10;
|
calcSiteSpinor result;
|
||||||
Simd U_20;
|
|
||||||
Simd U_01;
|
|
||||||
Simd U_11;
|
|
||||||
Simd U_21; // 2 reg left.
|
|
||||||
Simd U_02;
|
|
||||||
Simd U_12;
|
|
||||||
Simd U_22;
|
|
||||||
|
|
||||||
SiteSpinor result;
|
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
@ -215,13 +241,13 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
|||||||
result()()(1) = even_1 + odd_1;
|
result()()(1) = even_1 + odd_1;
|
||||||
result()()(2) = even_2 + odd_2;
|
result()()(2) = even_2 + odd_2;
|
||||||
}
|
}
|
||||||
vstream(out[sF],result);
|
coalescedWrite(out[sF],result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -230,28 +256,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
Simd even_0; // 12 regs on knc
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
Simd even_1;
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
Simd even_2;
|
typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
|
||||||
Simd odd_0; // 12 regs on knc
|
HAND_DECLARATIONS(Simt);
|
||||||
Simd odd_1;
|
|
||||||
Simd odd_2;
|
|
||||||
|
|
||||||
Simd Chi_0; // two spinor; 6 regs
|
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
|
||||||
Simd Chi_1;
|
calcSiteSpinor result;
|
||||||
Simd Chi_2;
|
|
||||||
|
|
||||||
Simd U_00; // two rows of U matrix
|
|
||||||
Simd U_10;
|
|
||||||
Simd U_20;
|
|
||||||
Simd U_01;
|
|
||||||
Simd U_11;
|
|
||||||
Simd U_21; // 2 reg left.
|
|
||||||
Simd U_02;
|
|
||||||
Simd U_12;
|
|
||||||
Simd U_22;
|
|
||||||
|
|
||||||
SiteSpinor result;
|
|
||||||
int offset, ptype, local, perm;
|
int offset, ptype, local, perm;
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
@ -261,8 +272,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|||||||
// int sF=s+LLs*sU;
|
// int sF=s+LLs*sU;
|
||||||
{
|
{
|
||||||
|
|
||||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
zeroit(even_0); zeroit(even_1); zeroit(even_2);
|
||||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
zeroit(odd_0); zeroit(odd_1); zeroit(odd_2);
|
||||||
|
|
||||||
skew = 0;
|
skew = 0;
|
||||||
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
|
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
|
||||||
@ -294,13 +305,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|||||||
result()()(1) = even_1 + odd_1;
|
result()()(1) = even_1 + odd_1;
|
||||||
result()()(2) = even_2 + odd_2;
|
result()()(2) = even_2 + odd_2;
|
||||||
}
|
}
|
||||||
vstream(out[sF],result);
|
coalescedWrite(out[sF],result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -309,28 +320,13 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
Simd even_0; // 12 regs on knc
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
Simd even_1;
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
Simd even_2;
|
typedef decltype( coalescedRead( in[0]()()(0) )) Simt;
|
||||||
Simd odd_0; // 12 regs on knc
|
HAND_DECLARATIONS(Simt);
|
||||||
Simd odd_1;
|
|
||||||
Simd odd_2;
|
|
||||||
|
|
||||||
Simd Chi_0; // two spinor; 6 regs
|
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor;
|
||||||
Simd Chi_1;
|
calcSiteSpinor result;
|
||||||
Simd Chi_2;
|
|
||||||
|
|
||||||
Simd U_00; // two rows of U matrix
|
|
||||||
Simd U_10;
|
|
||||||
Simd U_20;
|
|
||||||
Simd U_01;
|
|
||||||
Simd U_11;
|
|
||||||
Simd U_21; // 2 reg left.
|
|
||||||
Simd U_02;
|
|
||||||
Simd U_12;
|
|
||||||
Simd U_22;
|
|
||||||
|
|
||||||
SiteSpinor result;
|
|
||||||
int offset, ptype, local;
|
int offset, ptype, local;
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
@ -340,8 +336,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
// int sF=s+LLs*sU;
|
// int sF=s+LLs*sU;
|
||||||
{
|
{
|
||||||
|
|
||||||
even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
|
zeroit(even_0); zeroit(even_1); zeroit(even_2);
|
||||||
odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
|
zeroit(odd_0); zeroit(odd_1); zeroit(odd_2);
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
skew = 0;
|
skew = 0;
|
||||||
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
|
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
|
||||||
@ -374,7 +370,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
result()()(1) = even_1 + odd_1;
|
result()()(1) = even_1 + odd_1;
|
||||||
result()()(2) = even_2 + odd_2;
|
result()()(2) = even_2 + odd_2;
|
||||||
}
|
}
|
||||||
out[sF] = out[sF] + result;
|
coalescedWrite(out[sF] , out(sF)+ result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -397,6 +393,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||||
*/
|
*/
|
||||||
#undef LOAD_CHI
|
#undef LOAD_CHI
|
||||||
|
#undef HAND_DECLARATIONS
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -35,39 +35,32 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
|
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
|
||||||
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||||
if (SE->_is_local ) { \
|
if (SE->_is_local ) { \
|
||||||
if (SE->_permute) { \
|
int perm= SE->_permute; \
|
||||||
chi_p = χ \
|
chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
|
||||||
permute(chi, in[SE->_offset], ptype); \
|
|
||||||
} else { \
|
|
||||||
chi_p = &in[SE->_offset]; \
|
|
||||||
} \
|
|
||||||
} else { \
|
} else { \
|
||||||
chi_p = &buf[SE->_offset]; \
|
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||||
} \
|
} \
|
||||||
multLink(Uchi, U[sU], *chi_p, Dir);
|
acceleratorSynchronise(); \
|
||||||
|
multLink(Uchi, U[sU], chi, Dir);
|
||||||
|
|
||||||
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
|
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
|
||||||
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||||
if (SE->_is_local ) { \
|
if (SE->_is_local ) { \
|
||||||
if (SE->_permute) { \
|
int perm= SE->_permute; \
|
||||||
chi_p = χ \
|
chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
|
||||||
permute(chi, in[SE->_offset], ptype); \
|
|
||||||
} else { \
|
|
||||||
chi_p = &in[SE->_offset]; \
|
|
||||||
} \
|
|
||||||
} else if ( st.same_node[Dir] ) { \
|
} else if ( st.same_node[Dir] ) { \
|
||||||
chi_p = &buf[SE->_offset]; \
|
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||||
} \
|
} \
|
||||||
if (SE->_is_local || st.same_node[Dir] ) { \
|
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||||
multLink(Uchi, U[sU], *chi_p, Dir); \
|
multLink(Uchi, U[sU], chi, Dir); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
|
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
|
||||||
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
SE = st.GetEntry(ptype, Dir+skew, sF); \
|
||||||
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||||
nmu++; \
|
nmu++; \
|
||||||
chi_p = &buf[SE->_offset]; \
|
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||||
multLink(Uchi, U[sU], *chi_p, Dir); \
|
multLink(Uchi, U[sU], chi, Dir); \
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -78,18 +71,20 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
|||||||
// Int, Ext, Int+Ext cases for comms overlap
|
// Int, Ext, Int+Ext cases for comms overlap
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out, int dag)
|
const FermionFieldView &in, FermionFieldView &out, int dag)
|
||||||
{
|
{
|
||||||
const SiteSpinor *chi_p;
|
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||||
SiteSpinor chi;
|
calcSpinor chi;
|
||||||
SiteSpinor Uchi;
|
calcSpinor Uchi;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
int skew;
|
int skew;
|
||||||
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
//
|
//
|
||||||
@ -118,7 +113,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
|||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
Uchi = - Uchi;
|
Uchi = - Uchi;
|
||||||
}
|
}
|
||||||
vstream(out[sF], Uchi);
|
coalescedWrite(out[sF], Uchi,lane);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -126,17 +121,20 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
|||||||
// Only contributions from interior of our node
|
// Only contributions from interior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
const SiteSpinor *chi_p;
|
{
|
||||||
SiteSpinor chi;
|
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||||
SiteSpinor Uchi;
|
calcSpinor chi;
|
||||||
|
calcSpinor Uchi;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
int skew ;
|
int skew ;
|
||||||
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
// int sF=LLs*sU+s;
|
// int sF=LLs*sU+s;
|
||||||
@ -165,7 +163,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
|||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
Uchi = - Uchi;
|
Uchi = - Uchi;
|
||||||
}
|
}
|
||||||
vstream(out[sF], Uchi);
|
coalescedWrite(out[sF], Uchi,lane);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -174,18 +172,21 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
|||||||
// Only contributions from exterior of our node
|
// Only contributions from exterior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag) {
|
const FermionFieldView &in, FermionFieldView &out,int dag)
|
||||||
const SiteSpinor *chi_p;
|
{
|
||||||
// SiteSpinor chi;
|
typedef decltype(coalescedRead(in[0])) calcSpinor;
|
||||||
SiteSpinor Uchi;
|
calcSpinor chi;
|
||||||
|
calcSpinor Uchi;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
int skew ;
|
int skew ;
|
||||||
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
// for(int s=0;s<LLs;s++){
|
// for(int s=0;s<LLs;s++){
|
||||||
// int sF=LLs*sU+s;
|
// int sF=LLs*sU+s;
|
||||||
@ -212,10 +213,11 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
|||||||
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
}
|
}
|
||||||
if ( nmu ) {
|
if ( nmu ) {
|
||||||
|
auto _out = coalescedRead(out[sF],lane);
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
out[sF] = out[sF] - Uchi;
|
coalescedWrite(out[sF], _out-Uchi,lane);
|
||||||
} else {
|
} else {
|
||||||
out[sF] = out[sF] + Uchi;
|
coalescedWrite(out[sF], _out+Uchi,lane);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -261,6 +263,8 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
GridBase *FGrid=in.Grid();
|
GridBase *FGrid=in.Grid();
|
||||||
GridBase *UGrid=U.Grid();
|
GridBase *UGrid=U.Grid();
|
||||||
typedef StaggeredKernels<Impl> ThisKernel;
|
typedef StaggeredKernels<Impl> ThisKernel;
|
||||||
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
autoView( UUU_v , UUU, AcceleratorRead);
|
autoView( UUU_v , UUU, AcceleratorRead);
|
||||||
autoView( U_v , U, AcceleratorRead);
|
autoView( U_v , U, AcceleratorRead);
|
||||||
autoView( in_v , in, AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
@ -301,6 +305,8 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
GridBase *FGrid=in.Grid();
|
GridBase *FGrid=in.Grid();
|
||||||
GridBase *UGrid=U.Grid();
|
GridBase *UGrid=U.Grid();
|
||||||
typedef StaggeredKernels<Impl> ThisKernel;
|
typedef StaggeredKernels<Impl> ThisKernel;
|
||||||
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
autoView( UUU_v , U, AcceleratorRead);
|
autoView( UUU_v , U, AcceleratorRead);
|
||||||
autoView( U_v , U, AcceleratorRead);
|
autoView( U_v , U, AcceleratorRead);
|
||||||
autoView( in_v , in, AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
@ -92,20 +92,16 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
int lvol = _Umu.Grid()->lSites();
|
int lvol = _Umu.Grid()->lSites();
|
||||||
int DimRep = Impl::Dimension;
|
int DimRep = Impl::Dimension;
|
||||||
|
|
||||||
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
|
||||||
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
|
||||||
|
|
||||||
Coordinate lcoor;
|
|
||||||
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
|
||||||
|
|
||||||
{
|
{
|
||||||
autoView(CTv,CloverTerm,CpuRead);
|
autoView(CTv,CloverTerm,CpuRead);
|
||||||
autoView(CTIv,CloverTermInv,CpuWrite);
|
autoView(CTIv,CloverTermInv,CpuWrite);
|
||||||
for (int site = 0; site < lvol; site++) {
|
thread_for(site, lvol, {
|
||||||
|
Coordinate lcoor;
|
||||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||||
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||||
|
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||||
|
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
||||||
peekLocalSite(Qx, CTv, lcoor);
|
peekLocalSite(Qx, CTv, lcoor);
|
||||||
Qxinv = Zero();
|
|
||||||
//if (csw!=0){
|
//if (csw!=0){
|
||||||
for (int j = 0; j < Ns; j++)
|
for (int j = 0; j < Ns; j++)
|
||||||
for (int k = 0; k < Ns; k++)
|
for (int k = 0; k < Ns; k++)
|
||||||
@ -126,7 +122,7 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
||||||
// }
|
// }
|
||||||
pokeLocalSite(Qxinv, CTIv, lcoor);
|
pokeLocalSite(Qxinv, CTIv, lcoor);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Separate the even and odd parts
|
// Separate the even and odd parts
|
||||||
|
@ -397,6 +397,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
|
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls+=2;
|
||||||
conformable(in.Grid(), _grid); // verifies full grid
|
conformable(in.Grid(), _grid); // verifies full grid
|
||||||
conformable(in.Grid(), out.Grid());
|
conformable(in.Grid(), out.Grid());
|
||||||
|
|
||||||
@ -408,6 +409,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
|
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls++;
|
||||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
@ -420,6 +422,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
|
DhopCalls++;
|
||||||
conformable(in.Grid(), _cbgrid); // verifies half grid
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
conformable(in.Grid(), out.Grid()); // drops the cb check
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
|
@ -38,9 +38,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
// undefine everything related to kernels
|
// undefine everything related to kernels
|
||||||
#include <simd/Fujitsu_A64FX_undef.h>
|
#include <simd/Fujitsu_A64FX_undef.h>
|
||||||
|
|
||||||
// enable A64FX body
|
|
||||||
#define WILSONKERNELSASMBODYA64FX
|
|
||||||
//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
// If we are A64FX specialise the single precision routine
|
// If we are A64FX specialise the single precision routine
|
||||||
@ -63,119 +60,89 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
#define INTERIOR_AND_EXTERIOR
|
#define INTERIOR_AND_EXTERIOR
|
||||||
#undef INTERIOR
|
#undef INTERIOR
|
||||||
#undef EXTERIOR
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef INTERIOR_AND_EXTERIOR
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
#define INTERIOR
|
#define INTERIOR
|
||||||
#undef EXTERIOR
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef INTERIOR_AND_EXTERIOR
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
#undef INTERIOR
|
#undef INTERIOR
|
||||||
#define EXTERIOR
|
#define EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
@ -185,119 +152,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
|
|||||||
#define INTERIOR_AND_EXTERIOR
|
#define INTERIOR_AND_EXTERIOR
|
||||||
#undef INTERIOR
|
#undef INTERIOR
|
||||||
#undef EXTERIOR
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef INTERIOR_AND_EXTERIOR
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
#define INTERIOR
|
#define INTERIOR
|
||||||
#undef EXTERIOR
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef INTERIOR_AND_EXTERIOR
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
#undef INTERIOR
|
#undef INTERIOR
|
||||||
#define EXTERIOR
|
#define EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
// undefine
|
// undefine
|
||||||
@ -330,119 +267,89 @@ WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFie
|
|||||||
#define INTERIOR_AND_EXTERIOR
|
#define INTERIOR_AND_EXTERIOR
|
||||||
#undef INTERIOR
|
#undef INTERIOR
|
||||||
#undef EXTERIOR
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef INTERIOR_AND_EXTERIOR
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
#define INTERIOR
|
#define INTERIOR
|
||||||
#undef EXTERIOR
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef INTERIOR_AND_EXTERIOR
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
#undef INTERIOR
|
#undef INTERIOR
|
||||||
#define EXTERIOR
|
#define EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
// XYZT vectorised, dag Kernel, double
|
// XYZT vectorised, dag Kernel, double
|
||||||
@ -451,124 +358,93 @@ WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldV
|
|||||||
#define INTERIOR_AND_EXTERIOR
|
#define INTERIOR_AND_EXTERIOR
|
||||||
#undef INTERIOR
|
#undef INTERIOR
|
||||||
#undef EXTERIOR
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef INTERIOR_AND_EXTERIOR
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
#define INTERIOR
|
#define INTERIOR
|
||||||
#undef EXTERIOR
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef INTERIOR_AND_EXTERIOR
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
#undef INTERIOR
|
#undef INTERIOR
|
||||||
#define EXTERIOR
|
#define EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
template<> void
|
template<> void
|
||||||
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
#if defined (WILSONKERNELSASMBODYA64FX)
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
#else
|
|
||||||
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// undefs
|
// undefs
|
||||||
#undef WILSONKERNELSASMBODYA64FX
|
|
||||||
#include <simd/Fujitsu_A64FX_undef.h>
|
#include <simd/Fujitsu_A64FX_undef.h>
|
||||||
|
|
||||||
#endif //A64FXASM
|
#endif //A64FXASM
|
||||||
|
@ -25,6 +25,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
*************************************************************************************/
|
*************************************************************************************/
|
||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
|
|
||||||
|
// GCC 10 messes up SVE instruction scheduling using -O3, but
|
||||||
|
// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
|
||||||
|
// performance now is better than armclang 20.2
|
||||||
|
|
||||||
#ifdef KERNEL_DAG
|
#ifdef KERNEL_DAG
|
||||||
#define DIR0_PROJ XP_PROJ
|
#define DIR0_PROJ XP_PROJ
|
||||||
#define DIR1_PROJ YP_PROJ
|
#define DIR1_PROJ YP_PROJ
|
||||||
@ -97,7 +102,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
PROJ; \
|
PROJ; \
|
||||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||||
} else { \
|
} else { \
|
||||||
LOAD_CHI(base); \
|
LOAD_CHI(base); \
|
||||||
} \
|
} \
|
||||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||||
MULT_2SPIN_1(Dir); \
|
MULT_2SPIN_1(Dir); \
|
||||||
@ -110,6 +115,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
} \
|
} \
|
||||||
RECON; \
|
RECON; \
|
||||||
|
|
||||||
|
/*
|
||||||
|
NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
|
||||||
|
though I expected that it would improve on performance
|
||||||
|
*/
|
||||||
|
|
||||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
PREFETCH1_CHIMU(base); \
|
PREFETCH1_CHIMU(base); \
|
||||||
@ -126,73 +136,63 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
|
|
||||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
basep = st.GetPFInfo(nent,plocal); nent++; \
|
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
LOAD_CHIMU(base); \
|
LOAD_CHIMU(base); \
|
||||||
LOAD_TABLE(PERMUTE_DIR); \
|
LOAD_TABLE(PERMUTE_DIR); \
|
||||||
PROJ; \
|
PROJ; \
|
||||||
MAYBEPERM(PERMUTE_DIR,perm); \
|
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||||
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
||||||
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
if ( local || st.same_node[Dir] ) { \
|
||||||
if ( local || st.same_node[Dir] ) { \
|
MULT_2SPIN_1(Dir); \
|
||||||
MULT_2SPIN_1(Dir); \
|
MULT_2SPIN_2; \
|
||||||
PREFETCH_CHIMU(base); \
|
RECON; \
|
||||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
} \
|
||||||
MULT_2SPIN_2; \
|
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||||
if (s == 0) { \
|
PREFETCH_CHIMU(base); \
|
||||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
PREFETCH_CHIMU_L2(basep); \
|
||||||
} \
|
|
||||||
RECON; \
|
|
||||||
PREFETCH_CHIMU_L2(basep); \
|
|
||||||
} else { PREFETCH_CHIMU(base); } \
|
|
||||||
|
|
||||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
PREFETCH1_CHIMU(base); \
|
PREFETCH1_CHIMU(base); \
|
||||||
|
{ ZERO_PSI; } \
|
||||||
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||||
|
|
||||||
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Post comms kernel
|
// Post comms kernel
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifdef EXTERIOR
|
#ifdef EXTERIOR
|
||||||
|
|
||||||
|
|
||||||
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||||
LOAD_CHI(base); \
|
LOAD_CHI(base); \
|
||||||
MULT_2SPIN_1(Dir); \
|
MULT_2SPIN_1(Dir); \
|
||||||
PREFETCH_CHIMU(base); \
|
|
||||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
|
||||||
MULT_2SPIN_2; \
|
MULT_2SPIN_2; \
|
||||||
if (s == 0) { \
|
RECON; \
|
||||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
nmu++; \
|
||||||
} \
|
|
||||||
RECON; \
|
|
||||||
nmu++; \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
nmu=0; \
|
nmu=0; \
|
||||||
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
|
{ ZERO_PSI;} \
|
||||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
LOAD_CHI(base); \
|
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||||
|
LOAD_CHI(base); \
|
||||||
MULT_2SPIN_1(Dir); \
|
MULT_2SPIN_1(Dir); \
|
||||||
PREFETCH_CHIMU(base); \
|
|
||||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
|
||||||
MULT_2SPIN_2; \
|
MULT_2SPIN_2; \
|
||||||
if (s == 0) { \
|
RECON; \
|
||||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
nmu++; \
|
||||||
} \
|
|
||||||
RECON; \
|
|
||||||
nmu++; \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
{
|
{
|
||||||
int nmu;
|
int nmu;
|
||||||
int local,perm, ptype;
|
int local,perm, ptype;
|
||||||
@ -209,7 +209,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||||
// int sUn=lo.Reorder(ssn);
|
// int sUn=lo.Reorder(ssn);
|
||||||
int sUn=ssn;
|
int sUn=ssn;
|
||||||
LOCK_GAUGE(0);
|
|
||||||
#else
|
#else
|
||||||
int sU =ssU;
|
int sU =ssU;
|
||||||
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||||
@ -295,6 +294,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// DC ZVA test
|
||||||
|
// { uint64_t basestore = (uint64_t)&out[ss];
|
||||||
|
// PREFETCH_RESULT_L2_STORE(basestore); }
|
||||||
|
|
||||||
|
|
||||||
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
|
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
@ -308,6 +312,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// DC ZVA test
|
||||||
|
//{ uint64_t basestore = (uint64_t)&out[ss];
|
||||||
|
// PREFETCH_RESULT_L2_STORE(basestore); }
|
||||||
|
|
||||||
|
|
||||||
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
|
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
@ -321,6 +330,11 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
std::cout << "----------------------------------------------------" << std::endl;
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// DC ZVA test
|
||||||
|
//{ uint64_t basestore = (uint64_t)&out[ss];
|
||||||
|
// PREFETCH_RESULT_L2_STORE(basestore); }
|
||||||
|
|
||||||
|
|
||||||
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
|
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
@ -341,6 +355,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
base = (uint64_t) &out[ss];
|
base = (uint64_t) &out[ss];
|
||||||
basep= st.GetPFInfo(nent,plocal); ent++;
|
basep= st.GetPFInfo(nent,plocal); ent++;
|
||||||
basep = (uint64_t) &out[ssn];
|
basep = (uint64_t) &out[ssn];
|
||||||
|
//PREFETCH_RESULT_L1_STORE(base);
|
||||||
RESULT(base,basep);
|
RESULT(base,basep);
|
||||||
|
|
||||||
#ifdef SHOW
|
#ifdef SHOW
|
||||||
|
@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_RESULT_EXT(ss,F)
|
HAND_RESULT_EXT(ss,F)
|
||||||
|
|
||||||
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
nmu = 0; \
|
nmu = 0; \
|
||||||
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -76,7 +76,24 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define REGISTER
|
#define REGISTER
|
||||||
|
|
||||||
#define LOAD_CHIMU \
|
#ifdef GRID_SIMT
|
||||||
|
#define LOAD_CHIMU(ptype) \
|
||||||
|
{const SiteSpinor & ref (in[offset]); \
|
||||||
|
Chimu_00=coalescedReadPermute<ptype>(ref()(0)(0),perm,lane); \
|
||||||
|
Chimu_01=coalescedReadPermute<ptype>(ref()(0)(1),perm,lane); \
|
||||||
|
Chimu_02=coalescedReadPermute<ptype>(ref()(0)(2),perm,lane); \
|
||||||
|
Chimu_10=coalescedReadPermute<ptype>(ref()(1)(0),perm,lane); \
|
||||||
|
Chimu_11=coalescedReadPermute<ptype>(ref()(1)(1),perm,lane); \
|
||||||
|
Chimu_12=coalescedReadPermute<ptype>(ref()(1)(2),perm,lane); \
|
||||||
|
Chimu_20=coalescedReadPermute<ptype>(ref()(2)(0),perm,lane); \
|
||||||
|
Chimu_21=coalescedReadPermute<ptype>(ref()(2)(1),perm,lane); \
|
||||||
|
Chimu_22=coalescedReadPermute<ptype>(ref()(2)(2),perm,lane); \
|
||||||
|
Chimu_30=coalescedReadPermute<ptype>(ref()(3)(0),perm,lane); \
|
||||||
|
Chimu_31=coalescedReadPermute<ptype>(ref()(3)(1),perm,lane); \
|
||||||
|
Chimu_32=coalescedReadPermute<ptype>(ref()(3)(2),perm,lane); }
|
||||||
|
#define PERMUTE_DIR(dir) ;
|
||||||
|
#else
|
||||||
|
#define LOAD_CHIMU(ptype) \
|
||||||
{const SiteSpinor & ref (in[offset]); \
|
{const SiteSpinor & ref (in[offset]); \
|
||||||
Chimu_00=ref()(0)(0);\
|
Chimu_00=ref()(0)(0);\
|
||||||
Chimu_01=ref()(0)(1);\
|
Chimu_01=ref()(0)(1);\
|
||||||
@ -91,55 +108,55 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
Chimu_31=ref()(3)(1);\
|
Chimu_31=ref()(3)(1);\
|
||||||
Chimu_32=ref()(3)(2);}
|
Chimu_32=ref()(3)(2);}
|
||||||
|
|
||||||
#define LOAD_CHI\
|
|
||||||
{const SiteHalfSpinor &ref(buf[offset]); \
|
|
||||||
Chi_00 = ref()(0)(0);\
|
|
||||||
Chi_01 = ref()(0)(1);\
|
|
||||||
Chi_02 = ref()(0)(2);\
|
|
||||||
Chi_10 = ref()(1)(0);\
|
|
||||||
Chi_11 = ref()(1)(1);\
|
|
||||||
Chi_12 = ref()(1)(2);}
|
|
||||||
|
|
||||||
// To splat or not to splat depends on the implementation
|
|
||||||
#define MULT_2SPIN(A)\
|
|
||||||
{auto & ref(U[sU](A)); \
|
|
||||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
|
||||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
|
||||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
|
||||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
|
||||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
|
||||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
|
||||||
UChi_00 = U_00*Chi_00;\
|
|
||||||
UChi_10 = U_00*Chi_10;\
|
|
||||||
UChi_01 = U_10*Chi_00;\
|
|
||||||
UChi_11 = U_10*Chi_10;\
|
|
||||||
UChi_02 = U_20*Chi_00;\
|
|
||||||
UChi_12 = U_20*Chi_10;\
|
|
||||||
UChi_00+= U_01*Chi_01;\
|
|
||||||
UChi_10+= U_01*Chi_11;\
|
|
||||||
UChi_01+= U_11*Chi_01;\
|
|
||||||
UChi_11+= U_11*Chi_11;\
|
|
||||||
UChi_02+= U_21*Chi_01;\
|
|
||||||
UChi_12+= U_21*Chi_11;\
|
|
||||||
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
|
||||||
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
|
||||||
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
|
||||||
UChi_00+= U_00*Chi_02;\
|
|
||||||
UChi_10+= U_00*Chi_12;\
|
|
||||||
UChi_01+= U_10*Chi_02;\
|
|
||||||
UChi_11+= U_10*Chi_12;\
|
|
||||||
UChi_02+= U_20*Chi_02;\
|
|
||||||
UChi_12+= U_20*Chi_12;}
|
|
||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_00,Chi_00);\
|
permute##dir(Chi_00,Chi_00); \
|
||||||
permute##dir(Chi_01,Chi_01);\
|
permute##dir(Chi_01,Chi_01);\
|
||||||
permute##dir(Chi_02,Chi_02);\
|
permute##dir(Chi_02,Chi_02);\
|
||||||
permute##dir(Chi_10,Chi_10);\
|
permute##dir(Chi_10,Chi_10); \
|
||||||
permute##dir(Chi_11,Chi_11);\
|
permute##dir(Chi_11,Chi_11);\
|
||||||
permute##dir(Chi_12,Chi_12);
|
permute##dir(Chi_12,Chi_12);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MULT_2SPIN(A)\
|
||||||
|
{auto & ref(U[sU](A)); \
|
||||||
|
U_00=coalescedRead(ref()(0,0),lane); \
|
||||||
|
U_10=coalescedRead(ref()(1,0),lane); \
|
||||||
|
U_20=coalescedRead(ref()(2,0),lane); \
|
||||||
|
U_01=coalescedRead(ref()(0,1),lane); \
|
||||||
|
U_11=coalescedRead(ref()(1,1),lane); \
|
||||||
|
U_21=coalescedRead(ref()(2,1),lane); \
|
||||||
|
UChi_00 = U_00*Chi_00; \
|
||||||
|
UChi_10 = U_00*Chi_10; \
|
||||||
|
UChi_01 = U_10*Chi_00; \
|
||||||
|
UChi_11 = U_10*Chi_10; \
|
||||||
|
UChi_02 = U_20*Chi_00; \
|
||||||
|
UChi_12 = U_20*Chi_10; \
|
||||||
|
UChi_00+= U_01*Chi_01; \
|
||||||
|
UChi_10+= U_01*Chi_11; \
|
||||||
|
UChi_01+= U_11*Chi_01; \
|
||||||
|
UChi_11+= U_11*Chi_11; \
|
||||||
|
UChi_02+= U_21*Chi_01; \
|
||||||
|
UChi_12+= U_21*Chi_11; \
|
||||||
|
U_00=coalescedRead(ref()(0,2),lane); \
|
||||||
|
U_10=coalescedRead(ref()(1,2),lane); \
|
||||||
|
U_20=coalescedRead(ref()(2,2),lane); \
|
||||||
|
UChi_00+= U_00*Chi_02; \
|
||||||
|
UChi_10+= U_00*Chi_12; \
|
||||||
|
UChi_01+= U_10*Chi_02; \
|
||||||
|
UChi_11+= U_10*Chi_12; \
|
||||||
|
UChi_02+= U_20*Chi_02; \
|
||||||
|
UChi_12+= U_20*Chi_12;}
|
||||||
|
|
||||||
|
#define LOAD_CHI \
|
||||||
|
{const SiteHalfSpinor &ref(buf[offset]); \
|
||||||
|
Chi_00 = coalescedRead(ref()(0)(0),lane); \
|
||||||
|
Chi_01 = coalescedRead(ref()(0)(1),lane); \
|
||||||
|
Chi_02 = coalescedRead(ref()(0)(2),lane); \
|
||||||
|
Chi_10 = coalescedRead(ref()(1)(0),lane); \
|
||||||
|
Chi_11 = coalescedRead(ref()(1)(1),lane); \
|
||||||
|
Chi_12 = coalescedRead(ref()(1)(2),lane);}
|
||||||
|
|
||||||
// hspin(0)=fspin(0)+timesI(fspin(3));
|
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||||
// hspin(1)=fspin(1)+timesI(fspin(2));
|
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||||
#define XP_PROJ \
|
#define XP_PROJ \
|
||||||
@ -353,13 +370,13 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
result_31-= UChi_11; \
|
result_31-= UChi_11; \
|
||||||
result_32-= UChi_12;
|
result_32-= UChi_12;
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
|
#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) \
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
offset = SE->_offset; \
|
offset = SE->_offset; \
|
||||||
local = SE->_is_local; \
|
local = SE->_is_local; \
|
||||||
perm = SE->_permute; \
|
perm = SE->_permute; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
LOAD_CHIMU; \
|
LOAD_CHIMU(PERM); \
|
||||||
PROJ; \
|
PROJ; \
|
||||||
if ( perm) { \
|
if ( perm) { \
|
||||||
PERMUTE_DIR(PERM); \
|
PERMUTE_DIR(PERM); \
|
||||||
@ -367,6 +384,37 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
} else { \
|
} else { \
|
||||||
LOAD_CHI; \
|
LOAD_CHI; \
|
||||||
} \
|
} \
|
||||||
|
acceleratorSynchronise(); \
|
||||||
|
MULT_2SPIN(DIR); \
|
||||||
|
RECON;
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
|
||||||
|
SE=&st_p[DIR+8*ss]; \
|
||||||
|
ptype=st_perm[DIR]; \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHIMU(PERM); \
|
||||||
|
PROJ; \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(PERM); \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
|
LOAD_CHI; \
|
||||||
|
} \
|
||||||
|
acceleratorSynchronise(); \
|
||||||
|
MULT_2SPIN(DIR); \
|
||||||
|
RECON;
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON) \
|
||||||
|
SE=&st_p[DIR+8*ss]; \
|
||||||
|
ptype=st_perm[DIR]; \
|
||||||
|
/*SE=st.GetEntry(ptype,DIR,ss);*/ \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
LOAD_CHIMU(PERM); \
|
||||||
|
PROJ; \
|
||||||
MULT_2SPIN(DIR); \
|
MULT_2SPIN(DIR); \
|
||||||
RECON;
|
RECON;
|
||||||
|
|
||||||
@ -376,7 +424,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
local = SE->_is_local; \
|
local = SE->_is_local; \
|
||||||
perm = SE->_permute; \
|
perm = SE->_permute; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
LOAD_CHIMU; \
|
LOAD_CHIMU(PERM); \
|
||||||
PROJ; \
|
PROJ; \
|
||||||
if ( perm) { \
|
if ( perm) { \
|
||||||
PERMUTE_DIR(PERM); \
|
PERMUTE_DIR(PERM); \
|
||||||
@ -384,10 +432,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
} else if ( st.same_node[DIR] ) { \
|
} else if ( st.same_node[DIR] ) { \
|
||||||
LOAD_CHI; \
|
LOAD_CHI; \
|
||||||
} \
|
} \
|
||||||
|
acceleratorSynchronise(); \
|
||||||
if (local || st.same_node[DIR] ) { \
|
if (local || st.same_node[DIR] ) { \
|
||||||
MULT_2SPIN(DIR); \
|
MULT_2SPIN(DIR); \
|
||||||
RECON; \
|
RECON; \
|
||||||
}
|
} \
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
|
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
@ -397,44 +447,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
MULT_2SPIN(DIR); \
|
MULT_2SPIN(DIR); \
|
||||||
RECON; \
|
RECON; \
|
||||||
nmu++; \
|
nmu++; \
|
||||||
}
|
} \
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
#define HAND_RESULT(ss) \
|
#define HAND_RESULT(ss) \
|
||||||
{ \
|
{ \
|
||||||
SiteSpinor & ref (out[ss]); \
|
SiteSpinor & ref (out[ss]); \
|
||||||
vstream(ref()(0)(0),result_00); \
|
coalescedWrite(ref()(0)(0),result_00,lane); \
|
||||||
vstream(ref()(0)(1),result_01); \
|
coalescedWrite(ref()(0)(1),result_01,lane); \
|
||||||
vstream(ref()(0)(2),result_02); \
|
coalescedWrite(ref()(0)(2),result_02,lane); \
|
||||||
vstream(ref()(1)(0),result_10); \
|
coalescedWrite(ref()(1)(0),result_10,lane); \
|
||||||
vstream(ref()(1)(1),result_11); \
|
coalescedWrite(ref()(1)(1),result_11,lane); \
|
||||||
vstream(ref()(1)(2),result_12); \
|
coalescedWrite(ref()(1)(2),result_12,lane); \
|
||||||
vstream(ref()(2)(0),result_20); \
|
coalescedWrite(ref()(2)(0),result_20,lane); \
|
||||||
vstream(ref()(2)(1),result_21); \
|
coalescedWrite(ref()(2)(1),result_21,lane); \
|
||||||
vstream(ref()(2)(2),result_22); \
|
coalescedWrite(ref()(2)(2),result_22,lane); \
|
||||||
vstream(ref()(3)(0),result_30); \
|
coalescedWrite(ref()(3)(0),result_30,lane); \
|
||||||
vstream(ref()(3)(1),result_31); \
|
coalescedWrite(ref()(3)(1),result_31,lane); \
|
||||||
vstream(ref()(3)(2),result_32); \
|
coalescedWrite(ref()(3)(2),result_32,lane); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HAND_RESULT_EXT(ss) \
|
#define HAND_RESULT_EXT(ss) \
|
||||||
if (nmu){ \
|
{ \
|
||||||
SiteSpinor & ref (out[ss]); \
|
SiteSpinor & ref (out[ss]); \
|
||||||
ref()(0)(0)+=result_00; \
|
coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00,lane); \
|
||||||
ref()(0)(1)+=result_01; \
|
coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01,lane); \
|
||||||
ref()(0)(2)+=result_02; \
|
coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02,lane); \
|
||||||
ref()(1)(0)+=result_10; \
|
coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10,lane); \
|
||||||
ref()(1)(1)+=result_11; \
|
coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11,lane); \
|
||||||
ref()(1)(2)+=result_12; \
|
coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12,lane); \
|
||||||
ref()(2)(0)+=result_20; \
|
coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20,lane); \
|
||||||
ref()(2)(1)+=result_21; \
|
coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21,lane); \
|
||||||
ref()(2)(2)+=result_22; \
|
coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22,lane); \
|
||||||
ref()(3)(0)+=result_30; \
|
coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30,lane); \
|
||||||
ref()(3)(1)+=result_31; \
|
coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31,lane); \
|
||||||
ref()(3)(2)+=result_32; \
|
coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32,lane); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define HAND_DECLARATIONS(Simd) \
|
||||||
#define HAND_DECLARATIONS(a) \
|
|
||||||
Simd result_00; \
|
Simd result_00; \
|
||||||
Simd result_01; \
|
Simd result_01; \
|
||||||
Simd result_02; \
|
Simd result_02; \
|
||||||
@ -466,19 +516,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
Simd U_11; \
|
Simd U_11; \
|
||||||
Simd U_21;
|
Simd U_21;
|
||||||
|
|
||||||
#define ZERO_RESULT \
|
#define ZERO_RESULT \
|
||||||
result_00=Zero(); \
|
zeroit(result_00); \
|
||||||
result_01=Zero(); \
|
zeroit(result_01); \
|
||||||
result_02=Zero(); \
|
zeroit(result_02); \
|
||||||
result_10=Zero(); \
|
zeroit(result_10); \
|
||||||
result_11=Zero(); \
|
zeroit(result_11); \
|
||||||
result_12=Zero(); \
|
zeroit(result_12); \
|
||||||
result_20=Zero(); \
|
zeroit(result_20); \
|
||||||
result_21=Zero(); \
|
zeroit(result_21); \
|
||||||
result_22=Zero(); \
|
zeroit(result_22); \
|
||||||
result_30=Zero(); \
|
zeroit(result_30); \
|
||||||
result_31=Zero(); \
|
zeroit(result_31); \
|
||||||
result_32=Zero();
|
zeroit(result_32);
|
||||||
|
|
||||||
#define Chimu_00 Chi_00
|
#define Chimu_00 Chi_00
|
||||||
#define Chimu_01 Chi_01
|
#define Chimu_01 Chi_01
|
||||||
@ -495,15 +545,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class Impl> void
|
|
||||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
#ifdef SYCL_HACK
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
template<class Impl> accelerator_inline void
|
||||||
|
WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const SiteSpinor *in, SiteSpinor *out)
|
||||||
{
|
{
|
||||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
typedef iSinglet<Simd> vCplx;
|
||||||
|
// typedef decltype( coalescedRead( vCplx()()() )) Simt;
|
||||||
|
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(Simt);
|
||||||
|
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
|
||||||
|
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||||
|
HAND_RESULT(ss);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template<class Impl> accelerator_inline void
|
||||||
|
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
{
|
||||||
|
auto st_p = st._entries_p;
|
||||||
|
auto st_perm = st._permute_type;
|
||||||
|
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
|
||||||
|
|
||||||
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(Simt);
|
||||||
|
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
@ -519,14 +607,20 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
auto st_p = st._entries_p;
|
||||||
|
auto st_perm = st._permute_type;
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(Simt);
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
@ -542,15 +636,21 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> accelerator_inline void
|
||||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
auto st_p = st._entries_p;
|
||||||
|
auto st_perm = st._permute_type;
|
||||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(Simt);
|
||||||
|
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
@ -566,14 +666,20 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
auto st_p = st._entries_p;
|
||||||
|
auto st_perm = st._permute_type;
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(Simt);
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
@ -589,15 +695,21 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> accelerator_inline void
|
||||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
auto st_p = st._entries_p;
|
||||||
|
auto st_perm = st._permute_type;
|
||||||
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(Simt);
|
||||||
|
|
||||||
int offset, ptype;
|
int offset, ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
@ -614,14 +726,20 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
|
|||||||
HAND_RESULT_EXT(ss);
|
HAND_RESULT_EXT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
auto st_p = st._entries_p;
|
||||||
|
auto st_perm = st._permute_type;
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
|
||||||
|
|
||||||
HAND_DECLARATIONS(ignore);
|
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||||
|
const int lane=acceleratorSIMTlane(Nsimd);
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(Simt);
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset, ptype;
|
int offset, ptype;
|
||||||
@ -682,3 +800,4 @@ NAMESPACE_END(Grid);
|
|||||||
#undef HAND_RESULT
|
#undef HAND_RESULT
|
||||||
#undef HAND_RESULT_INT
|
#undef HAND_RESULT_INT
|
||||||
#undef HAND_RESULT_EXT
|
#undef HAND_RESULT_EXT
|
||||||
|
#undef HAND_DECLARATIONS
|
||||||
|
@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// All legs kernels ; comms then compute
|
// All legs kernels ; comms then compute
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
|||||||
coalescedWrite(out[sF],result,lane);
|
coalescedWrite(out[sF],result,lane);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Interior kernels
|
// Interior kernels
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
|||||||
coalescedWrite(out[sF], result,lane);
|
coalescedWrite(out[sF], result,lane);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Exterior kernels
|
// Exterior kernels
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
|||||||
};
|
};
|
||||||
|
|
||||||
#define DhopDirMacro(Dir,spProj,spRecon) \
|
#define DhopDirMacro(Dir,spProj,spRecon) \
|
||||||
template <class Impl> \
|
template <class Impl> accelerator_inline \
|
||||||
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
||||||
{ \
|
{ \
|
||||||
@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
|
|||||||
DhopDirMacro(Zm,spProjZm,spReconZm);
|
DhopDirMacro(Zm,spProjZm,spReconZm);
|
||||||
DhopDirMacro(Tm,spProjTm,spReconTm);
|
DhopDirMacro(Tm,spProjTm,spReconTm);
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||||
{
|
{
|
||||||
@ -416,7 +416,21 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
|||||||
#undef LoopBody
|
#undef LoopBody
|
||||||
}
|
}
|
||||||
|
|
||||||
#define KERNEL_CALLNB(A) \
|
#define KERNEL_CALL_TMP(A) \
|
||||||
|
const uint64_t NN = Nsite*Ls; \
|
||||||
|
auto U_p = & U_v[0]; \
|
||||||
|
auto in_p = & in_v[0]; \
|
||||||
|
auto out_p = & out_v[0]; \
|
||||||
|
auto st_p = st_v._entries_p; \
|
||||||
|
auto st_perm = st_v._permute_type; \
|
||||||
|
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
||||||
|
int sF = ss; \
|
||||||
|
int sU = ss/Ls; \
|
||||||
|
WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
|
||||||
|
}); \
|
||||||
|
accelerator_barrier();
|
||||||
|
|
||||||
|
#define KERNEL_CALLNB(A) \
|
||||||
const uint64_t NN = Nsite*Ls; \
|
const uint64_t NN = Nsite*Ls; \
|
||||||
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
||||||
int sF = ss; \
|
int sF = ss; \
|
||||||
@ -445,20 +459,24 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
||||||
#ifndef GRID_CUDA
|
#ifdef SYCL_HACK
|
||||||
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
|
||||||
|
#else
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
||||||
|
#endif
|
||||||
|
#ifndef GRID_CUDA
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( interior ) {
|
} else if( interior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -476,20 +494,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( interior ) {
|
} else if( interior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
|
||||||
#ifndef GRID_CUDA
|
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
|
||||||
|
#ifndef GRID_CUDA
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
38
Grid/qcd/action/gauge/Gauge.cc
Normal file
38
Grid/qcd/action/gauge/Gauge.cc
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/gauge/Gauge.cc
|
||||||
|
|
||||||
|
Copyright (C) 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
std::vector<int> ConjugateGaugeImplBase::_conjDirs;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
@ -96,7 +96,7 @@ public:
|
|||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
// Move these to another class
|
// Move these to another class
|
||||||
// HMC auxiliary functions
|
// HMC auxiliary functions
|
||||||
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
|
static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG)
|
||||||
{
|
{
|
||||||
// Zbigniew Srocinsky thesis:
|
// Zbigniew Srocinsky thesis:
|
||||||
//
|
//
|
||||||
@ -154,6 +154,10 @@ public:
|
|||||||
return Hsum.real();
|
return Hsum.real();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void Project(Field &U) {
|
||||||
|
ProjectSUn(U);
|
||||||
|
}
|
||||||
|
|
||||||
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
SU<Nc>::HotConfiguration(pRNG, U);
|
SU<Nc>::HotConfiguration(pRNG, U);
|
||||||
}
|
}
|
||||||
|
@ -59,14 +59,14 @@ public:
|
|||||||
}
|
}
|
||||||
static inline GaugeLinkField
|
static inline GaugeLinkField
|
||||||
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
|
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
|
||||||
return Cshift(adj(Link), mu, -1);
|
return PeriodicBC::CovShiftIdentityBackward(Link, mu);
|
||||||
}
|
}
|
||||||
static inline GaugeLinkField
|
static inline GaugeLinkField
|
||||||
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
|
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
|
||||||
return Link;
|
return PeriodicBC::CovShiftIdentityForward(Link,mu);
|
||||||
}
|
}
|
||||||
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
|
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
|
||||||
return Cshift(Link, mu, 1);
|
return PeriodicBC::ShiftStaple(Link,mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline bool isPeriodicGaugeField(void) { return true; }
|
static inline bool isPeriodicGaugeField(void) { return true; }
|
||||||
@ -74,7 +74,13 @@ public:
|
|||||||
|
|
||||||
// Composition with smeared link, bc's etc.. probably need multiple inheritance
|
// Composition with smeared link, bc's etc.. probably need multiple inheritance
|
||||||
// Variable precision "S" and variable Nc
|
// Variable precision "S" and variable Nc
|
||||||
template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes {
|
class ConjugateGaugeImplBase {
|
||||||
|
protected:
|
||||||
|
static std::vector<int> _conjDirs;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes, ConjugateGaugeImplBase {
|
||||||
|
private:
|
||||||
public:
|
public:
|
||||||
INHERIT_GIMPL_TYPES(GimplTypes);
|
INHERIT_GIMPL_TYPES(GimplTypes);
|
||||||
|
|
||||||
@ -84,47 +90,56 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class covariant>
|
template <class covariant>
|
||||||
static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
|
static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
|
||||||
const Lattice<covariant> &field) {
|
const Lattice<covariant> &field)
|
||||||
return ConjugateBC::CovShiftForward(Link, mu, field);
|
{
|
||||||
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
|
return ConjugateBC::CovShiftForward(Link, mu, field);
|
||||||
|
else
|
||||||
|
return PeriodicBC::CovShiftForward(Link, mu, field);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class covariant>
|
template <class covariant>
|
||||||
static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
|
static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
|
||||||
const Lattice<covariant> &field) {
|
const Lattice<covariant> &field)
|
||||||
return ConjugateBC::CovShiftBackward(Link, mu, field);
|
{
|
||||||
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
|
return ConjugateBC::CovShiftBackward(Link, mu, field);
|
||||||
|
else
|
||||||
|
return PeriodicBC::CovShiftBackward(Link, mu, field);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline GaugeLinkField
|
static inline GaugeLinkField
|
||||||
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
|
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
|
||||||
GridBase *grid = Link.Grid();
|
{
|
||||||
int Lmu = grid->GlobalDimensions()[mu] - 1;
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
Lattice<iScalar<vInteger>> coor(grid);
|
return ConjugateBC::CovShiftIdentityBackward(Link, mu);
|
||||||
LatticeCoordinate(coor, mu);
|
else
|
||||||
|
return PeriodicBC::CovShiftIdentityBackward(Link, mu);
|
||||||
GaugeLinkField tmp(grid);
|
|
||||||
tmp = adj(Link);
|
|
||||||
tmp = where(coor == Lmu, conjugate(tmp), tmp);
|
|
||||||
return Cshift(tmp, mu, -1); // moves towards positive mu
|
|
||||||
}
|
}
|
||||||
static inline GaugeLinkField
|
static inline GaugeLinkField
|
||||||
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
|
CovShiftIdentityForward(const GaugeLinkField &Link, int mu)
|
||||||
return Link;
|
{
|
||||||
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
|
return ConjugateBC::CovShiftIdentityForward(Link,mu);
|
||||||
|
else
|
||||||
|
return PeriodicBC::CovShiftIdentityForward(Link,mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
|
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
|
||||||
GridBase *grid = Link.Grid();
|
{
|
||||||
int Lmu = grid->GlobalDimensions()[mu] - 1;
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
Lattice<iScalar<vInteger>> coor(grid);
|
return ConjugateBC::ShiftStaple(Link,mu);
|
||||||
LatticeCoordinate(coor, mu);
|
else
|
||||||
|
return PeriodicBC::ShiftStaple(Link,mu);
|
||||||
GaugeLinkField tmp(grid);
|
|
||||||
tmp = Cshift(Link, mu, 1);
|
|
||||||
tmp = where(coor == Lmu, conjugate(tmp), tmp);
|
|
||||||
return tmp;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
|
||||||
|
static inline std::vector<int> getDirections(void) { return _conjDirs; }
|
||||||
static inline bool isPeriodicGaugeField(void) { return false; }
|
static inline bool isPeriodicGaugeField(void) { return false; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -49,7 +49,7 @@ public:
|
|||||||
|
|
||||||
virtual std::string action_name(){return "PlaqPlusRectangleAction";}
|
virtual std::string action_name(){return "PlaqPlusRectangleAction";}
|
||||||
|
|
||||||
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {}; // noop as no pseudoferms
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {}; // noop as no pseudoferms
|
||||||
|
|
||||||
virtual std::string LogParameters(){
|
virtual std::string LogParameters(){
|
||||||
std::stringstream sstream;
|
std::stringstream sstream;
|
||||||
|
@ -54,8 +54,7 @@ public:
|
|||||||
return sstream.str();
|
return sstream.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void refresh(const GaugeField &U,
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){}; // noop as no pseudoferms
|
||||||
GridParallelRNG &pRNG){}; // noop as no pseudoferms
|
|
||||||
|
|
||||||
virtual RealD S(const GaugeField &U) {
|
virtual RealD S(const GaugeField &U) {
|
||||||
RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
|
RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
|
||||||
|
@ -124,7 +124,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
//
|
//
|
||||||
// As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
|
// As a check of rational require \Phi^dag M_{EOFA} \Phi == eta^dag M^-1/2^dag M M^-1/2 eta = eta^dag eta
|
||||||
//
|
//
|
||||||
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG)
|
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
|
||||||
{
|
{
|
||||||
Lop.ImportGauge(U);
|
Lop.ImportGauge(U);
|
||||||
Rop.ImportGauge(U);
|
Rop.ImportGauge(U);
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -43,8 +42,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
//
|
//
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
class OneFlavourEvenOddRationalPseudoFermionAction
|
class OneFlavourEvenOddRationalPseudoFermionAction : public Action<typename Impl::GaugeField> {
|
||||||
: public Action<typename Impl::GaugeField> {
|
|
||||||
public:
|
public:
|
||||||
INHERIT_IMPL_TYPES(Impl);
|
INHERIT_IMPL_TYPES(Impl);
|
||||||
|
|
||||||
@ -103,7 +101,7 @@ public:
|
|||||||
return sstream.str();
|
return sstream.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||||
// P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
|
// P(phi) = e^{- phi^dag (MpcdagMpc)^-1/2 phi}
|
||||||
// = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
|
// = e^{- phi^dag (MpcdagMpc)^-1/4 (MpcdagMpc)^-1/4 phi}
|
||||||
// Phi = MpcdagMpc^{1/4} eta
|
// Phi = MpcdagMpc^{1/4} eta
|
||||||
@ -156,7 +154,10 @@ public:
|
|||||||
|
|
||||||
msCG(Mpc, PhiOdd, Y);
|
msCG(Mpc, PhiOdd, Y);
|
||||||
|
|
||||||
if ( (rand()%param.BoundsCheckFreq)==0 ) {
|
auto grid = FermOp.FermionGrid();
|
||||||
|
auto r=rand();
|
||||||
|
grid->Broadcast(0,r);
|
||||||
|
if ( (r%param.BoundsCheckFreq)==0 ) {
|
||||||
FermionField gauss(FermOp.FermionRedBlackGrid());
|
FermionField gauss(FermOp.FermionRedBlackGrid());
|
||||||
gauss = PhiOdd;
|
gauss = PhiOdd;
|
||||||
HighBoundCheck(Mpc,gauss,param.hi);
|
HighBoundCheck(Mpc,gauss,param.hi);
|
||||||
|
@ -101,7 +101,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
|
||||||
|
|
||||||
// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
|
// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
|
||||||
//
|
//
|
||||||
@ -170,7 +170,10 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
msCG_M(MdagM,X,Y);
|
msCG_M(MdagM,X,Y);
|
||||||
|
|
||||||
// Randomly apply rational bounds checks.
|
// Randomly apply rational bounds checks.
|
||||||
if ( (rand()%param.BoundsCheckFreq)==0 ) {
|
auto grid = NumOp.FermionGrid();
|
||||||
|
auto r=rand();
|
||||||
|
grid->Broadcast(0,r);
|
||||||
|
if ( (r%param.BoundsCheckFreq)==0 ) {
|
||||||
FermionField gauss(NumOp.FermionRedBlackGrid());
|
FermionField gauss(NumOp.FermionRedBlackGrid());
|
||||||
gauss = PhiOdd;
|
gauss = PhiOdd;
|
||||||
HighBoundCheck(MdagM,gauss,param.hi);
|
HighBoundCheck(MdagM,gauss,param.hi);
|
||||||
|
@ -98,7 +98,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
|
||||||
|
|
||||||
|
|
||||||
// P(phi) = e^{- phi^dag (MdagM)^-1/2 phi}
|
// P(phi) = e^{- phi^dag (MdagM)^-1/2 phi}
|
||||||
@ -142,7 +142,10 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
msCG(MdagMOp,Phi,Y);
|
msCG(MdagMOp,Phi,Y);
|
||||||
|
|
||||||
if ( (rand()%param.BoundsCheckFreq)==0 ) {
|
auto grid = FermOp.FermionGrid();
|
||||||
|
auto r=rand();
|
||||||
|
grid->Broadcast(0,r);
|
||||||
|
if ( (r%param.BoundsCheckFreq)==0 ) {
|
||||||
FermionField gauss(FermOp.FermionGrid());
|
FermionField gauss(FermOp.FermionGrid());
|
||||||
gauss = Phi;
|
gauss = Phi;
|
||||||
HighBoundCheck(MdagMOp,gauss,param.hi);
|
HighBoundCheck(MdagMOp,gauss,param.hi);
|
||||||
|
@ -95,7 +95,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
|
||||||
|
|
||||||
// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
|
// S_f = chi^dag* P(V^dag*V)/Q(V^dag*V)* N(M^dag*M)/D(M^dag*M)* P(V^dag*V)/Q(V^dag*V)* chi
|
||||||
//
|
//
|
||||||
@ -156,7 +156,10 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
msCG_M(MdagM,X,Y);
|
msCG_M(MdagM,X,Y);
|
||||||
|
|
||||||
// Randomly apply rational bounds checks.
|
// Randomly apply rational bounds checks.
|
||||||
if ( (rand()%param.BoundsCheckFreq)==0 ) {
|
auto grid = NumOp.FermionGrid();
|
||||||
|
auto r=rand();
|
||||||
|
grid->Broadcast(0,r);
|
||||||
|
if ( (r%param.BoundsCheckFreq)==0 ) {
|
||||||
FermionField gauss(NumOp.FermionGrid());
|
FermionField gauss(NumOp.FermionGrid());
|
||||||
gauss = Phi;
|
gauss = Phi;
|
||||||
HighBoundCheck(MdagM,gauss,param.hi);
|
HighBoundCheck(MdagM,gauss,param.hi);
|
||||||
|
@ -73,7 +73,7 @@ public:
|
|||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Push the gauge field in to the dops. Assume any BC's and smearing already applied
|
// Push the gauge field in to the dops. Assume any BC's and smearing already applied
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
virtual void refresh(const GaugeField &U, GridParallelRNG &pRNG) {
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||||
// P(phi) = e^{- phi^dag (MdagM)^-1 phi}
|
// P(phi) = e^{- phi^dag (MdagM)^-1 phi}
|
||||||
// Phi = Mdag eta
|
// Phi = Mdag eta
|
||||||
// P(eta) = e^{- eta^dag eta}
|
// P(eta) = e^{- eta^dag eta}
|
||||||
|
@ -77,7 +77,7 @@ public:
|
|||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Push the gauge field in to the dops. Assume any BC's and smearing already applied
|
// Push the gauge field in to the dops. Assume any BC's and smearing already applied
|
||||||
//////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
|
||||||
|
|
||||||
// P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
|
// P(phi) = e^{- phi^dag (MpcdagMpc)^-1 phi}
|
||||||
// Phi = McpDag eta
|
// Phi = McpDag eta
|
||||||
|
@ -84,7 +84,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
|
||||||
|
|
||||||
// P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
|
// P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
|
||||||
//
|
//
|
||||||
|
@ -64,7 +64,7 @@ public:
|
|||||||
return sstream.str();
|
return sstream.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
|
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) {
|
||||||
|
|
||||||
// P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
|
// P(phi) = e^{- phi^dag V (MdagM)^-1 Vdag phi}
|
||||||
//
|
//
|
||||||
|
@ -55,7 +55,7 @@ public:
|
|||||||
}
|
}
|
||||||
virtual std::string action_name() {return "ScalarAction";}
|
virtual std::string action_name() {return "ScalarAction";}
|
||||||
|
|
||||||
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // noop as no pseudoferms
|
virtual void refresh(const Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {} // noop as no pseudoferms
|
||||||
|
|
||||||
virtual RealD S(const Field &p) {
|
virtual RealD S(const Field &p) {
|
||||||
return (mass_square * 0.5 + Nd) * ScalarObs<Impl>::sumphisquared(p) +
|
return (mass_square * 0.5 + Nd) * ScalarObs<Impl>::sumphisquared(p) +
|
||||||
|
@ -27,7 +27,7 @@ public:
|
|||||||
typedef Field FermionField;
|
typedef Field FermionField;
|
||||||
typedef Field PropagatorField;
|
typedef Field PropagatorField;
|
||||||
|
|
||||||
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
|
static inline void generate_momenta(Field& P, GridSerialRNG &sRNG, GridParallelRNG& pRNG){
|
||||||
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
|
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
|
||||||
gaussian(pRNG, P);
|
gaussian(pRNG, P);
|
||||||
P *= scale;
|
P *= scale;
|
||||||
@ -55,6 +55,10 @@ public:
|
|||||||
U = 1.0;
|
U = 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void Project(Field &U) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
static void MomentumSpacePropagator(Field &out, RealD m)
|
static void MomentumSpacePropagator(Field &out, RealD m)
|
||||||
{
|
{
|
||||||
GridBase *grid = out.Grid();
|
GridBase *grid = out.Grid();
|
||||||
@ -147,7 +151,7 @@ public:
|
|||||||
out = one / out;
|
out = one / out;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
|
static inline void generate_momenta(Field &P, GridSerialRNG & sRNG, GridParallelRNG &pRNG)
|
||||||
{
|
{
|
||||||
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
|
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
|
||||||
#ifndef USE_FFT_ACCELERATION
|
#ifndef USE_FFT_ACCELERATION
|
||||||
@ -234,6 +238,10 @@ public:
|
|||||||
#endif //USE_FFT_ACCELERATION
|
#endif //USE_FFT_ACCELERATION
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void Project(Field &U) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
|
||||||
}
|
}
|
||||||
|
@ -77,7 +77,7 @@ public:
|
|||||||
|
|
||||||
virtual std::string action_name() { return "ScalarAction"; }
|
virtual std::string action_name() { return "ScalarAction"; }
|
||||||
|
|
||||||
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
|
virtual void refresh(const Field &U, GridSerialRNG & sRNG, GridParallelRNG &pRNG) {}
|
||||||
|
|
||||||
virtual RealD S(const Field &p)
|
virtual RealD S(const Field &p)
|
||||||
{
|
{
|
||||||
|
@ -159,6 +159,13 @@ private:
|
|||||||
Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
|
Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
|
||||||
Resources.GetSerialRNG(),
|
Resources.GetSerialRNG(),
|
||||||
Resources.GetParallelRNG());
|
Resources.GetParallelRNG());
|
||||||
|
} else {
|
||||||
|
// others
|
||||||
|
std::cout << GridLogError << "Unrecognized StartingType\n";
|
||||||
|
std::cout
|
||||||
|
<< GridLogError
|
||||||
|
<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
|
||||||
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
Smearing.set_Field(U);
|
Smearing.set_Field(U);
|
||||||
|
@ -139,7 +139,7 @@ private:
|
|||||||
// Evolution
|
// Evolution
|
||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
RealD evolve_hmc_step(Field &U) {
|
RealD evolve_hmc_step(Field &U) {
|
||||||
TheIntegrator.refresh(U, pRNG); // set U and initialize P and phi's
|
TheIntegrator.refresh(U, sRNG, pRNG); // set U and initialize P and phi's
|
||||||
|
|
||||||
RealD H0 = TheIntegrator.S(U); // initial state action
|
RealD H0 = TheIntegrator.S(U); // initial state action
|
||||||
|
|
||||||
|
@ -74,7 +74,7 @@ public:
|
|||||||
conf_file = os.str();
|
conf_file = os.str();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
virtual ~BaseHmcCheckpointer(){};
|
||||||
void check_filename(const std::string &filename){
|
void check_filename(const std::string &filename){
|
||||||
std::ifstream f(filename.c_str());
|
std::ifstream f(filename.c_str());
|
||||||
if(!f.good()){
|
if(!f.good()){
|
||||||
@ -82,7 +82,6 @@ public:
|
|||||||
abort();
|
abort();
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void initialize(const CheckpointerParameters &Params) = 0;
|
virtual void initialize(const CheckpointerParameters &Params) = 0;
|
||||||
|
|
||||||
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
|
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
|
||||||
|
@ -45,6 +45,7 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
INHERIT_GIMPL_TYPES(Implementation);
|
INHERIT_GIMPL_TYPES(Implementation);
|
||||||
|
typedef GaugeStatistics<Implementation> GaugeStats;
|
||||||
|
|
||||||
ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
|
ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
|
||||||
|
|
||||||
@ -78,7 +79,7 @@ public:
|
|||||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
IldgWriter _IldgWriter(grid->IsBoss());
|
IldgWriter _IldgWriter(grid->IsBoss());
|
||||||
_IldgWriter.open(config);
|
_IldgWriter.open(config);
|
||||||
_IldgWriter.writeConfiguration(U, traj, config, config);
|
_IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
|
||||||
_IldgWriter.close();
|
_IldgWriter.close();
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
|
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
|
||||||
@ -105,7 +106,7 @@ public:
|
|||||||
FieldMetaData header;
|
FieldMetaData header;
|
||||||
IldgReader _IldgReader;
|
IldgReader _IldgReader;
|
||||||
_IldgReader.open(config);
|
_IldgReader.open(config);
|
||||||
_IldgReader.readConfiguration(U,header); // format from the header
|
_IldgReader.readConfiguration<GaugeStats>(U,header); // format from the header
|
||||||
_IldgReader.close();
|
_IldgReader.close();
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Read ILDG Configuration from " << config
|
std::cout << GridLogMessage << "Read ILDG Configuration from " << config
|
||||||
|
@ -43,6 +43,7 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
INHERIT_GIMPL_TYPES(Gimpl); // only for gauge configurations
|
INHERIT_GIMPL_TYPES(Gimpl); // only for gauge configurations
|
||||||
|
typedef GaugeStatistics<Gimpl> GaugeStats;
|
||||||
|
|
||||||
NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
|
NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
|
||||||
|
|
||||||
@ -60,7 +61,7 @@ public:
|
|||||||
int precision32 = 1;
|
int precision32 = 1;
|
||||||
int tworow = 0;
|
int tworow = 0;
|
||||||
NerscIO::writeRNGState(sRNG, pRNG, rng);
|
NerscIO::writeRNGState(sRNG, pRNG, rng);
|
||||||
NerscIO::writeConfiguration(U, config, tworow, precision32);
|
NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -74,7 +75,7 @@ public:
|
|||||||
|
|
||||||
FieldMetaData header;
|
FieldMetaData header;
|
||||||
NerscIO::readRNGState(sRNG, pRNG, header, rng);
|
NerscIO::readRNGState(sRNG, pRNG, header, rng);
|
||||||
NerscIO::readConfiguration(U, header, config);
|
NerscIO::readConfiguration<GaugeStats>(U, header, config);
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -33,6 +33,7 @@ directory
|
|||||||
#define INTEGRATOR_INCLUDED
|
#define INTEGRATOR_INCLUDED
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include "MomentumFilter.h"
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
@ -78,8 +79,19 @@ protected:
|
|||||||
RepresentationPolicy Representations;
|
RepresentationPolicy Representations;
|
||||||
IntegratorParameters Params;
|
IntegratorParameters Params;
|
||||||
|
|
||||||
|
//Filters allow the user to manipulate the conjugate momentum, for example to freeze links in DDHMC
|
||||||
|
//It is applied whenever the momentum is updated / refreshed
|
||||||
|
//The default filter does nothing
|
||||||
|
MomentumFilterBase<MomentaField> const* MomFilter;
|
||||||
|
|
||||||
const ActionSet<Field, RepresentationPolicy> as;
|
const ActionSet<Field, RepresentationPolicy> as;
|
||||||
|
|
||||||
|
//Get a pointer to a shared static instance of the "do-nothing" momentum filter to serve as a default
|
||||||
|
static MomentumFilterBase<MomentaField> const* getDefaultMomFilter(){
|
||||||
|
static MomentumFilterNone<MomentaField> filter;
|
||||||
|
return &filter;
|
||||||
|
}
|
||||||
|
|
||||||
void update_P(Field& U, int level, double ep)
|
void update_P(Field& U, int level, double ep)
|
||||||
{
|
{
|
||||||
t_P[level] += ep;
|
t_P[level] += ep;
|
||||||
@ -135,6 +147,8 @@ protected:
|
|||||||
|
|
||||||
// Force from the other representations
|
// Force from the other representations
|
||||||
as[level].apply(update_P_hireps, Representations, Mom, U, ep);
|
as[level].apply(update_P_hireps, Representations, Mom, U, ep);
|
||||||
|
|
||||||
|
MomFilter->applyFilter(Mom);
|
||||||
}
|
}
|
||||||
|
|
||||||
void update_U(Field& U, double ep)
|
void update_U(Field& U, double ep)
|
||||||
@ -174,12 +188,24 @@ public:
|
|||||||
t_P.resize(levels, 0.0);
|
t_P.resize(levels, 0.0);
|
||||||
t_U = 0.0;
|
t_U = 0.0;
|
||||||
// initialization of smearer delegated outside of Integrator
|
// initialization of smearer delegated outside of Integrator
|
||||||
|
|
||||||
|
//Default the momentum filter to "do-nothing"
|
||||||
|
MomFilter = getDefaultMomFilter();
|
||||||
};
|
};
|
||||||
|
|
||||||
virtual ~Integrator() {}
|
virtual ~Integrator() {}
|
||||||
|
|
||||||
virtual std::string integrator_name() = 0;
|
virtual std::string integrator_name() = 0;
|
||||||
|
|
||||||
|
//Set the momentum filter allowing for manipulation of the conjugate momentum
|
||||||
|
void setMomentumFilter(const MomentumFilterBase<MomentaField> &filter){
|
||||||
|
MomFilter = &filter;
|
||||||
|
}
|
||||||
|
|
||||||
|
//Access the conjugate momentum
|
||||||
|
const MomentaField & getMomentum() const{ return P; }
|
||||||
|
|
||||||
|
|
||||||
void print_parameters()
|
void print_parameters()
|
||||||
{
|
{
|
||||||
std::cout << GridLogMessage << "[Integrator] Name : "<< integrator_name() << std::endl;
|
std::cout << GridLogMessage << "[Integrator] Name : "<< integrator_name() << std::endl;
|
||||||
@ -210,10 +236,9 @@ public:
|
|||||||
// over the representations
|
// over the representations
|
||||||
struct _refresh {
|
struct _refresh {
|
||||||
template <class FieldType, class Repr>
|
template <class FieldType, class Repr>
|
||||||
void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep,
|
void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, GridSerialRNG & sRNG, GridParallelRNG& pRNG) {
|
||||||
GridParallelRNG& pRNG) {
|
|
||||||
for (int a = 0; a < repr_set.size(); ++a){
|
for (int a = 0; a < repr_set.size(); ++a){
|
||||||
repr_set.at(a)->refresh(Rep.U, pRNG);
|
repr_set.at(a)->refresh(Rep.U, sRNG, pRNG);
|
||||||
|
|
||||||
std::cout << GridLogDebug << "Hirep refreshing pseudofermions" << std::endl;
|
std::cout << GridLogDebug << "Hirep refreshing pseudofermions" << std::endl;
|
||||||
}
|
}
|
||||||
@ -221,12 +246,12 @@ public:
|
|||||||
} refresh_hireps{};
|
} refresh_hireps{};
|
||||||
|
|
||||||
// Initialization of momenta and actions
|
// Initialization of momenta and actions
|
||||||
void refresh(Field& U, GridParallelRNG& pRNG)
|
void refresh(Field& U, GridSerialRNG & sRNG, GridParallelRNG& pRNG)
|
||||||
{
|
{
|
||||||
assert(P.Grid() == U.Grid());
|
assert(P.Grid() == U.Grid());
|
||||||
std::cout << GridLogIntegrator << "Integrator refresh\n";
|
std::cout << GridLogIntegrator << "Integrator refresh\n";
|
||||||
|
|
||||||
FieldImplementation::generate_momenta(P, pRNG);
|
FieldImplementation::generate_momenta(P, sRNG, pRNG);
|
||||||
|
|
||||||
// Update the smeared fields, can be implemented as observer
|
// Update the smeared fields, can be implemented as observer
|
||||||
// necessary to keep the fields updated even after a reject
|
// necessary to keep the fields updated even after a reject
|
||||||
@ -243,12 +268,14 @@ public:
|
|||||||
// get gauge field from the SmearingPolicy and
|
// get gauge field from the SmearingPolicy and
|
||||||
// based on the boolean is_smeared in actionID
|
// based on the boolean is_smeared in actionID
|
||||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||||
as[level].actions.at(actionID)->refresh(Us, pRNG);
|
as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Refresh the higher representation actions
|
// Refresh the higher representation actions
|
||||||
as[level].apply(refresh_hireps, Representations, pRNG);
|
as[level].apply(refresh_hireps, Representations, sRNG, pRNG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
MomFilter->applyFilter(P);
|
||||||
}
|
}
|
||||||
|
|
||||||
// to be used by the actionlevel class to iterate
|
// to be used by the actionlevel class to iterate
|
||||||
@ -313,6 +340,8 @@ public:
|
|||||||
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
|
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FieldImplementation::Project(U);
|
||||||
|
|
||||||
// and that we indeed got to the end of the trajectory
|
// and that we indeed got to the end of the trajectory
|
||||||
assert(fabs(t_U - Params.trajL) < 1.0e-6);
|
assert(fabs(t_U - Params.trajL) < 1.0e-6);
|
||||||
|
|
||||||
|
94
Grid/qcd/hmc/integrators/MomentumFilter.h
Normal file
94
Grid/qcd/hmc/integrators/MomentumFilter.h
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/hmc/integrators/MomentumFilter.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Christopher Kelly <ckelly@bnl.gov>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
#ifndef MOMENTUM_FILTER
|
||||||
|
#define MOMENTUM_FILTER
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
//These filter objects allow the user to manipulate the conjugate momentum as part of the update / refresh
|
||||||
|
|
||||||
|
template<typename MomentaField>
|
||||||
|
struct MomentumFilterBase{
|
||||||
|
virtual void applyFilter(MomentaField &P) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
//Do nothing
|
||||||
|
template<typename MomentaField>
|
||||||
|
struct MomentumFilterNone: public MomentumFilterBase<MomentaField>{
|
||||||
|
void applyFilter(MomentaField &P) const override{}
|
||||||
|
};
|
||||||
|
|
||||||
|
//Multiply each site/direction by a Lorentz vector complex number field
|
||||||
|
//Can be used to implement a mask, zeroing out sites
|
||||||
|
template<typename MomentaField>
|
||||||
|
struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
|
||||||
|
typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
|
||||||
|
typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
|
||||||
|
typedef iVector<iScalar<iScalar<vector_type> >, Nd > LorentzScalarType; //complex phase for each site/direction
|
||||||
|
typedef Lattice<LorentzScalarType> LatticeLorentzScalarType;
|
||||||
|
|
||||||
|
LatticeLorentzScalarType phase;
|
||||||
|
|
||||||
|
MomentumFilterApplyPhase(const LatticeLorentzScalarType _phase): phase(_phase){}
|
||||||
|
|
||||||
|
//Default to uniform field of (1,0)
|
||||||
|
MomentumFilterApplyPhase(GridBase* _grid): phase(_grid){
|
||||||
|
LorentzScalarType one;
|
||||||
|
for(int mu=0;mu<Nd;mu++)
|
||||||
|
one(mu)()() = scalar_type(1.);
|
||||||
|
|
||||||
|
phase = one;
|
||||||
|
}
|
||||||
|
|
||||||
|
void applyFilter(MomentaField &P) const override{
|
||||||
|
conformable(P,phase);
|
||||||
|
autoView( P_v , P, AcceleratorWrite);
|
||||||
|
autoView( phase_v , phase, AcceleratorRead);
|
||||||
|
|
||||||
|
accelerator_for(ss,P_v.size(),MomentaField::vector_type::Nsimd(),{
|
||||||
|
auto site_mom = P_v(ss);
|
||||||
|
auto site_phase = phase_v(ss);
|
||||||
|
for(int mu=0;mu<Nd;mu++)
|
||||||
|
site_mom(mu) = site_mom(mu) * site_phase(mu);
|
||||||
|
coalescedWrite(P_v[ss], site_mom);
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
#endif
|
@ -99,7 +99,7 @@ public:
|
|||||||
virtual Prod* getPtr() = 0;
|
virtual Prod* getPtr() = 0;
|
||||||
|
|
||||||
// add a getReference?
|
// add a getReference?
|
||||||
|
virtual ~HMCModuleBase(){};
|
||||||
virtual void print_parameters(){}; // default to nothing
|
virtual void print_parameters(){}; // default to nothing
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -85,21 +85,18 @@ public:
|
|||||||
|
|
||||||
std::cout << GridLogDebug << "Stout smearing started\n";
|
std::cout << GridLogDebug << "Stout smearing started\n";
|
||||||
|
|
||||||
// Smear the configurations
|
// C contains the staples multiplied by some rho
|
||||||
|
u_smr = U ; // set the smeared field to the current gauge field
|
||||||
SmearBase->smear(C, U);
|
SmearBase->smear(C, U);
|
||||||
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
for (int mu = 0; mu < Nd; mu++) {
|
||||||
if( mu == OrthogDim )
|
if( mu == OrthogDim ) continue ;
|
||||||
tmp = 1.0; // Don't smear in the orthogonal direction
|
// u_smr = exp(iQ_mu)*U_mu apart from Orthogdim
|
||||||
else {
|
Umu = peekLorentz(U, mu);
|
||||||
tmp = peekLorentz(C, mu);
|
tmp = peekLorentz(C, mu);
|
||||||
Umu = peekLorentz(U, mu);
|
iq_mu = Ta( tmp * adj(Umu));
|
||||||
iq_mu = Ta(
|
exponentiate_iQ(tmp, iq_mu);
|
||||||
tmp *
|
pokeLorentz(u_smr, tmp * Umu, mu);
|
||||||
adj(Umu)); // iq_mu = Ta(Omega_mu) to match the signs with the paper
|
|
||||||
exponentiate_iQ(tmp, iq_mu);
|
|
||||||
}
|
|
||||||
pokeLorentz(u_smr, tmp * Umu, mu); // u_smr = exp(iQ_mu)*U_mu
|
|
||||||
}
|
}
|
||||||
std::cout << GridLogDebug << "Stout smearing completed\n";
|
std::cout << GridLogDebug << "Stout smearing completed\n";
|
||||||
};
|
};
|
||||||
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user