mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-16 14:57:05 +01:00
Compare commits
392 Commits
sycl
...
feature/co
Author | SHA1 | Date | |
---|---|---|---|
3c23a947cc | |||
56111bb823 | |||
99445673f6 | |||
97a59643f7 | |||
579595f547 | |||
281ac5fc12 | |||
d8fa903b02 | |||
eaff0f3aeb | |||
e8e20c01b2 | |||
a4afc3ea2a | |||
3fe75bc7cb | |||
45d49d8648 | |||
6013183361 | |||
4b882e8056 | |||
3f9ae6e7e7 | |||
909acd55cd | |||
4dd9e39e0d | |||
7adb253e25 | |||
873519e960 | |||
9aec4a3c26 | |||
70510d151b | |||
9e7bacb5a4 | |||
2ef1fa66a8 | |||
cf76741ec6 | |||
497e7c1c40 | |||
888eacd3b8 | |||
321f0f51b5 | |||
30ad9578a2 | |||
9dce101586 | |||
97e264d0ff | |||
683a5e5bf5 | |||
d4861a362c | |||
5ff3eae027 | |||
147dc15d26 | |||
c61ea72949 | |||
86e8b9fe38 | |||
612e468889 | |||
4ea8d128c2 | |||
e49b7f2f88 | |||
aace3d47b9 | |||
d5049949a4 | |||
f1c7480e3c | |||
5adae5d6ff | |||
a8412ace05 | |||
9fd1c2ad4b | |||
4cf3575353 | |||
804a810d68 | |||
8fcb392e24 | |||
dd8d70eeff | |||
aa8aba6543 | |||
13df14f96e | |||
3aab983760 | |||
9c4dcc5ea3 | |||
a1063ddbb9 | |||
18ef8056ec | |||
1c673977fa | |||
e9bc748828 | |||
f48156529b | |||
d05ce01809 | |||
cf23eff60e | |||
6e313575be | |||
b13d1f7238 | |||
b5e7945dd9 | |||
7535566f54 | |||
50b808ab33 | |||
f16c2665f5 | |||
41e28015ae | |||
a0ccbb3bd6 | |||
5eeabaa2bb | |||
00d0d6d008 | |||
537a9f7030 | |||
cc9c993f74 | |||
d10422ded8 | |||
f313565a3c | |||
b3881d2636 | |||
61d5860b46 | |||
52d17987dc | |||
19d8bba97d | |||
463d72d322 | |||
d060341168 | |||
c772bcd514 | |||
3362f8dfa0 | |||
bf3c9857e0 | |||
a88b3ceca5 | |||
aa135412f5 | |||
9945399e60 | |||
5eeffa49e8 | |||
3f06209720 | |||
12e239dd9f | |||
af2301afbb | |||
f98856a26f | |||
d55cc5b380 | |||
c2b688abc9 | |||
b0d61b9687 | |||
5f893bf9af | |||
0e17bd6597 | |||
22caa158cc | |||
b24a504d7c | |||
992ef6e9fc | |||
f32a320bc3 | |||
5f0fe029d2 | |||
6b1486e89b | |||
3f9c427a3a | |||
d201277652 | |||
fdda7cf9cf | |||
e22d30f715 | |||
1ba25a0d8c | |||
9ba3647bdf | |||
5ee832f738 | |||
467deee46f | |||
35a69a5133 | |||
e9c5a271a8 | |||
acac2d6938 | |||
97db2b8d20 | |||
80fd6ab407 | |||
5534921bee | |||
ace9cd64bb | |||
a3e2aeb603 | |||
049dd25785 | |||
d43d372294 | |||
b71a081cba | |||
c48909590b | |||
446ef40570 | |||
81441e98f4 | |||
ecd3f890f5 | |||
1c881ce23c | |||
dacbbdd051 | |||
2859955a03 | |||
cc220abd1d | |||
d1c0c0197e | |||
fd9424ef27 | |||
a5c35c4024 | |||
e03b64dc06 | |||
4677c40195 | |||
288c615782 | |||
48e81cf6f8 | |||
5cffa05c7e | |||
d50a2164d7 | |||
32ff766dbd | |||
01652d8cfe | |||
4d2dc7ba03 | |||
51d1beb1f3 | |||
65b724bb5f | |||
6dbd117aa5 | |||
198b29f618 | |||
a8309638d4 | |||
f98a4e880e | |||
8244caff25 | |||
bcd7895362 | |||
85b1c5df39 | |||
b4255140d6 | |||
0c3095e173 | |||
d3ce60713d | |||
eac1f08b7b | |||
1654c4f3c0 | |||
8807d998bc | |||
5791021dcd | |||
c273fb051c | |||
c545530170 | |||
d982a5b6d5 | |||
15ca8637f3 | |||
cbc995b74c | |||
8b74174d74 | |||
e21fef17df | |||
3d27708f07 | |||
b918744184 | |||
7d14a3c086 | |||
e14a84317d | |||
6c31b99f1f | |||
9522dcd611 | |||
ed469898dc | |||
1eee94a809 | |||
54523369a3 | |||
a98c91c2a5 | |||
a9b92867a8 | |||
65920faeba | |||
249e2db87d | |||
cf3535d16e | |||
d61ee817f4 | |||
3448b7387c | |||
47b89d2739 | |||
2a75516330 | |||
b2087f14c4 | |||
dd1ba266b2 | |||
1292d59563 | |||
9877ed9bf8 | |||
f0dc0f3621 | |||
1efe30d6cc | |||
0b787e9fe0 | |||
37ec4b241c | |||
63b0a19f37 | |||
90ea7dfa99 | |||
f866d7c33e | |||
542bdef198 | |||
06007db3d9 | |||
12e6059a70 | |||
dbaa24ebf6 | |||
3276aa67dc | |||
3b30b9f0c0 | |||
69db4816f7 | |||
3abe09025a | |||
e33878e0de | |||
27b4fbf3f0 | |||
968a90633a | |||
6365a89ba3 | |||
ddbb008694 | |||
7997e0a449 | |||
197612bc7a | |||
0e88bf4bff | |||
3e64d78469 | |||
2004611def | |||
a2868c96a4 | |||
7cf7f11e1a | |||
ea7f8fda5e | |||
906b78811b | |||
97703b181b | |||
d9474c6cb6 | |||
bbd145382b | |||
1b08cb7300 | |||
337d9dc043 | |||
8726e94ea7 | |||
67db4993c2 | |||
f1f655d92b | |||
43334e88c3 | |||
4f1e66b044 | |||
fd3c8b0e85 | |||
1635c263ee | |||
64fe5b21b4 | |||
ee9889821d | |||
eb470aa6dc | |||
77af9a3ddc | |||
102089798c | |||
39cea8b5a7 | |||
a65f66d2db | |||
936c5ecf69 | |||
22cfbdbbb3 | |||
093d1ee21b | |||
d6ba2581ce | |||
577c064184 | |||
2ff1fa6fad | |||
70be1bd8be | |||
4ef50ba31f | |||
3e97a26f90 | |||
599f28f6ef | |||
5b117865b2 | |||
05bbc49a99 | |||
81a8209749 | |||
a87e45ba25 | |||
465856331a | |||
cc958aa9ed | |||
a25e4b3d0c | |||
d1210ca12a | |||
36ea0e222a | |||
92281ec22d | |||
87266ce099 | |||
2a23f133e8 | |||
8dbf790f62 | |||
2402b4940e | |||
2111052fbe | |||
f0d17d2b49 | |||
244c003a1b | |||
433766ac62 | |||
93a37c8f68 | |||
9872c76825 | |||
5ee3ea2144 | |||
5050833b42 | |||
7bee4ebb54 | |||
71cf9851e7 | |||
b4735c9904 | |||
9b2699226c | |||
5f52804907 | |||
936071773e | |||
1732f9319e | |||
91c81cab30 | |||
38164f8480 | |||
f013979791 | |||
e947b563ea | |||
5cb3530c34 | |||
250008372f | |||
4fedd8d29f | |||
6ddcef1bca | |||
8c5a5fdfce | |||
046b1cbbc0 | |||
a65ce237c1 | |||
cd27f1005d | |||
f8c0a59221 | |||
832485699f | |||
81484a4760 | |||
9a86059761 | |||
b780b7b7a0 | |||
9e085bd04e | |||
6b6bf537d3 | |||
323a651c71 | |||
9f212679f1 | |||
032f7dde1a | |||
50b1db1e8b | |||
015d8bb38a | |||
10a34312dc | |||
db8c0e7584 | |||
d15ccad8a7 | |||
0009b5cee8 | |||
20d1941a45 | |||
b7c76ede29 | |||
05edf803bd | |||
78b8e40f83 | |||
fc2e9850d3 | |||
ffaaed679e | |||
b2fd8b993a | |||
291ee8c3d0 | |||
e1a5b3ea49 | |||
55a55660cb | |||
ceb8b374da | |||
4bc2ad2894 | |||
798af3e68f | |||
b0ef2367f3 | |||
71a7350a85 | |||
6f79369955 | |||
f9cb6b979f | |||
ed4d9d17f8 | |||
fbed02690d | |||
39f3ae5b1d | |||
e64bec8c8e | |||
0893b4e552 | |||
92f0f29670 | |||
48a340a9d1 | |||
f45621109b | |||
32d1a0bbea | |||
267cce66a1 | |||
3417147b11 | |||
b338719bc8 | |||
2b81cbe2c2 | |||
acff9d6ed2 | |||
a306a49788 | |||
7ef03c5368 | |||
5abec5b8a9 | |||
499edc0636 | |||
d990e61be3 | |||
3edb2dc2da | |||
345721220e | |||
6db68d6ecb | |||
09f0963d1f | |||
6f44e3c192 | |||
5893888f87 | |||
39b448affb | |||
e54a8f05a9 | |||
64b72fc17f | |||
6fdce60492 | |||
852db4626a | |||
6504a098cc | |||
79a385faca | |||
c12a67030a | |||
581392f2f2 | |||
113f277b6a | |||
974586bedc | |||
160f78c1e4 | |||
7e4e1bbbc2 | |||
e699b7e9f9 | |||
a28bc0de90 | |||
14d0fe4d6c | |||
0ad2e0815c | |||
1c8ca05e16 | |||
dc9c8340bb | |||
19eef97503 | |||
635246ce50 | |||
5cdbb7e71e | |||
8123590a1b | |||
86c9c4da8b | |||
cd1efee866 | |||
bd310932f7 | |||
304762e7ac | |||
d79ab03a6c | |||
d5708e0eb2 | |||
123f6b7a61 | |||
2b6457dd9a | |||
b367cbd422 | |||
e252c1aca3 | |||
b140c6a4f9 | |||
326de36467 | |||
9f224a1647 | |||
bb46ba9b5f | |||
dd5a22b36b | |||
1ea85b9972 | |||
8fb63f1c25 | |||
77fa586f6c | |||
15238e8d5e | |||
b27e31957a | |||
46927771e3 | |||
d8cea77707 | |||
5f8a76d490 | |||
28d49a3b60 | |||
b4c624ece6 | |||
2c22db841a |
@ -9,11 +9,6 @@ matrix:
|
|||||||
- os: osx
|
- os: osx
|
||||||
osx_image: xcode8.3
|
osx_image: xcode8.3
|
||||||
compiler: clang
|
compiler: clang
|
||||||
env: PREC=single
|
|
||||||
- os: osx
|
|
||||||
osx_image: xcode8.3
|
|
||||||
compiler: clang
|
|
||||||
env: PREC=double
|
|
||||||
|
|
||||||
before_install:
|
before_install:
|
||||||
- export GRIDDIR=`pwd`
|
- export GRIDDIR=`pwd`
|
||||||
@ -55,7 +50,7 @@ script:
|
|||||||
- make -j4
|
- make -j4
|
||||||
- make install
|
- make install
|
||||||
- cd $CWD/build
|
- cd $CWD/build
|
||||||
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
|
- ../configure --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
|
||||||
- make -j4
|
- make -j4
|
||||||
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
||||||
- make check
|
- make check
|
||||||
|
@ -37,7 +37,9 @@ directory
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
//disables and intel compiler specific warning (in json.hpp)
|
//disables and intel compiler specific warning (in json.hpp)
|
||||||
|
#ifdef __ICC
|
||||||
#pragma warning disable 488
|
#pragma warning disable 488
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __NVCC__
|
#ifdef __NVCC__
|
||||||
//disables nvcc specific warning in json.hpp
|
//disables nvcc specific warning in json.hpp
|
||||||
|
@ -28,4 +28,7 @@
|
|||||||
///////////////////
|
///////////////////
|
||||||
#include "Config.h"
|
#include "Config.h"
|
||||||
|
|
||||||
|
#ifdef TOFU
|
||||||
|
#undef GRID_COMMS_THREADS
|
||||||
|
#endif
|
||||||
#endif /* GRID_STD_H */
|
#endif /* GRID_STD_H */
|
||||||
|
@ -34,6 +34,12 @@
|
|||||||
#define __SYCL__REDEFINE__
|
#define __SYCL__REDEFINE__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* HIP save and restore compile environment*/
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
#pragma push
|
||||||
|
#pragma push_macro("__HIP_DEVICE_COMPILE__")
|
||||||
|
#endif
|
||||||
|
#define EIGEN_NO_HIP
|
||||||
|
|
||||||
#include <Grid/Eigen/Dense>
|
#include <Grid/Eigen/Dense>
|
||||||
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
#include <Grid/Eigen/unsupported/CXX11/Tensor>
|
||||||
@ -42,7 +48,7 @@
|
|||||||
#ifdef __NVCC__REDEFINE__
|
#ifdef __NVCC__REDEFINE__
|
||||||
#pragma pop_macro("__CUDACC__")
|
#pragma pop_macro("__CUDACC__")
|
||||||
#pragma pop_macro("__NVCC__")
|
#pragma pop_macro("__NVCC__")
|
||||||
#pragma pop_macro("GRID_SIMT")
|
#pragma pop_macro("__CUDA_ARCH__")
|
||||||
#pragma pop
|
#pragma pop
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -52,6 +58,12 @@
|
|||||||
#pragma pop
|
#pragma pop
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/*HIP restore*/
|
||||||
|
#ifdef __HIP__REDEFINE__
|
||||||
|
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
|
||||||
|
#pragma pop
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined __GNUC__
|
#if defined __GNUC__
|
||||||
#pragma GCC diagnostic pop
|
#pragma GCC diagnostic pop
|
||||||
#endif
|
#endif
|
||||||
|
@ -21,6 +21,7 @@ if BUILD_HDF5
|
|||||||
extra_headers+=serialisation/Hdf5Type.h
|
extra_headers+=serialisation/Hdf5Type.h
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
all: version-cache Version.h
|
all: version-cache Version.h
|
||||||
|
|
||||||
version-cache:
|
version-cache:
|
||||||
@ -53,6 +54,17 @@ Version.h: version-cache
|
|||||||
include Make.inc
|
include Make.inc
|
||||||
include Eigen.inc
|
include Eigen.inc
|
||||||
|
|
||||||
|
extra_sources+=$(ZWILS_FERMION_FILES)
|
||||||
|
extra_sources+=$(WILS_FERMION_FILES)
|
||||||
|
extra_sources+=$(STAG_FERMION_FILES)
|
||||||
|
if BUILD_GPARITY
|
||||||
|
extra_sources+=$(GP_FERMION_FILES)
|
||||||
|
endif
|
||||||
|
if BUILD_FERMION_REPS
|
||||||
|
extra_sources+=$(ADJ_FERMION_FILES)
|
||||||
|
extra_sources+=$(TWOIND_FERMION_FILES)
|
||||||
|
endif
|
||||||
|
|
||||||
lib_LIBRARIES = libGrid.a
|
lib_LIBRARIES = libGrid.a
|
||||||
|
|
||||||
CCFILES += $(extra_sources)
|
CCFILES += $(extra_sources)
|
||||||
|
@ -31,6 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
|
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
|
||||||
#define GRID_ALGORITHM_COARSENED_MATRIX_H
|
#define GRID_ALGORITHM_COARSENED_MATRIX_H
|
||||||
|
|
||||||
|
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
@ -59,12 +60,14 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
|
|||||||
class Geometry {
|
class Geometry {
|
||||||
public:
|
public:
|
||||||
int npoint;
|
int npoint;
|
||||||
|
int base;
|
||||||
std::vector<int> directions ;
|
std::vector<int> directions ;
|
||||||
std::vector<int> displacements;
|
std::vector<int> displacements;
|
||||||
|
std::vector<int> points_dagger;
|
||||||
|
|
||||||
Geometry(int _d) {
|
Geometry(int _d) {
|
||||||
|
|
||||||
int base = (_d==5) ? 1:0;
|
base = (_d==5) ? 1:0;
|
||||||
|
|
||||||
// make coarse grid stencil for 4d , not 5d
|
// make coarse grid stencil for 4d , not 5d
|
||||||
if ( _d==5 ) _d=4;
|
if ( _d==5 ) _d=4;
|
||||||
@ -72,16 +75,51 @@ public:
|
|||||||
npoint = 2*_d+1;
|
npoint = 2*_d+1;
|
||||||
directions.resize(npoint);
|
directions.resize(npoint);
|
||||||
displacements.resize(npoint);
|
displacements.resize(npoint);
|
||||||
|
points_dagger.resize(npoint);
|
||||||
for(int d=0;d<_d;d++){
|
for(int d=0;d<_d;d++){
|
||||||
directions[d ] = d+base;
|
directions[d ] = d+base;
|
||||||
directions[d+_d] = d+base;
|
directions[d+_d] = d+base;
|
||||||
displacements[d ] = +1;
|
displacements[d ] = +1;
|
||||||
displacements[d+_d]= -1;
|
displacements[d+_d]= -1;
|
||||||
|
points_dagger[d ] = d+_d;
|
||||||
|
points_dagger[d+_d] = d;
|
||||||
}
|
}
|
||||||
directions [2*_d]=0;
|
directions [2*_d]=0;
|
||||||
displacements[2*_d]=0;
|
displacements[2*_d]=0;
|
||||||
|
points_dagger[2*_d]=2*_d;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int point(int dir, int disp) {
|
||||||
|
assert(disp == -1 || disp == 0 || disp == 1);
|
||||||
|
assert(base+0 <= dir && dir < base+4);
|
||||||
|
|
||||||
|
// directions faster index = new indexing
|
||||||
|
// 4d (base = 0):
|
||||||
|
// point 0 1 2 3 4 5 6 7 8
|
||||||
|
// dir 0 1 2 3 0 1 2 3 0
|
||||||
|
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
|
||||||
|
// 5d (base = 1):
|
||||||
|
// point 0 1 2 3 4 5 6 7 8
|
||||||
|
// dir 1 2 3 4 1 2 3 4 0
|
||||||
|
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
|
||||||
|
|
||||||
|
// displacements faster index = old indexing
|
||||||
|
// 4d (base = 0):
|
||||||
|
// point 0 1 2 3 4 5 6 7 8
|
||||||
|
// dir 0 0 1 1 2 2 3 3 0
|
||||||
|
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
|
||||||
|
// 5d (base = 1):
|
||||||
|
// point 0 1 2 3 4 5 6 7 8
|
||||||
|
// dir 1 1 2 2 3 3 4 4 0
|
||||||
|
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
|
||||||
|
|
||||||
|
if(dir == 0 and disp == 0)
|
||||||
|
return 8;
|
||||||
|
else // New indexing
|
||||||
|
return (1 - disp) / 2 * 4 + dir - base;
|
||||||
|
// else // Old indexing
|
||||||
|
// return (4 * (dir - base) + 1 - disp) / 2;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Fobj,class CComplex,int nbasis>
|
template<class Fobj,class CComplex,int nbasis>
|
||||||
@ -258,7 +296,7 @@ public:
|
|||||||
// Fine Object == (per site) type of fine field
|
// Fine Object == (per site) type of fine field
|
||||||
// nbasis == number of deflation vectors
|
// nbasis == number of deflation vectors
|
||||||
template<class Fobj,class CComplex,int nbasis>
|
template<class Fobj,class CComplex,int nbasis>
|
||||||
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
|
class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
typedef iVector<CComplex,nbasis > siteVector;
|
typedef iVector<CComplex,nbasis > siteVector;
|
||||||
@ -268,33 +306,59 @@ public:
|
|||||||
typedef iMatrix<CComplex,nbasis > Cobj;
|
typedef iMatrix<CComplex,nbasis > Cobj;
|
||||||
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
|
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
|
||||||
typedef Lattice<Fobj > FineField;
|
typedef Lattice<Fobj > FineField;
|
||||||
|
typedef CoarseVector FermionField;
|
||||||
|
|
||||||
|
// enrich interface, use default implementation as in FermionOperator ///////
|
||||||
|
void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
|
||||||
|
void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
|
||||||
|
void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
|
||||||
|
void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
|
||||||
|
void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
|
||||||
|
void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
|
||||||
|
|
||||||
////////////////////
|
////////////////////
|
||||||
// Data members
|
// Data members
|
||||||
////////////////////
|
////////////////////
|
||||||
Geometry geom;
|
Geometry geom;
|
||||||
GridBase * _grid;
|
GridBase * _grid;
|
||||||
|
GridBase* _cbgrid;
|
||||||
int hermitian;
|
int hermitian;
|
||||||
|
|
||||||
CartesianStencil<siteVector,siteVector,int> Stencil;
|
CartesianStencil<siteVector,siteVector,int> Stencil;
|
||||||
|
CartesianStencil<siteVector,siteVector,int> StencilEven;
|
||||||
|
CartesianStencil<siteVector,siteVector,int> StencilOdd;
|
||||||
|
|
||||||
std::vector<CoarseMatrix> A;
|
std::vector<CoarseMatrix> A;
|
||||||
|
std::vector<CoarseMatrix> Aeven;
|
||||||
|
std::vector<CoarseMatrix> Aodd;
|
||||||
|
|
||||||
|
CoarseMatrix AselfInv;
|
||||||
|
CoarseMatrix AselfInvEven;
|
||||||
|
CoarseMatrix AselfInvOdd;
|
||||||
|
|
||||||
|
Vector<RealD> dag_factor;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
///////////////////////
|
///////////////////////
|
||||||
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
|
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
|
||||||
|
GridBase * RedBlackGrid() { return _cbgrid; };
|
||||||
|
|
||||||
|
int ConstEE() { return 0; }
|
||||||
|
|
||||||
void M (const CoarseVector &in, CoarseVector &out)
|
void M (const CoarseVector &in, CoarseVector &out)
|
||||||
{
|
{
|
||||||
conformable(_grid,in.Grid());
|
conformable(_grid,in.Grid());
|
||||||
conformable(in.Grid(),out.Grid());
|
conformable(in.Grid(),out.Grid());
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
SimpleCompressor<siteVector> compressor;
|
SimpleCompressor<siteVector> compressor;
|
||||||
|
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
autoView( in_v , in, AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
|
auto& geom_v = geom;
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
@ -316,14 +380,14 @@ public:
|
|||||||
int ptype;
|
int ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
for(int point=0;point<geom.npoint;point++){
|
for(int point=0;point<geom_v.npoint;point++){
|
||||||
|
|
||||||
SE=Stencil.GetEntry(ptype,point,ss);
|
SE=Stencil_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
if(SE->_is_local) {
|
if(SE->_is_local) {
|
||||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
} else {
|
} else {
|
||||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
|
||||||
}
|
}
|
||||||
acceleratorSynchronise();
|
acceleratorSynchronise();
|
||||||
|
|
||||||
@ -344,12 +408,72 @@ public:
|
|||||||
return M(in,out);
|
return M(in,out);
|
||||||
} else {
|
} else {
|
||||||
// corresponds to Galerkin coarsening
|
// corresponds to Galerkin coarsening
|
||||||
CoarseVector tmp(Grid());
|
return MdagNonHermitian(in, out);
|
||||||
G5C(tmp, in);
|
|
||||||
M(tmp, out);
|
|
||||||
G5C(out, out);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void MdagNonHermitian(const CoarseVector &in, CoarseVector &out)
|
||||||
|
{
|
||||||
|
conformable(_grid,in.Grid());
|
||||||
|
conformable(in.Grid(),out.Grid());
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
|
SimpleCompressor<siteVector> compressor;
|
||||||
|
|
||||||
|
Stencil.HaloExchange(in,compressor);
|
||||||
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
|
auto& geom_v = geom;
|
||||||
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
|
|
||||||
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||||
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
|
||||||
|
int osites=Grid()->oSites();
|
||||||
|
|
||||||
|
Vector<int> points(geom.npoint, 0);
|
||||||
|
for(int p=0; p<geom.npoint; p++)
|
||||||
|
points[p] = geom.points_dagger[p];
|
||||||
|
|
||||||
|
RealD* dag_factor_p = &dag_factor[0];
|
||||||
|
|
||||||
|
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
for(int p=0;p<geom_v.npoint;p++){
|
||||||
|
int point = points[p];
|
||||||
|
|
||||||
|
SE=Stencil_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
|
||||||
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
|
}
|
||||||
|
|
||||||
void MdirComms(const CoarseVector &in)
|
void MdirComms(const CoarseVector &in)
|
||||||
{
|
{
|
||||||
SimpleCompressor<siteVector> compressor;
|
SimpleCompressor<siteVector> compressor;
|
||||||
@ -359,6 +483,7 @@ public:
|
|||||||
{
|
{
|
||||||
conformable(_grid,in.Grid());
|
conformable(_grid,in.Grid());
|
||||||
conformable(_grid,out.Grid());
|
conformable(_grid,out.Grid());
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
@ -367,6 +492,7 @@ public:
|
|||||||
|
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
autoView( in_v , in, AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
autoView( Stencil_v , Stencil, AcceleratorRead);
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
@ -380,12 +506,12 @@ public:
|
|||||||
int ptype;
|
int ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
SE=Stencil.GetEntry(ptype,point,ss);
|
SE=Stencil_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
if(SE->_is_local) {
|
if(SE->_is_local) {
|
||||||
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
} else {
|
} else {
|
||||||
nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
|
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
|
||||||
}
|
}
|
||||||
acceleratorSynchronise();
|
acceleratorSynchronise();
|
||||||
|
|
||||||
@ -413,34 +539,7 @@ public:
|
|||||||
|
|
||||||
this->MdirComms(in);
|
this->MdirComms(in);
|
||||||
|
|
||||||
int ndim = in.Grid()->Nd();
|
MdirCalc(in,out,geom.point(dir,disp));
|
||||||
|
|
||||||
//////////////
|
|
||||||
// 4D action like wilson
|
|
||||||
// 0+ => 0
|
|
||||||
// 0- => 1
|
|
||||||
// 1+ => 2
|
|
||||||
// 1- => 3
|
|
||||||
// etc..
|
|
||||||
//////////////
|
|
||||||
// 5D action like DWF
|
|
||||||
// 1+ => 0
|
|
||||||
// 1- => 1
|
|
||||||
// 2+ => 2
|
|
||||||
// 2- => 3
|
|
||||||
// etc..
|
|
||||||
auto point = [dir, disp, ndim](){
|
|
||||||
if(dir == 0 and disp == 0)
|
|
||||||
return 8;
|
|
||||||
else if ( ndim==4 ) {
|
|
||||||
return (4 * dir + 1 - disp) / 2;
|
|
||||||
} else {
|
|
||||||
return (4 * (dir-1) + 1 - disp) / 2;
|
|
||||||
}
|
|
||||||
}();
|
|
||||||
|
|
||||||
MdirCalc(in,out,point);
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void Mdiag(const CoarseVector &in, CoarseVector &out)
|
void Mdiag(const CoarseVector &in, CoarseVector &out)
|
||||||
@ -449,23 +548,296 @@ public:
|
|||||||
MdirCalc(in, out, point); // No comms
|
MdirCalc(in, out, point); // No comms
|
||||||
};
|
};
|
||||||
|
|
||||||
|
void Mooee(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
MooeeInternal(in, out, DaggerNo, InverseNo);
|
||||||
|
}
|
||||||
|
|
||||||
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
void MooeeInv(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
MooeeInternal(in, out, DaggerNo, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
void MooeeDag(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
MooeeInternal(in, out, DaggerYes, InverseNo);
|
||||||
|
}
|
||||||
|
|
||||||
|
void MooeeInvDag(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
MooeeInternal(in, out, DaggerYes, InverseYes);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Meooe(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
if(in.Checkerboard() == Odd) {
|
||||||
|
DhopEO(in, out, DaggerNo);
|
||||||
|
} else {
|
||||||
|
DhopOE(in, out, DaggerNo);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void MeooeDag(const CoarseVector &in, CoarseVector &out) {
|
||||||
|
if(in.Checkerboard() == Odd) {
|
||||||
|
DhopEO(in, out, DaggerYes);
|
||||||
|
} else {
|
||||||
|
DhopOE(in, out, DaggerYes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Dhop(const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
conformable(in.Grid(), _grid); // verifies full grid
|
||||||
|
conformable(in.Grid(), out.Grid());
|
||||||
|
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
|
DhopInternal(Stencil, A, in, out, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
|
assert(in.Checkerboard() == Even);
|
||||||
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
|
DhopInternal(StencilEven, Aodd, in, out, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
conformable(in.Grid(), _cbgrid); // verifies half grid
|
||||||
|
conformable(in.Grid(), out.Grid()); // drops the cb check
|
||||||
|
|
||||||
|
assert(in.Checkerboard() == Odd);
|
||||||
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
|
DhopInternal(StencilOdd, Aeven, in, out, dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
|
||||||
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
|
||||||
|
|
||||||
|
CoarseMatrix *Aself = nullptr;
|
||||||
|
if(in.Grid()->_isCheckerBoarded) {
|
||||||
|
if(in.Checkerboard() == Odd) {
|
||||||
|
Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1];
|
||||||
|
DselfInternal(StencilOdd, *Aself, in, out, dag);
|
||||||
|
} else {
|
||||||
|
Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1];
|
||||||
|
DselfInternal(StencilEven, *Aself, in, out, dag);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
|
||||||
|
DselfInternal(Stencil, *Aself, in, out, dag);
|
||||||
|
}
|
||||||
|
assert(Aself != nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
|
||||||
|
const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
int point = geom.npoint-1;
|
||||||
|
autoView( out_v, out, AcceleratorWrite);
|
||||||
|
autoView( in_v, in, AcceleratorRead);
|
||||||
|
autoView( st_v, st, AcceleratorRead);
|
||||||
|
autoView( a_v, a, AcceleratorRead);
|
||||||
|
|
||||||
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
|
||||||
|
RealD* dag_factor_p = &dag_factor[0];
|
||||||
|
|
||||||
|
if(dag) {
|
||||||
|
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
SE=st_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(a_v[ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
SE=st_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + coalescedRead(a_v[ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
|
||||||
|
const CoarseVector &in, CoarseVector &out, int dag) {
|
||||||
|
SimpleCompressor<siteVector> compressor;
|
||||||
|
|
||||||
|
st.HaloExchange(in,compressor);
|
||||||
|
autoView( in_v, in, AcceleratorRead);
|
||||||
|
autoView( out_v, out, AcceleratorWrite);
|
||||||
|
autoView( st_v , st, AcceleratorRead);
|
||||||
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
|
// determine in what order we need the points
|
||||||
|
int npoint = geom.npoint-1;
|
||||||
|
Vector<int> points(npoint, 0);
|
||||||
|
for(int p=0; p<npoint; p++)
|
||||||
|
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
||||||
|
|
||||||
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
|
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
|
||||||
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
|
const int Nsimd = CComplex::Nsimd();
|
||||||
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
|
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
|
||||||
|
|
||||||
|
RealD* dag_factor_p = &dag_factor[0];
|
||||||
|
|
||||||
|
if(dag) {
|
||||||
|
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
for(int p=0;p<npoint;p++){
|
||||||
|
int point = points[p];
|
||||||
|
SE=st_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
|
||||||
|
int ss = sss/nbasis;
|
||||||
|
int b = sss%nbasis;
|
||||||
|
calcComplex res = Zero();
|
||||||
|
calcVector nbr;
|
||||||
|
int ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
for(int p=0;p<npoint;p++){
|
||||||
|
int point = points[p];
|
||||||
|
SE=st_v.GetEntry(ptype,point,ss);
|
||||||
|
|
||||||
|
if(SE->_is_local) {
|
||||||
|
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
|
||||||
|
} else {
|
||||||
|
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
|
||||||
|
}
|
||||||
|
acceleratorSynchronise();
|
||||||
|
|
||||||
|
for(int bb=0;bb<nbasis;bb++) {
|
||||||
|
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
coalescedWrite(out_v[ss](b),res);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
|
}
|
||||||
|
|
||||||
|
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
||||||
_grid(&CoarseGrid),
|
_grid(&CoarseGrid),
|
||||||
|
_cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
|
||||||
geom(CoarseGrid._ndimension),
|
geom(CoarseGrid._ndimension),
|
||||||
hermitian(hermitian_),
|
hermitian(hermitian_),
|
||||||
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
||||||
A(geom.npoint,&CoarseGrid)
|
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
||||||
|
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
|
||||||
|
A(geom.npoint,&CoarseGrid),
|
||||||
|
Aeven(geom.npoint,_cbgrid),
|
||||||
|
Aodd(geom.npoint,_cbgrid),
|
||||||
|
AselfInv(&CoarseGrid),
|
||||||
|
AselfInvEven(_cbgrid),
|
||||||
|
AselfInvOdd(_cbgrid),
|
||||||
|
dag_factor(nbasis*nbasis)
|
||||||
{
|
{
|
||||||
|
fillFactor();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) :
|
||||||
|
|
||||||
|
_grid(&CoarseGrid),
|
||||||
|
_cbgrid(&CoarseRBGrid),
|
||||||
|
geom(CoarseGrid._ndimension),
|
||||||
|
hermitian(hermitian_),
|
||||||
|
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
||||||
|
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
|
||||||
|
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
|
||||||
|
A(geom.npoint,&CoarseGrid),
|
||||||
|
Aeven(geom.npoint,&CoarseRBGrid),
|
||||||
|
Aodd(geom.npoint,&CoarseRBGrid),
|
||||||
|
AselfInv(&CoarseGrid),
|
||||||
|
AselfInvEven(&CoarseRBGrid),
|
||||||
|
AselfInvOdd(&CoarseRBGrid),
|
||||||
|
dag_factor(nbasis*nbasis)
|
||||||
|
{
|
||||||
|
fillFactor();
|
||||||
|
};
|
||||||
|
|
||||||
|
void fillFactor() {
|
||||||
|
Eigen::MatrixXd dag_factor_eigen = Eigen::MatrixXd::Ones(nbasis, nbasis);
|
||||||
|
if(!hermitian) {
|
||||||
|
const int nb = nbasis/2;
|
||||||
|
dag_factor_eigen.block(0,nb,nb,nb) *= -1.0;
|
||||||
|
dag_factor_eigen.block(nb,0,nb,nb) *= -1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// GPU readable prefactor
|
||||||
|
thread_for(i, nbasis*nbasis, {
|
||||||
|
int j = i/nbasis;
|
||||||
|
int k = i%nbasis;
|
||||||
|
dag_factor[i] = dag_factor_eigen(j, k);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||||
Aggregation<Fobj,CComplex,nbasis> & Subspace)
|
Aggregation<Fobj,CComplex,nbasis> & Subspace)
|
||||||
{
|
{
|
||||||
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
|
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
|
||||||
typedef typename Fobj::scalar_type scalar_type;
|
typedef typename Fobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl;
|
||||||
|
|
||||||
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
|
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
|
||||||
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
|
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
|
||||||
|
|
||||||
@ -496,11 +868,13 @@ public:
|
|||||||
|
|
||||||
CoarseScalar InnerProd(Grid());
|
CoarseScalar InnerProd(Grid());
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl;
|
||||||
// Orthogonalise the subblocks over the basis
|
// Orthogonalise the subblocks over the basis
|
||||||
blockOrthogonalise(InnerProd,Subspace.subspace);
|
blockOrthogonalise(InnerProd,Subspace.subspace);
|
||||||
|
|
||||||
// Compute the matrix elements of linop between this orthonormal
|
// Compute the matrix elements of linop between this orthonormal
|
||||||
// set of vectors.
|
// set of vectors.
|
||||||
|
std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl;
|
||||||
int self_stencil=-1;
|
int self_stencil=-1;
|
||||||
for(int p=0;p<geom.npoint;p++)
|
for(int p=0;p<geom.npoint;p++)
|
||||||
{
|
{
|
||||||
@ -539,7 +913,7 @@ public:
|
|||||||
|
|
||||||
phi=Subspace.subspace[i];
|
phi=Subspace.subspace[i];
|
||||||
|
|
||||||
// std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
|
std::cout << GridLogMessage<< "CoarsenMatrix vector "<<i << std::endl;
|
||||||
linop.OpDirAll(phi,Mphi_p);
|
linop.OpDirAll(phi,Mphi_p);
|
||||||
linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
|
linop.OpDiag (phi,Mphi_p[geom.npoint-1]);
|
||||||
|
|
||||||
@ -568,6 +942,18 @@ public:
|
|||||||
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
||||||
|
|
||||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
||||||
|
if ( hermitian && (disp==-1) ) {
|
||||||
|
for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
|
||||||
|
int dirp = geom.directions[pp];
|
||||||
|
int dispp = geom.displacements[pp];
|
||||||
|
if ( (dirp==dir) && (dispp==1) ){
|
||||||
|
auto sft = conjugate(Cshift(oZProj,dir,1));
|
||||||
|
autoView( sft_v , sft , AcceleratorWrite);
|
||||||
|
autoView( A_pp , A[pp], AcceleratorWrite);
|
||||||
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_pp[ss](i,j),sft_v(ss)); });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -606,28 +992,54 @@ public:
|
|||||||
}
|
}
|
||||||
if(hermitian) {
|
if(hermitian) {
|
||||||
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
|
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl;
|
||||||
ForceHermitian();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
|
||||||
|
FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ForceHermitian(void) {
|
void InvertSelfStencilLink() {
|
||||||
CoarseMatrix Diff (Grid());
|
std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
|
||||||
for(int p=0;p<geom.npoint;p++){
|
int localVolume = Grid()->lSites();
|
||||||
int dir = geom.directions[p];
|
|
||||||
int disp = geom.displacements[p];
|
typedef typename Cobj::scalar_object scalar_object;
|
||||||
if(disp==-1) {
|
|
||||||
// Find the opposite link
|
autoView(Aself_v, A[geom.npoint-1], CpuRead);
|
||||||
for(int pp=0;pp<geom.npoint;pp++){
|
autoView(AselfInv_v, AselfInv, CpuWrite);
|
||||||
int dirp = geom.directions[pp];
|
thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke
|
||||||
int dispp = geom.displacements[pp];
|
Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
|
||||||
if ( (dirp==dir) && (dispp==1) ){
|
Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
|
||||||
// Diff = adj(Cshift(A[p],dir,1)) - A[pp];
|
|
||||||
// std::cout << GridLogMessage<<" Replacing stencil leg "<<pp<<" with leg "<<p<< " diff "<<norm2(Diff) <<std::endl;
|
scalar_object selfLink = Zero();
|
||||||
A[pp] = adj(Cshift(A[p],dir,1));
|
scalar_object selfLinkInv = Zero();
|
||||||
}
|
|
||||||
}
|
Coordinate lcoor;
|
||||||
}
|
|
||||||
|
Grid()->LocalIndexToLocalCoor(site, lcoor);
|
||||||
|
peekLocalSite(selfLink, Aself_v, lcoor);
|
||||||
|
|
||||||
|
for (int i = 0; i < nbasis; ++i)
|
||||||
|
for (int j = 0; j < nbasis; ++j)
|
||||||
|
selfLinkEigen(i, j) = static_cast<ComplexD>(TensorRemove(selfLink(i, j)));
|
||||||
|
|
||||||
|
selfLinkInvEigen = selfLinkEigen.inverse();
|
||||||
|
|
||||||
|
for(int i = 0; i < nbasis; ++i)
|
||||||
|
for(int j = 0; j < nbasis; ++j)
|
||||||
|
selfLinkInv(i, j) = selfLinkInvEigen(i, j);
|
||||||
|
|
||||||
|
pokeLocalSite(selfLinkInv, AselfInv_v, lcoor);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void FillHalfCbs() {
|
||||||
|
std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl;
|
||||||
|
for(int p = 0; p < geom.npoint; ++p) {
|
||||||
|
pickCheckerboard(Even, Aeven[p], A[p]);
|
||||||
|
pickCheckerboard(Odd, Aodd[p], A[p]);
|
||||||
}
|
}
|
||||||
|
pickCheckerboard(Even, AselfInvEven, AselfInv);
|
||||||
|
pickCheckerboard(Odd, AselfInvOdd, AselfInv);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,67 +0,0 @@
|
|||||||
#include <Grid/GridCore.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
MemoryStats *MemoryProfiler::stats = nullptr;
|
|
||||||
bool MemoryProfiler::debug = false;
|
|
||||||
|
|
||||||
void check_huge_pages(void *Buf,uint64_t BYTES)
|
|
||||||
{
|
|
||||||
#ifdef __linux__
|
|
||||||
int fd = open("/proc/self/pagemap", O_RDONLY);
|
|
||||||
assert(fd >= 0);
|
|
||||||
const int page_size = 4096;
|
|
||||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
|
||||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
|
||||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
|
||||||
uint64_t pagedata[npages];
|
|
||||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
|
||||||
assert(ret == offset);
|
|
||||||
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
|
||||||
assert(ret == sizeof(uint64_t) * npages);
|
|
||||||
int nhugepages = npages / 512;
|
|
||||||
int n4ktotal, nnothuge;
|
|
||||||
n4ktotal = 0;
|
|
||||||
nnothuge = 0;
|
|
||||||
for (int i = 0; i < nhugepages; ++i) {
|
|
||||||
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
|
|
||||||
for (int j = 0; j < 512; ++j) {
|
|
||||||
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
|
|
||||||
++n4ktotal;
|
|
||||||
if (pageaddr != baseaddr + j * page_size)
|
|
||||||
++nnothuge;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
int rank = CartesianCommunicator::RankWorld();
|
|
||||||
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string sizeString(const size_t bytes)
|
|
||||||
{
|
|
||||||
constexpr unsigned int bufSize = 256;
|
|
||||||
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
|
|
||||||
char buf[256];
|
|
||||||
size_t s = 0;
|
|
||||||
double count = bytes;
|
|
||||||
|
|
||||||
while (count >= 1024 && s < 7)
|
|
||||||
{
|
|
||||||
s++;
|
|
||||||
count /= 1024;
|
|
||||||
}
|
|
||||||
if (count - floor(count) == 0.0)
|
|
||||||
{
|
|
||||||
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
|
|
||||||
}
|
|
||||||
|
|
||||||
return std::string(buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
||||||
|
|
@ -65,8 +65,7 @@ public:
|
|||||||
MemoryManager::CpuFree((void *)__p,bytes);
|
MemoryManager::CpuFree((void *)__p,bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: hack for the copy constructor, eventually it must be avoided
|
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||||
//void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
|
||||||
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
||||||
void construct(pointer __p) { };
|
void construct(pointer __p) { };
|
||||||
void destroy(pointer __p) { };
|
void destroy(pointer __p) { };
|
||||||
@ -74,6 +73,9 @@ public:
|
|||||||
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
|
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
|
||||||
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
|
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Unified virtual memory
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<typename _Tp>
|
template<typename _Tp>
|
||||||
class uvmAllocator {
|
class uvmAllocator {
|
||||||
public:
|
public:
|
||||||
@ -109,22 +111,72 @@ public:
|
|||||||
MemoryManager::SharedFree((void *)__p,bytes);
|
MemoryManager::SharedFree((void *)__p,bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: hack for the copy constructor, eventually it must be avoided
|
|
||||||
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
|
||||||
//void construct(pointer __p, const _Tp& __val) { };
|
|
||||||
void construct(pointer __p) { };
|
void construct(pointer __p) { };
|
||||||
void destroy(pointer __p) { };
|
void destroy(pointer __p) { };
|
||||||
};
|
};
|
||||||
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
|
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
|
||||||
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
|
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Device memory
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<typename _Tp>
|
||||||
|
class devAllocator {
|
||||||
|
public:
|
||||||
|
typedef std::size_t size_type;
|
||||||
|
typedef std::ptrdiff_t difference_type;
|
||||||
|
typedef _Tp* pointer;
|
||||||
|
typedef const _Tp* const_pointer;
|
||||||
|
typedef _Tp& reference;
|
||||||
|
typedef const _Tp& const_reference;
|
||||||
|
typedef _Tp value_type;
|
||||||
|
|
||||||
|
template<typename _Tp1> struct rebind { typedef devAllocator<_Tp1> other; };
|
||||||
|
devAllocator() throw() { }
|
||||||
|
devAllocator(const devAllocator&) throw() { }
|
||||||
|
template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
|
||||||
|
~devAllocator() throw() { }
|
||||||
|
pointer address(reference __x) const { return &__x; }
|
||||||
|
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
|
||||||
|
|
||||||
|
pointer allocate(size_type __n, const void* _p= 0)
|
||||||
|
{
|
||||||
|
size_type bytes = __n*sizeof(_Tp);
|
||||||
|
profilerAllocate(bytes);
|
||||||
|
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
|
||||||
|
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
void deallocate(pointer __p, size_type __n)
|
||||||
|
{
|
||||||
|
size_type bytes = __n * sizeof(_Tp);
|
||||||
|
profilerFree(bytes);
|
||||||
|
MemoryManager::AcceleratorFree((void *)__p,bytes);
|
||||||
|
}
|
||||||
|
void construct(pointer __p, const _Tp& __val) { };
|
||||||
|
void construct(pointer __p) { };
|
||||||
|
void destroy(pointer __p) { };
|
||||||
|
};
|
||||||
|
template<typename _Tp> inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
|
||||||
|
template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Template typedefs
|
// Template typedefs
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class T> using commAllocator = uvmAllocator<T>;
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
// Cshift on device
|
||||||
template<class T> using commVector = std::vector<T,uvmAllocator<T> >;
|
template<class T> using cshiftAllocator = devAllocator<T>;
|
||||||
//template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
|
#else
|
||||||
|
// Cshift on host
|
||||||
|
template<class T> using cshiftAllocator = std::allocator<T>;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
||||||
|
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
|
||||||
|
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
||||||
|
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ void MemoryManager::PrintBytes(void)
|
|||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
|
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
|
||||||
int MemoryManager::Victim[MemoryManager::NallocType];
|
int MemoryManager::Victim[MemoryManager::NallocType];
|
||||||
int MemoryManager::Ncache[MemoryManager::NallocType];
|
int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
// Actual allocation and deallocation utils
|
// Actual allocation and deallocation utils
|
||||||
@ -35,8 +35,6 @@ void *MemoryManager::AcceleratorAllocate(size_t bytes)
|
|||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocDevice(bytes);
|
ptr = (void *) acceleratorAllocDevice(bytes);
|
||||||
total_device+=bytes;
|
total_device+=bytes;
|
||||||
// std::cout <<"AcceleratorAllocate: allocated Accelerator pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
|
||||||
// PrintBytes();
|
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -69,14 +67,32 @@ void MemoryManager::SharedFree (void *ptr,size_t bytes)
|
|||||||
// PrintBytes();
|
// PrintBytes();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_UVM
|
||||||
|
void *MemoryManager::CpuAllocate(size_t bytes)
|
||||||
|
{
|
||||||
|
void *ptr = (void *) Lookup(bytes,Cpu);
|
||||||
|
if ( ptr == (void *) NULL ) {
|
||||||
|
ptr = (void *) acceleratorAllocShared(bytes);
|
||||||
|
total_host+=bytes;
|
||||||
|
}
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
||||||
|
{
|
||||||
|
NotifyDeletion(_ptr);
|
||||||
|
void *__freeme = Insert(_ptr,bytes,Cpu);
|
||||||
|
if ( __freeme ) {
|
||||||
|
acceleratorFreeShared(__freeme);
|
||||||
|
total_host-=bytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
void *MemoryManager::CpuAllocate(size_t bytes)
|
void *MemoryManager::CpuAllocate(size_t bytes)
|
||||||
{
|
{
|
||||||
void *ptr = (void *) Lookup(bytes,Cpu);
|
void *ptr = (void *) Lookup(bytes,Cpu);
|
||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocCpu(bytes);
|
ptr = (void *) acceleratorAllocCpu(bytes);
|
||||||
total_host+=bytes;
|
total_host+=bytes;
|
||||||
// std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
|
||||||
// PrintBytes();
|
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -87,20 +103,15 @@ void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
|||||||
if ( __freeme ) {
|
if ( __freeme ) {
|
||||||
acceleratorFreeCpu(__freeme);
|
acceleratorFreeCpu(__freeme);
|
||||||
total_host-=bytes;
|
total_host-=bytes;
|
||||||
// PrintBytes();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
// call only once
|
// call only once
|
||||||
//////////////////////////////////////////
|
//////////////////////////////////////////
|
||||||
void MemoryManager::Init(void)
|
void MemoryManager::Init(void)
|
||||||
{
|
{
|
||||||
Ncache[Cpu] = 8;
|
|
||||||
Ncache[Acc] = 8;
|
|
||||||
Ncache[Shared] = 8;
|
|
||||||
Ncache[CpuSmall] = 32;
|
|
||||||
Ncache[AccSmall] = 32;
|
|
||||||
Ncache[SharedSmall] = 32;
|
|
||||||
|
|
||||||
char * str;
|
char * str;
|
||||||
int Nc;
|
int Nc;
|
||||||
@ -125,6 +136,15 @@ void MemoryManager::Init(void)
|
|||||||
Ncache[SharedSmall]=Nc;
|
Ncache[SharedSmall]=Nc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void MemoryManager::InitMessage(void) {
|
||||||
|
|
||||||
|
#ifndef GRID_UVM
|
||||||
|
std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
|
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
|
||||||
#ifdef ALLOCATION_CACHE
|
#ifdef ALLOCATION_CACHE
|
||||||
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
|
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
|
||||||
@ -153,6 +173,7 @@ void MemoryManager::Init(void)
|
|||||||
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
|
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
|
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
|
||||||
|
@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
// Move control to configure.ac and Config.h?
|
// Move control to configure.ac and Config.h?
|
||||||
|
|
||||||
#define ALLOCATION_CACHE
|
|
||||||
#define GRID_ALLOC_ALIGN (2*1024*1024)
|
|
||||||
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
||||||
|
|
||||||
/*Pinning pages is costly*/
|
/*Pinning pages is costly*/
|
||||||
@ -93,11 +91,12 @@ private:
|
|||||||
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
|
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
|
||||||
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
|
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
|
||||||
|
|
||||||
static void *AcceleratorAllocate(size_t bytes);
|
|
||||||
static void AcceleratorFree (void *ptr,size_t bytes);
|
|
||||||
static void PrintBytes(void);
|
static void PrintBytes(void);
|
||||||
public:
|
public:
|
||||||
static void Init(void);
|
static void Init(void);
|
||||||
|
static void InitMessage(void);
|
||||||
|
static void *AcceleratorAllocate(size_t bytes);
|
||||||
|
static void AcceleratorFree (void *ptr,size_t bytes);
|
||||||
static void *SharedAllocate(size_t bytes);
|
static void *SharedAllocate(size_t bytes);
|
||||||
static void SharedFree (void *ptr,size_t bytes);
|
static void SharedFree (void *ptr,size_t bytes);
|
||||||
static void *CpuAllocate(size_t bytes);
|
static void *CpuAllocate(size_t bytes);
|
||||||
|
@ -1,11 +1,12 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
#ifndef GRID_UVM
|
#ifndef GRID_UVM
|
||||||
|
|
||||||
#warning "Using explicit device memory copies"
|
#warning "Using explicit device memory copies"
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
//define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
|
||||||
#define dprintf(...)
|
#define dprintf(...)
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// For caching copies of data on device
|
// For caching copies of data on device
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
@ -103,7 +104,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
///////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////
|
||||||
assert(AccCache.state!=Empty);
|
assert(AccCache.state!=Empty);
|
||||||
|
|
||||||
// dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||||
assert(AccCache.accLock==0);
|
assert(AccCache.accLock==0);
|
||||||
assert(AccCache.cpuLock==0);
|
assert(AccCache.cpuLock==0);
|
||||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||||
@ -111,7 +112,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
|||||||
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
||||||
DeviceBytes -=AccCache.bytes;
|
DeviceBytes -=AccCache.bytes;
|
||||||
LRUremove(AccCache);
|
LRUremove(AccCache);
|
||||||
// dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||||
}
|
}
|
||||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||||
EntryErase(CpuPtr);
|
EntryErase(CpuPtr);
|
||||||
@ -125,7 +126,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
assert(AccCache.state!=Empty);
|
assert(AccCache.state!=Empty);
|
||||||
|
|
||||||
// dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||||
assert(AccCache.accLock==0);
|
assert(AccCache.accLock==0);
|
||||||
assert(AccCache.cpuLock==0);
|
assert(AccCache.cpuLock==0);
|
||||||
if(AccCache.state==AccDirty) {
|
if(AccCache.state==AccDirty) {
|
||||||
@ -136,7 +137,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
|||||||
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
||||||
DeviceBytes -=AccCache.bytes;
|
DeviceBytes -=AccCache.bytes;
|
||||||
LRUremove(AccCache);
|
LRUremove(AccCache);
|
||||||
// dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||||
}
|
}
|
||||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||||
EntryErase(CpuPtr);
|
EntryErase(CpuPtr);
|
||||||
@ -149,7 +150,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
|||||||
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
||||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||||
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
||||||
// dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||||
DeviceToHostBytes+=AccCache.bytes;
|
DeviceToHostBytes+=AccCache.bytes;
|
||||||
DeviceToHostXfer++;
|
DeviceToHostXfer++;
|
||||||
AccCache.state=Consistent;
|
AccCache.state=Consistent;
|
||||||
@ -164,7 +165,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
|
|||||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||||
DeviceBytes+=AccCache.bytes;
|
DeviceBytes+=AccCache.bytes;
|
||||||
}
|
}
|
||||||
// dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||||
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
||||||
HostToDeviceBytes+=AccCache.bytes;
|
HostToDeviceBytes+=AccCache.bytes;
|
||||||
HostToDeviceXfer++;
|
HostToDeviceXfer++;
|
||||||
@ -227,18 +228,24 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
// Find if present, otherwise get or force an empty
|
// Find if present, otherwise get or force an empty
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
if ( EntryPresent(CpuPtr)==0 ){
|
if ( EntryPresent(CpuPtr)==0 ){
|
||||||
EvictVictims(bytes);
|
|
||||||
EntryCreate(CpuPtr,bytes,mode,hint);
|
EntryCreate(CpuPtr,bytes,mode,hint);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
auto & AccCache = AccCacheIterator->second;
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
if (!AccCache.AccPtr) {
|
||||||
|
EvictVictims(bytes);
|
||||||
|
}
|
||||||
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
|
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
|
||||||
|
|
||||||
assert(AccCache.cpuLock==0); // Programming error
|
assert(AccCache.cpuLock==0); // Programming error
|
||||||
|
|
||||||
if(AccCache.state!=Empty) {
|
if(AccCache.state!=Empty) {
|
||||||
|
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
|
||||||
|
(uint64_t)AccCache.CpuPtr,
|
||||||
|
(uint64_t)CpuPtr,
|
||||||
|
(uint64_t)AccCache.bytes,
|
||||||
|
(uint64_t)bytes);
|
||||||
assert(AccCache.CpuPtr == CpuPtr);
|
assert(AccCache.CpuPtr == CpuPtr);
|
||||||
assert(AccCache.bytes ==bytes);
|
assert(AccCache.bytes ==bytes);
|
||||||
}
|
}
|
||||||
@ -285,21 +292,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
|||||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||||
}
|
}
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
// printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
|
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
|
||||||
} else if(AccCache.state==Consistent) {
|
} else if(AccCache.state==Consistent) {
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||||
else
|
else
|
||||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
// printf("Consistent entry into device accLock %d\n",AccCache.accLock);
|
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
|
||||||
} else if(AccCache.state==AccDirty) {
|
} else if(AccCache.state==AccDirty) {
|
||||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||||
else
|
else
|
||||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||||
AccCache.accLock++;
|
AccCache.accLock++;
|
||||||
// printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
|
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
|
||||||
} else {
|
} else {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
@ -361,13 +368,16 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
|
|||||||
// Find if present, otherwise get or force an empty
|
// Find if present, otherwise get or force an empty
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
if ( EntryPresent(CpuPtr)==0 ){
|
if ( EntryPresent(CpuPtr)==0 ){
|
||||||
EvictVictims(bytes);
|
|
||||||
EntryCreate(CpuPtr,bytes,mode,transient);
|
EntryCreate(CpuPtr,bytes,mode,transient);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
auto & AccCache = AccCacheIterator->second;
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
|
||||||
|
if (!AccCache.AccPtr) {
|
||||||
|
EvictVictims(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
assert((mode==CpuRead)||(mode==CpuWrite));
|
assert((mode==CpuRead)||(mode==CpuWrite));
|
||||||
assert(AccCache.accLock==0); // Programming error
|
assert(AccCache.accLock==0); // Programming error
|
||||||
|
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
#ifdef GRID_UVM
|
#ifdef GRID_UVM
|
||||||
|
|
||||||
#warning "Grid is assuming unified virtual memory address space"
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
// View management is 1:1 address space mapping
|
// View management is 1:1 address space mapping
|
||||||
|
@ -138,21 +138,6 @@ public:
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes);
|
int bytes);
|
||||||
|
|
||||||
void SendRecvPacket(void *xmit,
|
|
||||||
void *recv,
|
|
||||||
int xmit_to_rank,
|
|
||||||
int recv_from_rank,
|
|
||||||
int bytes);
|
|
||||||
|
|
||||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
||||||
void *xmit,
|
|
||||||
int xmit_to_rank,
|
|
||||||
void *recv,
|
|
||||||
int recv_from_rank,
|
|
||||||
int bytes);
|
|
||||||
|
|
||||||
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
|
|
||||||
|
|
||||||
double StencilSendToRecvFrom(void *xmit,
|
double StencilSendToRecvFrom(void *xmit,
|
||||||
int xmit_to_rank,
|
int xmit_to_rank,
|
||||||
void *recv,
|
void *recv,
|
||||||
|
@ -43,8 +43,16 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
|
|||||||
|
|
||||||
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
MPI_Initialized(&flag); // needed to coexist with other libs apparently
|
||||||
if ( !flag ) {
|
if ( !flag ) {
|
||||||
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
|
||||||
|
|
||||||
|
#ifndef GRID_COMMS_THREADS
|
||||||
|
nCommThreads=1;
|
||||||
|
// wrong results here too
|
||||||
|
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
|
||||||
|
// other comms schemes are ok
|
||||||
|
MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
|
||||||
|
#else
|
||||||
|
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
|
||||||
|
#endif
|
||||||
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
|
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
|
||||||
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
|
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -294,60 +302,28 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
int bytes)
|
int bytes)
|
||||||
{
|
{
|
||||||
std::vector<CommsRequest_t> reqs(0);
|
std::vector<CommsRequest_t> reqs(0);
|
||||||
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||||
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
|
||||||
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
|
|
||||||
SendToRecvFromComplete(reqs);
|
|
||||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
|
||||||
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
|
||||||
void *recv,
|
|
||||||
int sender,
|
|
||||||
int receiver,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
MPI_Status stat;
|
|
||||||
assert(sender != receiver);
|
|
||||||
int tag = sender;
|
|
||||||
if ( _processor == sender ) {
|
|
||||||
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
|
|
||||||
}
|
|
||||||
if ( _processor == receiver ) {
|
|
||||||
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Basic Halo comms primitive
|
|
||||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
||||||
void *xmit,
|
|
||||||
int dest,
|
|
||||||
void *recv,
|
|
||||||
int from,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
int myrank = _processor;
|
int myrank = _processor;
|
||||||
int ierr;
|
int ierr;
|
||||||
|
|
||||||
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
|
// Enforce no UVM in comms, device or host OK
|
||||||
MPI_Request xrq;
|
assert(acceleratorIsCommunicable(xmit));
|
||||||
MPI_Request rrq;
|
assert(acceleratorIsCommunicable(recv));
|
||||||
|
|
||||||
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
|
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
||||||
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
|
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
|
||||||
|
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
||||||
|
recv,bytes,MPI_CHAR,from, from,
|
||||||
|
communicator,MPI_STATUS_IGNORE);
|
||||||
|
assert(ierr==0);
|
||||||
|
|
||||||
assert(ierr==0);
|
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||||
list.push_back(xrq);
|
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||||
list.push_back(rrq);
|
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
||||||
} else {
|
|
||||||
// Give the CPU to MPI immediately; can use threads to overlap optionally
|
|
||||||
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
|
|
||||||
recv,bytes,MPI_CHAR,from, from,
|
|
||||||
communicator,MPI_STATUS_IGNORE);
|
|
||||||
assert(ierr==0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
// Basic Halo comms primitive
|
||||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||||
int dest,
|
int dest,
|
||||||
void *recv,
|
void *recv,
|
||||||
@ -382,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
assert(from != _processor);
|
assert(from != _processor);
|
||||||
assert(gme == ShmRank);
|
assert(gme == ShmRank);
|
||||||
double off_node_bytes=0.0;
|
double off_node_bytes=0.0;
|
||||||
|
int tag;
|
||||||
|
|
||||||
if ( gfrom ==MPI_UNDEFINED) {
|
if ( gfrom ==MPI_UNDEFINED) {
|
||||||
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
|
tag= dir+from*32;
|
||||||
|
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.push_back(rrq);
|
list.push_back(rrq);
|
||||||
off_node_bytes+=bytes;
|
off_node_bytes+=bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( gdest == MPI_UNDEFINED ) {
|
if ( gdest == MPI_UNDEFINED ) {
|
||||||
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
|
tag= dir+_processor*32;
|
||||||
|
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.push_back(xrq);
|
list.push_back(xrq);
|
||||||
off_node_bytes+=bytes;
|
off_node_bytes+=bytes;
|
||||||
@ -403,15 +382,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
|
|
||||||
return off_node_bytes;
|
return off_node_bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||||
{
|
|
||||||
SendToRecvFromComplete(waitall);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::StencilBarrier(void)
|
|
||||||
{
|
|
||||||
MPI_Barrier (ShmComm);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
|
||||||
{
|
{
|
||||||
int nreq=list.size();
|
int nreq=list.size();
|
||||||
|
|
||||||
@ -422,6 +393,13 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
|
|||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.resize(0);
|
list.resize(0);
|
||||||
}
|
}
|
||||||
|
void CartesianCommunicator::StencilBarrier(void)
|
||||||
|
{
|
||||||
|
MPI_Barrier (ShmComm);
|
||||||
|
}
|
||||||
|
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||||
|
//{
|
||||||
|
//}
|
||||||
void CartesianCommunicator::Barrier(void)
|
void CartesianCommunicator::Barrier(void)
|
||||||
{
|
{
|
||||||
int ierr = MPI_Barrier(communicator);
|
int ierr = MPI_Barrier(communicator);
|
||||||
@ -483,5 +461,3 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
|
|||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
@ -77,15 +77,6 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
|
|||||||
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
void CartesianCommunicator::GlobalXOR(uint32_t &){}
|
||||||
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
void CartesianCommunicator::GlobalXOR(uint64_t &){}
|
||||||
|
|
||||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
|
|
||||||
void *recv,
|
|
||||||
int xmit_to_rank,
|
|
||||||
int recv_from_rank,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Basic Halo comms primitive -- should never call in single node
|
// Basic Halo comms primitive -- should never call in single node
|
||||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||||
@ -96,20 +87,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
|||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
|
||||||
void *xmit,
|
|
||||||
int dest,
|
|
||||||
void *recv,
|
|
||||||
int from,
|
|
||||||
int bytes)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
|
||||||
{
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
|
||||||
{
|
{
|
||||||
bcopy(in,out,bytes*words);
|
bcopy(in,out,bytes*words);
|
||||||
@ -137,10 +114,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes, int dir)
|
int bytes, int dir)
|
||||||
{
|
{
|
||||||
std::vector<CommsRequest_t> list;
|
|
||||||
// Discard the "dir"
|
|
||||||
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
|
||||||
SendToRecvFromComplete(list);
|
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||||
@ -150,13 +123,10 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
|||||||
int recv_from_rank,
|
int recv_from_rank,
|
||||||
int bytes, int dir)
|
int bytes, int dir)
|
||||||
{
|
{
|
||||||
// Discard the "dir"
|
|
||||||
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
|
|
||||||
return 2.0*bytes;
|
return 2.0*bytes;
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||||
{
|
{
|
||||||
SendToRecvFromComplete(waitall);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CartesianCommunicator::StencilBarrier(void){};
|
void CartesianCommunicator::StencilBarrier(void){};
|
||||||
|
@ -102,7 +102,7 @@ public:
|
|||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
||||||
static void SharedMemoryFree(void);
|
static void SharedMemoryFree(void);
|
||||||
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
|
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||||
static void SharedMemoryZero(void *dest,size_t bytes);
|
static void SharedMemoryZero(void *dest,size_t bytes);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -32,6 +32,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
#include <cuda_runtime_api.h>
|
#include <cuda_runtime_api.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
#include <hip/hip_runtime_api.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
#define header "SharedMemoryMpi: "
|
#define header "SharedMemoryMpi: "
|
||||||
@ -47,7 +50,12 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
|||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
// Split into groups that can share memory
|
// Split into groups that can share memory
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
|
#ifndef GRID_MPI3_SHM_NONE
|
||||||
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
|
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
|
||||||
|
#else
|
||||||
|
MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
|
||||||
|
#endif
|
||||||
|
|
||||||
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
|
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
|
||||||
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
|
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
|
||||||
|
|
||||||
@ -420,7 +428,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbfs mapping intended
|
// Hugetlbfs mapping intended
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifdef GRID_CUDA
|
#if defined(GRID_CUDA) ||defined(GRID_HIP)
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
{
|
{
|
||||||
void * ShmCommBuf ;
|
void * ShmCommBuf ;
|
||||||
@ -443,17 +451,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
auto err = cudaMalloc(&ShmCommBuf, bytes);
|
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||||
if ( err != cudaSuccess) {
|
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
if (ShmCommBuf == (void *)NULL ) {
|
if (ShmCommBuf == (void *)NULL ) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
if ( WorldRank == 0 ){
|
// if ( WorldRank == 0 ){
|
||||||
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
if ( 1 ){
|
||||||
|
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||||
|
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||||
}
|
}
|
||||||
SharedMemoryZero(ShmCommBuf,bytes);
|
SharedMemoryZero(ShmCommBuf,bytes);
|
||||||
|
|
||||||
@ -462,18 +469,30 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
for(int r=0;r<WorldShmSize;r++){
|
for(int r=0;r<WorldShmSize;r++){
|
||||||
|
|
||||||
|
#ifndef GRID_MPI3_SHM_NONE
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// If it is me, pass around the IPC access key
|
// If it is me, pass around the IPC access key
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
|
#ifdef GRID_CUDA
|
||||||
cudaIpcMemHandle_t handle;
|
cudaIpcMemHandle_t handle;
|
||||||
|
|
||||||
if ( r==WorldShmRank ) {
|
if ( r==WorldShmRank ) {
|
||||||
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
|
||||||
if ( err != cudaSuccess) {
|
if ( err != cudaSuccess) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
hipIpcMemHandle_t handle;
|
||||||
|
if ( r==WorldShmRank ) {
|
||||||
|
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
|
||||||
|
if ( err != hipSuccess) {
|
||||||
|
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// Share this IPC handle across the Shm Comm
|
// Share this IPC handle across the Shm Comm
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
@ -490,17 +509,31 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// If I am not the source, overwrite thisBuf with remote buffer
|
// If I am not the source, overwrite thisBuf with remote buffer
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void * thisBuf = ShmCommBuf;
|
void * thisBuf = ShmCommBuf;
|
||||||
|
#ifdef GRID_CUDA
|
||||||
if ( r!=WorldShmRank ) {
|
if ( r!=WorldShmRank ) {
|
||||||
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
||||||
if ( err != cudaSuccess) {
|
if ( err != cudaSuccess) {
|
||||||
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
if ( r!=WorldShmRank ) {
|
||||||
|
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
|
||||||
|
if ( err != hipSuccess) {
|
||||||
|
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Save a copy of the device buffers
|
// Save a copy of the device buffers
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
WorldShmCommBufs[r] = thisBuf;
|
WorldShmCommBufs[r] = thisBuf;
|
||||||
|
#else
|
||||||
|
WorldShmCommBufs[r] = ShmCommBuf;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
_ShmAllocBytes=bytes;
|
_ShmAllocBytes=bytes;
|
||||||
@ -633,7 +666,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
#endif
|
#endif
|
||||||
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
|
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
|
||||||
|
|
||||||
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
|
|
||||||
if ( ptr == (void * )MAP_FAILED ) {
|
if ( ptr == (void * )MAP_FAILED ) {
|
||||||
perror("failed mmap");
|
perror("failed mmap");
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -683,7 +715,7 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
|||||||
bzero(dest,bytes);
|
bzero(dest,bytes);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
|
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||||
{
|
{
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
||||||
@ -705,7 +737,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
|||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
// Split into groups that can share memory
|
// Split into groups that can share memory
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
|
#ifndef GRID_MPI3_SHM_NONE
|
||||||
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
|
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
|
||||||
|
#else
|
||||||
|
MPI_Comm_split(comm, rank, 0, &ShmComm);
|
||||||
|
#endif
|
||||||
MPI_Comm_rank(ShmComm ,&ShmRank);
|
MPI_Comm_rank(ShmComm ,&ShmRank);
|
||||||
MPI_Comm_size(ShmComm ,&ShmSize);
|
MPI_Comm_size(ShmComm ,&ShmSize);
|
||||||
ShmCommBufs.resize(ShmSize);
|
ShmCommBufs.resize(ShmSize);
|
||||||
@ -735,19 +771,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
|||||||
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
|
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
|
||||||
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
|
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
|
||||||
|
|
||||||
#ifdef GRID_IBM_SUMMIT
|
#ifdef GRID_SHM_FORCE_MPI
|
||||||
// Hide the shared memory path between sockets
|
// Hide the shared memory path between ranks
|
||||||
// if even number of nodes
|
{
|
||||||
if ( (ShmSize & 0x1)==0 ) {
|
|
||||||
int SocketSize = ShmSize/2;
|
|
||||||
int mySocket = ShmRank/SocketSize;
|
|
||||||
for(int r=0;r<size;r++){
|
for(int r=0;r<size;r++){
|
||||||
int hisRank=ShmRanks[r];
|
if ( r!=rank ) {
|
||||||
if ( hisRank!= MPI_UNDEFINED ) {
|
ShmRanks[r] = MPI_UNDEFINED;
|
||||||
int hisSocket=hisRank/SocketSize;
|
|
||||||
if ( hisSocket != mySocket ) {
|
|
||||||
ShmRanks[r] = MPI_UNDEFINED;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -29,6 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
#define header "SharedMemoryNone: "
|
||||||
|
|
||||||
/*Construct from an MPI communicator*/
|
/*Construct from an MPI communicator*/
|
||||||
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||||
@ -55,6 +56,38 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbfs mapping intended, use anonymous mmap
|
// Hugetlbfs mapping intended, use anonymous mmap
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#if 1
|
||||||
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
|
{
|
||||||
|
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
|
||||||
|
void * ShmCommBuf ;
|
||||||
|
assert(_ShmSetup==1);
|
||||||
|
assert(_ShmAlloc==0);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Each MPI rank should allocate our own buffer
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||||
|
|
||||||
|
if (ShmCommBuf == (void *)NULL ) {
|
||||||
|
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
if ( WorldRank == 0 ){
|
||||||
|
std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes
|
||||||
|
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||||
|
}
|
||||||
|
SharedMemoryZero(ShmCommBuf,bytes);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Loop over ranks/gpu's on our node
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
WorldShmCommBufs[0] = ShmCommBuf;
|
||||||
|
|
||||||
|
_ShmAllocBytes=bytes;
|
||||||
|
_ShmAlloc=1;
|
||||||
|
}
|
||||||
|
#else
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
{
|
{
|
||||||
void * ShmCommBuf ;
|
void * ShmCommBuf ;
|
||||||
@ -83,7 +116,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
_ShmAllocBytes=bytes;
|
_ShmAllocBytes=bytes;
|
||||||
_ShmAlloc=1;
|
_ShmAlloc=1;
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||||
|
{
|
||||||
|
acceleratorMemSet(dest,0,bytes);
|
||||||
|
}
|
||||||
|
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||||
|
{
|
||||||
|
acceleratorCopyToDevice(src,dest,bytes);
|
||||||
|
}
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
// Global shared functionality finished
|
// Global shared functionality finished
|
||||||
// Now move to per communicator functionality
|
// Now move to per communicator functionality
|
||||||
|
@ -52,23 +52,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<typename Op, typename T1>
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
|
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
|
||||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
|
|
||||||
{
|
|
||||||
return Cshift(closure(expr),dim,shift);
|
|
||||||
}
|
|
||||||
template <class Op, class T1, class T2>
|
|
||||||
auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
|
|
||||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
|
|
||||||
{
|
|
||||||
return Cshift(closure(expr),dim,shift);
|
|
||||||
}
|
|
||||||
template <class Op, class T1, class T2, class T3>
|
|
||||||
auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
|
|
||||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
|
||||||
eval(0, expr.arg2),
|
|
||||||
eval(0, expr.arg3)))>
|
|
||||||
{
|
{
|
||||||
return Cshift(closure(expr),dim,shift);
|
return Cshift(closure(expr),dim,shift);
|
||||||
}
|
}
|
||||||
|
@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
|
|||||||
// Gather for when there is no need to SIMD split
|
// Gather for when there is no need to SIMD split
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> void
|
template<class vobj> void
|
||||||
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = &Cshift_table[0];
|
auto table = &Cshift_table[0];
|
||||||
accelerator_for(i,ent,1,{
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
|
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||||
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
thread_for(i,ent,{
|
||||||
buffer_p[table[i].first]=rhs_v[table[i].second];
|
buffer_p[table[i].first]=rhs_v[table[i].second];
|
||||||
});
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
accelerator_for2d(n,e1,b,e2,1,{
|
||||||
int o = n*n1;
|
int o = n*n1;
|
||||||
@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
vobj temp =rhs_v[so+o+b];
|
vobj temp =rhs_v[so+o+b];
|
||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
});
|
});
|
||||||
} else {
|
#else
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
thread_for2d(n,e1,b,e2,{
|
||||||
|
int o = n*n1;
|
||||||
|
int offset = b+n*e2;
|
||||||
|
|
||||||
|
vobj temp =rhs_v[so+o+b];
|
||||||
|
extract<vobj>(temp,pointers,offset);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
Coordinate rdim=rhs.Grid()->_rdimensions;
|
Coordinate rdim=rhs.Grid()->_rdimensions;
|
||||||
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
||||||
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
accelerator_for2d(n,e1,b,e2,1,{
|
||||||
|
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -134,13 +152,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
thread_for2d(n,e1,b,e2,{
|
||||||
|
|
||||||
|
Coordinate coor;
|
||||||
|
|
||||||
|
int o=n*n1;
|
||||||
|
int oindex = o+b;
|
||||||
|
|
||||||
|
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
|
||||||
|
|
||||||
|
int ocb=1<<cb;
|
||||||
|
int offset = b+n*e2;
|
||||||
|
|
||||||
|
if ( ocb & cbmask ) {
|
||||||
|
vobj temp =rhs_v[so+o+b];
|
||||||
|
extract<vobj>(temp,pointers,offset);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Scatter for when there is no need to SIMD split
|
// Scatter for when there is no need to SIMD split
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
@ -182,12 +220,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
autoView( rhs_v, rhs, AcceleratorWrite);
|
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = &Cshift_table[0];
|
auto table = &Cshift_table[0];
|
||||||
accelerator_for(i,ent,1,{
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
rhs_v[table[i].first]=buffer_p[table[i].second];
|
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||||
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
|
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView( rhs_v, rhs, CpuWrite);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
rhs_v[table[i].first]=buffer_p[table[i].second];
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -208,18 +253,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
|
|
||||||
if(cbmask ==0x3 ) {
|
if(cbmask ==0x3 ) {
|
||||||
|
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
|
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView( rhs_v , rhs, AcceleratorWrite);
|
autoView( rhs_v , rhs, AcceleratorWrite);
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
accelerator_for2d(n,e1,b,e2,1,{
|
||||||
int o = n*rhs.Grid()->_slice_stride[dimension];
|
int o = n*_slice_stride;
|
||||||
int offset = b+n*rhs.Grid()->_slice_block[dimension];
|
int offset = b+n*_slice_block;
|
||||||
merge(rhs_v[so+o+b],pointers,offset);
|
merge(rhs_v[so+o+b],pointers,offset);
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView( rhs_v , rhs, CpuWrite);
|
||||||
|
thread_for2d(n,e1,b,e2,{
|
||||||
|
int o = n*_slice_stride;
|
||||||
|
int offset = b+n*_slice_block;
|
||||||
|
merge(rhs_v[so+o+b],pointers,offset);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||||
// Test_cshift_red_black code.
|
// Test_cshift_red_black code.
|
||||||
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
||||||
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
||||||
|
assert(0); // This will fail if hit on GPU
|
||||||
autoView( rhs_v, rhs, CpuWrite);
|
autoView( rhs_v, rhs, CpuWrite);
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
@ -277,12 +334,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
auto table = &Cshift_table[0];
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
autoView(lhs_v , lhs, AcceleratorWrite);
|
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||||
auto table = &Cshift_table[0];
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
accelerator_for(i,ent,1,{
|
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||||
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
autoView(lhs_v , lhs, CpuWrite);
|
||||||
|
thread_for(i,ent,{
|
||||||
lhs_v[table[i].first]=rhs_v[table[i].second];
|
lhs_v[table[i].first]=rhs_v[table[i].second];
|
||||||
});
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -321,12 +386,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
|
auto table = &Cshift_table[0];
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView( rhs_v, rhs, AcceleratorRead);
|
autoView( rhs_v, rhs, AcceleratorRead);
|
||||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||||
auto table = &Cshift_table[0];
|
|
||||||
accelerator_for(i,ent,1,{
|
accelerator_for(i,ent,1,{
|
||||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
|
autoView( lhs_v, lhs, CpuWrite);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -101,7 +101,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
|
|||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#define ACCELERATOR_CSHIFT_NO_COPY
|
||||||
|
#ifdef ACCELERATOR_CSHIFT_NO_COPY
|
||||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
{
|
{
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
@ -121,8 +122,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
|
||||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
commVector<vobj> send_buf(buffer_size);
|
cshiftVector<vobj> send_buf(buffer_size);
|
||||||
commVector<vobj> recv_buf(buffer_size);
|
cshiftVector<vobj> recv_buf(buffer_size);
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
@ -138,7 +139,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
int words = send_buf.size();
|
int words = buffer_size;
|
||||||
if (cbmask != 0x3) words=words>>1;
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
int bytes = words * sizeof(vobj);
|
int bytes = words * sizeof(vobj);
|
||||||
@ -150,12 +151,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf[0],
|
(void *)&recv_buf[0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
|
|
||||||
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
|
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
|
||||||
@ -195,8 +198,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
|
std::vector<cshiftVector<scalar_object> > send_buf_extract(Nsimd);
|
||||||
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
|
std::vector<cshiftVector<scalar_object> > recv_buf_extract(Nsimd);
|
||||||
|
scalar_object * recv_buf_extract_mpi;
|
||||||
|
scalar_object * send_buf_extract_mpi;
|
||||||
|
|
||||||
|
for(int s=0;s<Nsimd;s++){
|
||||||
|
send_buf_extract[s].resize(buffer_size);
|
||||||
|
recv_buf_extract[s].resize(buffer_size);
|
||||||
|
}
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
@ -242,11 +252,204 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
if(nbr_proc){
|
if(nbr_proc){
|
||||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
|
grid->Barrier();
|
||||||
|
|
||||||
|
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||||
|
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||||
|
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf_extract[i][0],
|
(void *)recv_buf_extract_mpi,
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
|
} else {
|
||||||
|
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
|
{
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
GridBase *grid=rhs.Grid();
|
||||||
|
Lattice<vobj> temp(rhs.Grid());
|
||||||
|
|
||||||
|
int fd = rhs.Grid()->_fdimensions[dimension];
|
||||||
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
int pd = rhs.Grid()->_processors[dimension];
|
||||||
|
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
||||||
|
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
||||||
|
assert(simd_layout==1);
|
||||||
|
assert(comm_dim==1);
|
||||||
|
assert(shift>=0);
|
||||||
|
assert(shift<fd);
|
||||||
|
|
||||||
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
|
cshiftVector<vobj> send_buf_v(buffer_size);
|
||||||
|
cshiftVector<vobj> recv_buf_v(buffer_size);
|
||||||
|
vobj *send_buf;
|
||||||
|
vobj *recv_buf;
|
||||||
|
{
|
||||||
|
grid->ShmBufferFreeAll();
|
||||||
|
size_t bytes = buffer_size*sizeof(vobj);
|
||||||
|
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||||
|
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int comm_proc = ((x+sshift)/rd)%pd;
|
||||||
|
|
||||||
|
if (comm_proc==0) {
|
||||||
|
|
||||||
|
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
int words = buffer_size;
|
||||||
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
|
int bytes = words * sizeof(vobj);
|
||||||
|
|
||||||
|
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
||||||
|
|
||||||
|
// int rank = grid->_processor;
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
||||||
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)&recv_buf[0],
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
|
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
|
{
|
||||||
|
GridBase *grid=rhs.Grid();
|
||||||
|
const int Nsimd = grid->Nsimd();
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_object scalar_object;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
int fd = grid->_fdimensions[dimension];
|
||||||
|
int rd = grid->_rdimensions[dimension];
|
||||||
|
int ld = grid->_ldimensions[dimension];
|
||||||
|
int pd = grid->_processors[dimension];
|
||||||
|
int simd_layout = grid->_simd_layout[dimension];
|
||||||
|
int comm_dim = grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
|
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||||
|
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||||
|
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||||
|
|
||||||
|
assert(comm_dim==1);
|
||||||
|
assert(simd_layout==2);
|
||||||
|
assert(shift>=0);
|
||||||
|
assert(shift<fd);
|
||||||
|
|
||||||
|
int permute_type=grid->PermuteType(dimension);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
// Simd direction uses an extract/merge pair
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
|
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
|
std::vector<cshiftVector<scalar_object> > send_buf_extract(Nsimd);
|
||||||
|
std::vector<cshiftVector<scalar_object> > recv_buf_extract(Nsimd);
|
||||||
|
scalar_object * recv_buf_extract_mpi;
|
||||||
|
scalar_object * send_buf_extract_mpi;
|
||||||
|
{
|
||||||
|
size_t bytes = sizeof(scalar_object)*buffer_size;
|
||||||
|
grid->ShmBufferFreeAll();
|
||||||
|
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||||
|
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||||
|
}
|
||||||
|
for(int s=0;s<Nsimd;s++){
|
||||||
|
send_buf_extract[s].resize(buffer_size);
|
||||||
|
recv_buf_extract[s].resize(buffer_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
|
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||||
|
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
||||||
|
|
||||||
|
///////////////////////////////////////////
|
||||||
|
// Work out what to send where
|
||||||
|
///////////////////////////////////////////
|
||||||
|
int cb = (cbmask==0x2)? Odd : Even;
|
||||||
|
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
|
|
||||||
|
// loop over outer coord planes orthog to dim
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
// FIXME call local permute copy if none are offnode.
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
pointers[i] = &send_buf_extract[i][0];
|
||||||
|
}
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
|
int inner_bit = (Nsimd>>(permute_type+1));
|
||||||
|
int ic= (i&inner_bit)? 1:0;
|
||||||
|
|
||||||
|
int my_coor = rd*ic + x;
|
||||||
|
int nbr_coor = my_coor+sshift;
|
||||||
|
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||||
|
|
||||||
|
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
||||||
|
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
||||||
|
int nbr_lane = (i&(~inner_bit));
|
||||||
|
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
|
||||||
|
if (nbr_ic) nbr_lane|=inner_bit;
|
||||||
|
|
||||||
|
assert (sx == nbr_ox);
|
||||||
|
|
||||||
|
if(nbr_proc){
|
||||||
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
grid->Barrier();
|
||||||
|
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
||||||
|
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)recv_buf_extract_mpi,
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
||||||
|
|
||||||
grid->Barrier();
|
grid->Barrier();
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
} else {
|
} else {
|
||||||
@ -258,7 +461,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -36,7 +36,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/lattice/Lattice_local.h>
|
#include <Grid/lattice/Lattice_local.h>
|
||||||
#include <Grid/lattice/Lattice_reduction.h>
|
#include <Grid/lattice/Lattice_reduction.h>
|
||||||
#include <Grid/lattice/Lattice_peekpoke.h>
|
#include <Grid/lattice/Lattice_peekpoke.h>
|
||||||
//#include <Grid/lattice/Lattice_reality.h>
|
#include <Grid/lattice/Lattice_reality.h>
|
||||||
|
#include <Grid/lattice/Lattice_real_imag.h>
|
||||||
#include <Grid/lattice/Lattice_comparison_utils.h>
|
#include <Grid/lattice/Lattice_comparison_utils.h>
|
||||||
#include <Grid/lattice/Lattice_comparison.h>
|
#include <Grid/lattice/Lattice_comparison.h>
|
||||||
#include <Grid/lattice/Lattice_coordinate.h>
|
#include <Grid/lattice/Lattice_coordinate.h>
|
||||||
|
@ -42,9 +42,24 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
// Predicated where support
|
// Predicated where support
|
||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
|
#ifdef GRID_SIMT
|
||||||
|
// drop to scalar in SIMT; cleaner in fact
|
||||||
template <class iobj, class vobj, class robj>
|
template <class iobj, class vobj, class robj>
|
||||||
accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
|
accelerator_inline vobj predicatedWhere(const iobj &predicate,
|
||||||
const robj &iffalse) {
|
const vobj &iftrue,
|
||||||
|
const robj &iffalse)
|
||||||
|
{
|
||||||
|
Integer mask = TensorRemove(predicate);
|
||||||
|
typename std::remove_const<vobj>::type ret= iffalse;
|
||||||
|
if (mask) ret=iftrue;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
template <class iobj, class vobj, class robj>
|
||||||
|
accelerator_inline vobj predicatedWhere(const iobj &predicate,
|
||||||
|
const vobj &iftrue,
|
||||||
|
const robj &iffalse)
|
||||||
|
{
|
||||||
typename std::remove_const<vobj>::type ret;
|
typename std::remove_const<vobj>::type ret;
|
||||||
|
|
||||||
typedef typename vobj::scalar_object scalar_object;
|
typedef typename vobj::scalar_object scalar_object;
|
||||||
@ -68,6 +83,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftru
|
|||||||
merge(ret, falsevals);
|
merge(ret, falsevals);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////
|
||||||
//Specialization of getVectorType for lattices
|
//Specialization of getVectorType for lattices
|
||||||
@ -81,32 +97,62 @@ struct getVectorType<Lattice<T> >{
|
|||||||
//-- recursive evaluation of expressions; --
|
//-- recursive evaluation of expressions; --
|
||||||
// handle leaves of syntax tree
|
// handle leaves of syntax tree
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template<class sobj> accelerator_inline
|
template<class sobj,
|
||||||
|
typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr>
|
||||||
|
accelerator_inline
|
||||||
sobj eval(const uint64_t ss, const sobj &arg)
|
sobj eval(const uint64_t ss, const sobj &arg)
|
||||||
{
|
{
|
||||||
return arg;
|
return arg;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class lobj> accelerator_inline
|
template <class lobj> accelerator_inline
|
||||||
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
|
auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
|
||||||
|
{
|
||||||
|
return arg(ss);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////
|
||||||
|
//-- recursive evaluation of expressions; --
|
||||||
|
// whole vector return, used only for expression return type inference
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
template<class sobj> accelerator_inline
|
||||||
|
sobj vecEval(const uint64_t ss, const sobj &arg)
|
||||||
|
{
|
||||||
|
return arg;
|
||||||
|
}
|
||||||
|
template <class lobj> accelerator_inline
|
||||||
|
const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg)
|
||||||
{
|
{
|
||||||
return arg[ss];
|
return arg[ss];
|
||||||
}
|
}
|
||||||
|
|
||||||
// What needs this?
|
|
||||||
// Cannot be legal on accelerator
|
|
||||||
// Comparison must convert
|
|
||||||
#if 1
|
|
||||||
template <class lobj> accelerator_inline
|
|
||||||
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
|
|
||||||
{
|
|
||||||
auto view = arg.View(AcceleratorRead);
|
|
||||||
return view[ss];
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
// handle nodes in syntax tree- eval one operand
|
// handle nodes in syntax tree- eval one operand
|
||||||
|
// vecEval needed (but never called as all expressions offloaded) to infer the return type
|
||||||
|
// in SIMT contexts of closure.
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
template <typename Op, typename T1> accelerator_inline
|
||||||
|
auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
||||||
|
-> decltype(expr.op.func( vecEval(ss, expr.arg1)))
|
||||||
|
{
|
||||||
|
return expr.op.func( vecEval(ss, expr.arg1) );
|
||||||
|
}
|
||||||
|
// vecEval two operands
|
||||||
|
template <typename Op, typename T1, typename T2> accelerator_inline
|
||||||
|
auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
|
-> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
|
||||||
|
{
|
||||||
|
return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
|
||||||
|
}
|
||||||
|
// vecEval three operands
|
||||||
|
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
|
||||||
|
auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||||
|
-> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
|
||||||
|
{
|
||||||
|
return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// handle nodes in syntax tree- eval one operand coalesced
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <typename Op, typename T1> accelerator_inline
|
template <typename Op, typename T1> accelerator_inline
|
||||||
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
||||||
@ -114,23 +160,41 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
|
|||||||
{
|
{
|
||||||
return expr.op.func( eval(ss, expr.arg1) );
|
return expr.op.func( eval(ss, expr.arg1) );
|
||||||
}
|
}
|
||||||
///////////////////////
|
|
||||||
// eval two operands
|
// eval two operands
|
||||||
///////////////////////
|
|
||||||
template <typename Op, typename T1, typename T2> accelerator_inline
|
template <typename Op, typename T1, typename T2> accelerator_inline
|
||||||
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
|
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
|
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
|
||||||
{
|
{
|
||||||
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
|
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
|
||||||
}
|
}
|
||||||
///////////////////////
|
|
||||||
// eval three operands
|
// eval three operands
|
||||||
///////////////////////
|
|
||||||
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
|
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
|
||||||
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||||
-> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
|
-> decltype(expr.op.func(eval(ss, expr.arg1),
|
||||||
|
eval(ss, expr.arg2),
|
||||||
|
eval(ss, expr.arg3)))
|
||||||
{
|
{
|
||||||
return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
|
#ifdef GRID_SIMT
|
||||||
|
// Handles Nsimd (vInteger) != Nsimd(ComplexD)
|
||||||
|
typedef decltype(vecEval(ss, expr.arg2)) rvobj;
|
||||||
|
typedef typename std::remove_reference<rvobj>::type vobj;
|
||||||
|
|
||||||
|
const int Nsimd = vobj::vector_type::Nsimd();
|
||||||
|
|
||||||
|
auto vpred = vecEval(ss,expr.arg1);
|
||||||
|
|
||||||
|
ExtractBuffer<Integer> mask(Nsimd);
|
||||||
|
extract<vInteger, Integer>(TensorRemove(vpred), mask);
|
||||||
|
|
||||||
|
int s = acceleratorSIMTlane(Nsimd);
|
||||||
|
return expr.op.func(mask[s],
|
||||||
|
eval(ss, expr.arg2),
|
||||||
|
eval(ss, expr.arg3));
|
||||||
|
#else
|
||||||
|
return expr.op.func(eval(ss, expr.arg1),
|
||||||
|
eval(ss, expr.arg2),
|
||||||
|
eval(ss, expr.arg3));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@ -228,7 +292,7 @@ template <typename Op, typename T1, typename T2> inline
|
|||||||
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
|
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
{
|
{
|
||||||
ExpressionViewOpen(expr.arg1); // recurse AST
|
ExpressionViewOpen(expr.arg1); // recurse AST
|
||||||
ExpressionViewOpen(expr.arg2); // recurse AST
|
ExpressionViewOpen(expr.arg2); // rrecurse AST
|
||||||
}
|
}
|
||||||
template <typename Op, typename T1, typename T2, typename T3>
|
template <typename Op, typename T1, typename T2, typename T3>
|
||||||
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||||
@ -272,28 +336,20 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
|||||||
// Unary operators and funcs
|
// Unary operators and funcs
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
#define GridUnopClass(name, ret) \
|
#define GridUnopClass(name, ret) \
|
||||||
template <class arg> \
|
|
||||||
struct name { \
|
struct name { \
|
||||||
static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
|
template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
|
||||||
};
|
};
|
||||||
|
|
||||||
GridUnopClass(UnarySub, -a);
|
GridUnopClass(UnarySub, -a);
|
||||||
GridUnopClass(UnaryNot, Not(a));
|
GridUnopClass(UnaryNot, Not(a));
|
||||||
GridUnopClass(UnaryAdj, adj(a));
|
|
||||||
GridUnopClass(UnaryConj, conjugate(a));
|
|
||||||
GridUnopClass(UnaryTrace, trace(a));
|
GridUnopClass(UnaryTrace, trace(a));
|
||||||
GridUnopClass(UnaryTranspose, transpose(a));
|
GridUnopClass(UnaryTranspose, transpose(a));
|
||||||
GridUnopClass(UnaryTa, Ta(a));
|
GridUnopClass(UnaryTa, Ta(a));
|
||||||
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
|
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
|
||||||
GridUnopClass(UnaryReal, real(a));
|
|
||||||
GridUnopClass(UnaryImag, imag(a));
|
|
||||||
GridUnopClass(UnaryToReal, toReal(a));
|
|
||||||
GridUnopClass(UnaryToComplex, toComplex(a));
|
|
||||||
GridUnopClass(UnaryTimesI, timesI(a));
|
GridUnopClass(UnaryTimesI, timesI(a));
|
||||||
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
|
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
|
||||||
GridUnopClass(UnaryAbs, abs(a));
|
GridUnopClass(UnaryAbs, abs(a));
|
||||||
GridUnopClass(UnarySqrt, sqrt(a));
|
GridUnopClass(UnarySqrt, sqrt(a));
|
||||||
GridUnopClass(UnaryRsqrt, rsqrt(a));
|
|
||||||
GridUnopClass(UnarySin, sin(a));
|
GridUnopClass(UnarySin, sin(a));
|
||||||
GridUnopClass(UnaryCos, cos(a));
|
GridUnopClass(UnaryCos, cos(a));
|
||||||
GridUnopClass(UnaryAsin, asin(a));
|
GridUnopClass(UnaryAsin, asin(a));
|
||||||
@ -305,10 +361,10 @@ GridUnopClass(UnaryExp, exp(a));
|
|||||||
// Binary operators
|
// Binary operators
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
#define GridBinOpClass(name, combination) \
|
#define GridBinOpClass(name, combination) \
|
||||||
template <class left, class right> \
|
|
||||||
struct name { \
|
struct name { \
|
||||||
|
template <class _left, class _right> \
|
||||||
static auto accelerator_inline \
|
static auto accelerator_inline \
|
||||||
func(const left &lhs, const right &rhs) \
|
func(const _left &lhs, const _right &rhs) \
|
||||||
-> decltype(combination) const \
|
-> decltype(combination) const \
|
||||||
{ \
|
{ \
|
||||||
return combination; \
|
return combination; \
|
||||||
@ -328,10 +384,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
|
|||||||
// Trinary conditional op
|
// Trinary conditional op
|
||||||
////////////////////////////////////////////////////
|
////////////////////////////////////////////////////
|
||||||
#define GridTrinOpClass(name, combination) \
|
#define GridTrinOpClass(name, combination) \
|
||||||
template <class predicate, class left, class right> \
|
|
||||||
struct name { \
|
struct name { \
|
||||||
|
template <class _predicate,class _left, class _right> \
|
||||||
static auto accelerator_inline \
|
static auto accelerator_inline \
|
||||||
func(const predicate &pred, const left &lhs, const right &rhs) \
|
func(const _predicate &pred, const _left &lhs, const _right &rhs) \
|
||||||
-> decltype(combination) const \
|
-> decltype(combination) const \
|
||||||
{ \
|
{ \
|
||||||
return combination; \
|
return combination; \
|
||||||
@ -339,17 +395,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
|
|||||||
};
|
};
|
||||||
|
|
||||||
GridTrinOpClass(TrinaryWhere,
|
GridTrinOpClass(TrinaryWhere,
|
||||||
(predicatedWhere<predicate,
|
(predicatedWhere<
|
||||||
typename std::remove_reference<left>::type,
|
typename std::remove_reference<_predicate>::type,
|
||||||
typename std::remove_reference<right>::type>(pred, lhs,rhs)));
|
typename std::remove_reference<_left>::type,
|
||||||
|
typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
// Operator syntactical glue
|
// Operator syntactical glue
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////
|
||||||
|
#define GRID_UNOP(name) name
|
||||||
#define GRID_UNOP(name) name<decltype(eval(0, arg))>
|
#define GRID_BINOP(name) name
|
||||||
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
|
#define GRID_TRINOP(name) name
|
||||||
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
|
|
||||||
|
|
||||||
#define GRID_DEF_UNOP(op, name) \
|
#define GRID_DEF_UNOP(op, name) \
|
||||||
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
|
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
|
||||||
@ -395,22 +451,17 @@ GridTrinOpClass(TrinaryWhere,
|
|||||||
GRID_DEF_UNOP(operator-, UnarySub);
|
GRID_DEF_UNOP(operator-, UnarySub);
|
||||||
GRID_DEF_UNOP(Not, UnaryNot);
|
GRID_DEF_UNOP(Not, UnaryNot);
|
||||||
GRID_DEF_UNOP(operator!, UnaryNot);
|
GRID_DEF_UNOP(operator!, UnaryNot);
|
||||||
GRID_DEF_UNOP(adj, UnaryAdj);
|
//GRID_DEF_UNOP(adj, UnaryAdj);
|
||||||
GRID_DEF_UNOP(conjugate, UnaryConj);
|
//GRID_DEF_UNOP(conjugate, UnaryConj);
|
||||||
GRID_DEF_UNOP(trace, UnaryTrace);
|
GRID_DEF_UNOP(trace, UnaryTrace);
|
||||||
GRID_DEF_UNOP(transpose, UnaryTranspose);
|
GRID_DEF_UNOP(transpose, UnaryTranspose);
|
||||||
GRID_DEF_UNOP(Ta, UnaryTa);
|
GRID_DEF_UNOP(Ta, UnaryTa);
|
||||||
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
|
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
|
||||||
GRID_DEF_UNOP(real, UnaryReal);
|
|
||||||
GRID_DEF_UNOP(imag, UnaryImag);
|
|
||||||
GRID_DEF_UNOP(toReal, UnaryToReal);
|
|
||||||
GRID_DEF_UNOP(toComplex, UnaryToComplex);
|
|
||||||
GRID_DEF_UNOP(timesI, UnaryTimesI);
|
GRID_DEF_UNOP(timesI, UnaryTimesI);
|
||||||
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
|
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
|
||||||
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
|
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
|
||||||
// abs-fabs-dabs-labs thing
|
// abs-fabs-dabs-labs thing
|
||||||
GRID_DEF_UNOP(sqrt, UnarySqrt);
|
GRID_DEF_UNOP(sqrt, UnarySqrt);
|
||||||
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
|
|
||||||
GRID_DEF_UNOP(sin, UnarySin);
|
GRID_DEF_UNOP(sin, UnarySin);
|
||||||
GRID_DEF_UNOP(cos, UnaryCos);
|
GRID_DEF_UNOP(cos, UnaryCos);
|
||||||
GRID_DEF_UNOP(asin, UnaryAsin);
|
GRID_DEF_UNOP(asin, UnaryAsin);
|
||||||
@ -435,29 +486,36 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
|
|||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
template <class Op, class T1>
|
template <class Op, class T1>
|
||||||
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
|
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
|
||||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
|
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
|
||||||
{
|
{
|
||||||
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
|
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
template <class Op, class T1, class T2>
|
template <class Op, class T1, class T2>
|
||||||
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
|
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
|
||||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
|
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
|
||||||
{
|
{
|
||||||
Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
|
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
template <class Op, class T1, class T2, class T3>
|
template <class Op, class T1, class T2, class T3>
|
||||||
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
|
||||||
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
|
||||||
eval(0, expr.arg2),
|
vecEval(0, expr.arg2),
|
||||||
eval(0, expr.arg3)))>
|
vecEval(0, expr.arg3)))>::type >
|
||||||
{
|
{
|
||||||
Lattice<decltype(expr.op.func(eval(0, expr.arg1),
|
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
|
||||||
eval(0, expr.arg2),
|
vecEval(0, expr.arg2),
|
||||||
eval(0, expr.arg3)))> ret(expr);
|
vecEval(0, expr.arg3)))>::type > ret(expr);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
#define EXPRESSION_CLOSURE(function) \
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
|
||||||
|
auto function(Expression &expr) -> decltype(function(closure(expr))) \
|
||||||
|
{ \
|
||||||
|
return function(closure(expr)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#undef GRID_UNOP
|
#undef GRID_UNOP
|
||||||
#undef GRID_BINOP
|
#undef GRID_BINOP
|
||||||
|
@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
autoView( rhs_v , rhs, AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
|
auto tmp =ret_v(ss);
|
||||||
mac(&tmp,&lhs_t,&rhs_t);
|
mac(&tmp,&lhs_t,&rhs_t);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
});
|
});
|
||||||
@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
|||||||
autoView( ret_v , ret, AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
autoView( lhs_v , lhs, AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
auto tmp =ret_v(ss);
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
mac(&tmp,&lhs_t,&rhs);
|
mac(&tmp,&lhs_t,&rhs);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
|||||||
autoView( ret_v , ret, AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
autoView( rhs_v , lhs, AcceleratorRead);
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
auto tmp =ret_v(ss);
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
mac(&tmp,&lhs,&rhs_t);
|
mac(&tmp,&lhs,&rhs_t);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
|
@ -123,9 +123,9 @@ public:
|
|||||||
auto exprCopy = expr;
|
auto exprCopy = expr;
|
||||||
ExpressionViewOpen(exprCopy);
|
ExpressionViewOpen(exprCopy);
|
||||||
auto me = View(AcceleratorWriteDiscard);
|
auto me = View(AcceleratorWriteDiscard);
|
||||||
accelerator_for(ss,me.size(),1,{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
auto tmp = eval(ss,exprCopy);
|
auto tmp = eval(ss,exprCopy);
|
||||||
vstream(me[ss],tmp);
|
coalescedWrite(me[ss],tmp);
|
||||||
});
|
});
|
||||||
me.ViewClose();
|
me.ViewClose();
|
||||||
ExpressionViewClose(exprCopy);
|
ExpressionViewClose(exprCopy);
|
||||||
@ -146,9 +146,9 @@ public:
|
|||||||
auto exprCopy = expr;
|
auto exprCopy = expr;
|
||||||
ExpressionViewOpen(exprCopy);
|
ExpressionViewOpen(exprCopy);
|
||||||
auto me = View(AcceleratorWriteDiscard);
|
auto me = View(AcceleratorWriteDiscard);
|
||||||
accelerator_for(ss,me.size(),1,{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
auto tmp = eval(ss,exprCopy);
|
auto tmp = eval(ss,exprCopy);
|
||||||
vstream(me[ss],tmp);
|
coalescedWrite(me[ss],tmp);
|
||||||
});
|
});
|
||||||
me.ViewClose();
|
me.ViewClose();
|
||||||
ExpressionViewClose(exprCopy);
|
ExpressionViewClose(exprCopy);
|
||||||
@ -168,9 +168,9 @@ public:
|
|||||||
auto exprCopy = expr;
|
auto exprCopy = expr;
|
||||||
ExpressionViewOpen(exprCopy);
|
ExpressionViewOpen(exprCopy);
|
||||||
auto me = View(AcceleratorWriteDiscard);
|
auto me = View(AcceleratorWriteDiscard);
|
||||||
accelerator_for(ss,me.size(),1,{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
auto tmp = eval(ss,exprCopy);
|
auto tmp = eval(ss,exprCopy);
|
||||||
vstream(me[ss],tmp);
|
coalescedWrite(me[ss],tmp);
|
||||||
});
|
});
|
||||||
me.ViewClose();
|
me.ViewClose();
|
||||||
ExpressionViewClose(exprCopy);
|
ExpressionViewClose(exprCopy);
|
||||||
|
@ -54,13 +54,34 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||||
|
|
||||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||||
|
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
||||||
|
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++){
|
for(int k=0;k<basis.size();k++){
|
||||||
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if ( (!defined(GRID_CUDA)) )
|
||||||
|
int max_threads = thread_max();
|
||||||
|
Vector < vobj > Bt(Nm * max_threads);
|
||||||
|
thread_region
|
||||||
|
{
|
||||||
|
vobj* B = &Bt[Nm * thread_num()];
|
||||||
|
thread_for_in_region(ss, grid->oSites(),{
|
||||||
|
for(int j=j0; j<j1; ++j) B[j]=0.;
|
||||||
|
|
||||||
|
for(int j=j0; j<j1; ++j){
|
||||||
|
for(int k=k0; k<k1; ++k){
|
||||||
|
B[j] +=Qt(j,k) * basis_v[k][ss];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int j=j0; j<j1; ++j){
|
||||||
|
basis_v[j][ss] = B[j];
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
#else
|
||||||
View *basis_vp = &basis_v[0];
|
View *basis_vp = &basis_v[0];
|
||||||
|
|
||||||
int nrot = j1-j0;
|
int nrot = j1-j0;
|
||||||
@ -70,14 +91,12 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
uint64_t oSites =grid->oSites();
|
uint64_t oSites =grid->oSites();
|
||||||
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
||||||
|
|
||||||
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
|
||||||
|
|
||||||
Vector <vobj> Bt(siteBlock * nrot);
|
Vector <vobj> Bt(siteBlock * nrot);
|
||||||
auto Bp=&Bt[0];
|
auto Bp=&Bt[0];
|
||||||
|
|
||||||
// GPU readable copy of matrix
|
// GPU readable copy of matrix
|
||||||
Vector<double> Qt_jv(Nm*Nm);
|
Vector<Coeff_t> Qt_jv(Nm*Nm);
|
||||||
double *Qt_p = & Qt_jv[0];
|
Coeff_t *Qt_p = & Qt_jv[0];
|
||||||
thread_for(i,Nm*Nm,{
|
thread_for(i,Nm*Nm,{
|
||||||
int j = i/Nm;
|
int j = i/Nm;
|
||||||
int k = i%Nm;
|
int k = i%Nm;
|
||||||
@ -118,6 +137,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||||
}
|
}
|
||||||
@ -141,11 +161,13 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
|||||||
double * Qt_j = & Qt_jv[0];
|
double * Qt_j = & Qt_jv[0];
|
||||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
||||||
|
|
||||||
|
auto basis_vp=& basis_v[0];
|
||||||
autoView(result_v,result,AcceleratorWrite);
|
autoView(result_v,result,AcceleratorWrite);
|
||||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||||
auto B=coalescedRead(zz);
|
vobj zzz=Zero();
|
||||||
|
auto B=coalescedRead(zzz);
|
||||||
for(int k=k0; k<k1; ++k){
|
for(int k=k0; k<k1; ++k){
|
||||||
B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
|
B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
|
||||||
}
|
}
|
||||||
coalescedWrite(result_v[ss], B);
|
coalescedWrite(result_v[ss], B);
|
||||||
});
|
});
|
||||||
|
@ -42,34 +42,6 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
typedef iScalar<vInteger> vPredicate ;
|
typedef iScalar<vInteger> vPredicate ;
|
||||||
|
|
||||||
/*
|
|
||||||
template <class iobj, class vobj, class robj> accelerator_inline
|
|
||||||
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
|
|
||||||
{
|
|
||||||
typename std::remove_const<vobj>::type ret;
|
|
||||||
|
|
||||||
typedef typename vobj::scalar_object scalar_object;
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
|
||||||
typedef typename vobj::vector_type vector_type;
|
|
||||||
|
|
||||||
const int Nsimd = vobj::vector_type::Nsimd();
|
|
||||||
|
|
||||||
ExtractBuffer<Integer> mask(Nsimd);
|
|
||||||
ExtractBuffer<scalar_object> truevals(Nsimd);
|
|
||||||
ExtractBuffer<scalar_object> falsevals(Nsimd);
|
|
||||||
|
|
||||||
extract(iftrue, truevals);
|
|
||||||
extract(iffalse, falsevals);
|
|
||||||
extract<vInteger, Integer>(TensorRemove(predicate), mask);
|
|
||||||
|
|
||||||
for (int s = 0; s < Nsimd; s++) {
|
|
||||||
if (mask[s]) falsevals[s] = truevals[s];
|
|
||||||
}
|
|
||||||
|
|
||||||
merge(ret, falsevals);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// compare lattice to lattice
|
// compare lattice to lattice
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -182,6 +182,14 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
|||||||
|
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
template<class vobj,class sobj>
|
||||||
|
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
|
||||||
|
{
|
||||||
|
autoView(lv,l,CpuRead);
|
||||||
|
peekLocalSite(s,lv,site);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
// Must be CPU write view
|
// Must be CPU write view
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
||||||
@ -210,6 +218,14 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
|||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<class vobj,class sobj>
|
||||||
|
inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
|
||||||
|
{
|
||||||
|
autoView(lv,l,CpuWrite);
|
||||||
|
pokeLocalSite(s,lv,site);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
79
Grid/lattice/Lattice_real_imag.h
Normal file
79
Grid/lattice/Lattice_real_imag.h
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/lattice/Lattice_reality.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: neo <cossu@post.kek.jp>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef GRID_LATTICE_REAL_IMAG_H
|
||||||
|
#define GRID_LATTICE_REAL_IMAG_H
|
||||||
|
|
||||||
|
|
||||||
|
// FIXME .. this is the sector of the code
|
||||||
|
// I am most worried about the directions
|
||||||
|
// The choice of burying complex in the SIMD
|
||||||
|
// is making the use of "real" and "imag" very cumbersome
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
|
||||||
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
|
|
||||||
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
|
ret_v[ss] =real(lhs_v[ss]);
|
||||||
|
});
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
|
||||||
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
|
|
||||||
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
|
ret_v[ss] =imag(lhs_v[ss]);
|
||||||
|
});
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto real(const Expression &expr) -> decltype(real(closure(expr)))
|
||||||
|
{
|
||||||
|
return real(closure(expr));
|
||||||
|
}
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto imag(const Expression &expr) -> decltype(imag(closure(expr)))
|
||||||
|
{
|
||||||
|
return imag(closure(expr));
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
#endif
|
@ -45,8 +45,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
|||||||
autoView( ret_v, ret, AcceleratorWrite);
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
|
ret_v[ss] = adj(lhs_v[ss]);
|
||||||
});
|
});
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
@ -64,6 +64,53 @@ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
|||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
|
||||||
|
Lattice<typename vobj::Complexified> ret(lhs.Grid());
|
||||||
|
|
||||||
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
|
ret_v[ss] = toComplex(lhs_v[ss]);
|
||||||
|
});
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
|
||||||
|
Lattice<typename vobj::Realified> ret(lhs.Grid());
|
||||||
|
|
||||||
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
|
|
||||||
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
|
accelerator_for( ss, lhs_v.size(), 1, {
|
||||||
|
ret_v[ss] = toReal(lhs_v[ss]);
|
||||||
|
});
|
||||||
|
return ret;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto toComplex(const Expression &expr) -> decltype(closure(expr))
|
||||||
|
{
|
||||||
|
return toComplex(closure(expr));
|
||||||
|
}
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto toReal(const Expression &expr) -> decltype(closure(expr))
|
||||||
|
{
|
||||||
|
return toReal(closure(expr));
|
||||||
|
}
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto adj(const Expression &expr) -> decltype(closure(expr))
|
||||||
|
{
|
||||||
|
return adj(closure(expr));
|
||||||
|
}
|
||||||
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
|
auto conjugate(const Expression &expr) -> decltype(closure(expr))
|
||||||
|
{
|
||||||
|
return conjugate(closure(expr));
|
||||||
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -62,7 +62,6 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
|||||||
for(int i=0;i<nthread;i++){
|
for(int i=0;i<nthread;i++){
|
||||||
ssum = ssum+sumarray[i];
|
ssum = ssum+sumarray[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
return ssum;
|
return ssum;
|
||||||
}
|
}
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
@ -93,7 +92,9 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
|||||||
ssum = ssum+sumarray[i];
|
ssum = ssum+sumarray[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
return ssum;
|
typedef typename vobj::scalar_object ssobj;
|
||||||
|
ssobj ret = ssum;
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -154,7 +155,7 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
// Might make all code paths go this way.
|
// Might make all code paths go this way.
|
||||||
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
@ -163,16 +164,16 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
autoView( right_v,right, AcceleratorRead);
|
autoView( right_v,right, AcceleratorRead);
|
||||||
|
|
||||||
// GPU - SIMT lane compliance...
|
// GPU - SIMT lane compliance...
|
||||||
accelerator_for( ss, sites, nsimd,{
|
accelerator_for( ss, sites, 1,{
|
||||||
auto x_l = left_v(ss);
|
auto x_l = left_v[ss];
|
||||||
auto y_l = right_v(ss);
|
auto y_l = right_v[ss];
|
||||||
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
|
inner_tmp_v[ss]=innerProductD(x_l,y_l);
|
||||||
})
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is in single precision and fails some tests
|
// This is in single precision and fails some tests
|
||||||
// Need a sumD that sums in double
|
auto anrm = sum(inner_tmp_v,sites);
|
||||||
nrm = TensorRemove(sumD(inner_tmp_v,sites));
|
nrm = anrm;
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -218,16 +219,16 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
autoView( y_v, y, AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
autoView( z_v, z, AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
|
|
||||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
accelerator_for( ss, sites, 1,{
|
||||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
auto tmp = a*x_v[ss]+b*y_v[ss];
|
||||||
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
inner_tmp_v[ss]=innerProductD(tmp,tmp);
|
||||||
coalescedWrite(z_v[ss],tmp);
|
z_v[ss]=tmp;
|
||||||
});
|
});
|
||||||
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||||
grid->GlobalSum(nrm);
|
grid->GlobalSum(nrm);
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
@ -243,29 +244,28 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
|||||||
|
|
||||||
GridBase *grid = left.Grid();
|
GridBase *grid = left.Grid();
|
||||||
|
|
||||||
|
|
||||||
const uint64_t nsimd = grid->Nsimd();
|
const uint64_t nsimd = grid->Nsimd();
|
||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
// GPU
|
// GPU
|
||||||
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
|
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||||
typedef decltype(innerProduct(vobj(),vobj())) norm_t;
|
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
Vector<norm_t> norm_tmp(sites);
|
Vector<norm_t> norm_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
auto norm_tmp_v = &norm_tmp[0];
|
auto norm_tmp_v = &norm_tmp[0];
|
||||||
{
|
{
|
||||||
autoView(left_v,left, AcceleratorRead);
|
autoView(left_v,left, AcceleratorRead);
|
||||||
autoView(right_v,right,AcceleratorRead);
|
autoView(right_v,right,AcceleratorRead);
|
||||||
accelerator_for( ss, sites, nsimd,{
|
accelerator_for( ss, sites, 1,{
|
||||||
auto left_tmp = left_v(ss);
|
auto left_tmp = left_v[ss];
|
||||||
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
|
inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
|
||||||
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
|
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp[0] = TensorRemove(sumD(inner_tmp_v,sites));
|
tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
|
||||||
tmp[1] = TensorRemove(sumD(norm_tmp_v,sites));
|
tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
|
||||||
|
|
||||||
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
|
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
|
||||||
ip = tmp[0];
|
ip = tmp[0];
|
||||||
|
@ -2,12 +2,13 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
#ifdef GRID_HIP
|
#ifdef GRID_HIP
|
||||||
extern hipDeviceProp_t *gpu_props;
|
extern hipDeviceProp_t *gpu_props;
|
||||||
|
#define WARP_SIZE 64
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
extern cudaDeviceProp *gpu_props;
|
extern cudaDeviceProp *gpu_props;
|
||||||
|
#define WARP_SIZE 32
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define WARP_SIZE 32
|
|
||||||
__device__ unsigned int retirementCount = 0;
|
__device__ unsigned int retirementCount = 0;
|
||||||
|
|
||||||
template <class Iterator>
|
template <class Iterator>
|
||||||
@ -64,7 +65,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
|
|
||||||
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
// cannot use overloaded operators for sobj as they are not volatile-qualified
|
||||||
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
|
||||||
__syncwarp();
|
acceleratorSynchronise();
|
||||||
|
|
||||||
const Iterator VEC = WARP_SIZE;
|
const Iterator VEC = WARP_SIZE;
|
||||||
const Iterator vid = tid & (VEC-1);
|
const Iterator vid = tid & (VEC-1);
|
||||||
@ -78,9 +79,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
beta += temp;
|
beta += temp;
|
||||||
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
|
||||||
}
|
}
|
||||||
__syncwarp();
|
acceleratorSynchronise();
|
||||||
}
|
}
|
||||||
__syncthreads();
|
acceleratorSynchroniseAll();
|
||||||
|
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
beta = Zero();
|
beta = Zero();
|
||||||
@ -90,7 +91,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
|
|||||||
}
|
}
|
||||||
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
|
||||||
}
|
}
|
||||||
__syncthreads();
|
acceleratorSynchroniseAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -127,6 +127,11 @@ accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
|
|||||||
convertType(out,in._internal);
|
convertType(out,in._internal);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename T1, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
|
||||||
|
accelerator_inline void convertType(T1 & out, const iScalar<T1> & in) {
|
||||||
|
convertType(out,in._internal);
|
||||||
|
}
|
||||||
|
|
||||||
template<typename T1,typename T2>
|
template<typename T1,typename T2>
|
||||||
accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
|
accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
|
||||||
convertType(out._internal,in);
|
convertType(out._internal,in);
|
||||||
@ -240,6 +245,8 @@ template<class vobj,class vobj2,class CComplex>
|
|||||||
autoView( fineX_ , fineX, AcceleratorRead);
|
autoView( fineX_ , fineX, AcceleratorRead);
|
||||||
autoView( fineY_ , fineY, AcceleratorRead);
|
autoView( fineY_ , fineY, AcceleratorRead);
|
||||||
autoView( coarseA_, coarseA, AcceleratorRead);
|
autoView( coarseA_, coarseA, AcceleratorRead);
|
||||||
|
Coordinate fine_rdimensions = fine->_rdimensions;
|
||||||
|
Coordinate coarse_rdimensions = coarse->_rdimensions;
|
||||||
|
|
||||||
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
||||||
|
|
||||||
@ -247,9 +254,9 @@ template<class vobj,class vobj2,class CComplex>
|
|||||||
Coordinate coor_c(_ndimension);
|
Coordinate coor_c(_ndimension);
|
||||||
Coordinate coor_f(_ndimension);
|
Coordinate coor_f(_ndimension);
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
|
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions);
|
||||||
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
|
||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions);
|
||||||
|
|
||||||
// z = A x + y
|
// z = A x + y
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
@ -353,11 +360,14 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
||||||
autoView( fineData_ , fineData, AcceleratorRead);
|
autoView( fineData_ , fineData, AcceleratorRead);
|
||||||
|
|
||||||
|
Coordinate fine_rdimensions = fine->_rdimensions;
|
||||||
|
Coordinate coarse_rdimensions = coarse->_rdimensions;
|
||||||
|
|
||||||
accelerator_for(sc,coarse->oSites(),1,{
|
accelerator_for(sc,coarse->oSites(),1,{
|
||||||
|
|
||||||
// One thread per sub block
|
// One thread per sub block
|
||||||
Coordinate coor_c(_ndimension);
|
Coordinate coor_c(_ndimension);
|
||||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
|
||||||
coarseData_[sc]=Zero();
|
coarseData_[sc]=Zero();
|
||||||
|
|
||||||
for(int sb=0;sb<blockVol;sb++){
|
for(int sb=0;sb<blockVol;sb++){
|
||||||
@ -367,7 +377,7 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
Coordinate coor_f(_ndimension);
|
Coordinate coor_f(_ndimension);
|
||||||
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
|
Lexicographic::CoorFromIndex(coor_b,sb,block_r); // Block sub coordinate
|
||||||
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
|
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
|
||||||
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
|
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
|
||||||
|
|
||||||
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
|
coarseData_[sc]=coarseData_[sc]+fineData_[sf];
|
||||||
}
|
}
|
||||||
|
@ -130,6 +130,8 @@ public:
|
|||||||
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
|
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
|
||||||
|
|
||||||
if ( log.active ) {
|
if ( log.active ) {
|
||||||
|
std::ios_base::fmtflags f(stream.flags());
|
||||||
|
|
||||||
stream << log.background()<< std::left;
|
stream << log.background()<< std::left;
|
||||||
if (log.topWidth > 0)
|
if (log.topWidth > 0)
|
||||||
{
|
{
|
||||||
@ -152,6 +154,8 @@ public:
|
|||||||
<< now << log.background() << " : " ;
|
<< now << log.background() << " : " ;
|
||||||
}
|
}
|
||||||
stream << log.colour();
|
stream << log.colour();
|
||||||
|
stream.flags(f);
|
||||||
|
|
||||||
return stream;
|
return stream;
|
||||||
} else {
|
} else {
|
||||||
return devnull;
|
return devnull;
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
int Grid::BinaryIO::latticeWriteMaxRetry = -1;
|
int Grid::BinaryIO::latticeWriteMaxRetry = -1;
|
||||||
|
Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;
|
||||||
|
@ -79,6 +79,13 @@ inline void removeWhitespace(std::string &key)
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
class BinaryIO {
|
class BinaryIO {
|
||||||
public:
|
public:
|
||||||
|
struct IoPerf
|
||||||
|
{
|
||||||
|
uint64_t size{0},time{0};
|
||||||
|
double mbytesPerSecond{0.};
|
||||||
|
};
|
||||||
|
|
||||||
|
static IoPerf lastPerf;
|
||||||
static int latticeWriteMaxRetry;
|
static int latticeWriteMaxRetry;
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////
|
||||||
@ -502,12 +509,15 @@ class BinaryIO {
|
|||||||
timer.Stop();
|
timer.Stop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lastPerf.size = sizeof(fobj)*iodata.size()*nrank;
|
||||||
|
lastPerf.time = timer.useconds();
|
||||||
|
lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
|
||||||
std::cout<<GridLogMessage<<"IOobject: ";
|
std::cout<<GridLogMessage<<"IOobject: ";
|
||||||
if ( control & BINARYIO_READ) std::cout << " read ";
|
if ( control & BINARYIO_READ) std::cout << " read ";
|
||||||
else std::cout << " write ";
|
else std::cout << " write ";
|
||||||
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
|
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
|
||||||
std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
|
std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
|
||||||
<< (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
|
<< lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
|
||||||
|
|
||||||
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
|
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
|
||||||
|
|
||||||
@ -663,10 +673,15 @@ class BinaryIO {
|
|||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
|
||||||
timer.Start();
|
timer.Start();
|
||||||
thread_for(lidx,lsites,{
|
thread_for(lidx,lsites,{ // FIX ME, suboptimal implementation
|
||||||
std::vector<RngStateType> tmp(RngStateCount);
|
std::vector<RngStateType> tmp(RngStateCount);
|
||||||
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
|
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
|
||||||
parallel_rng.SetState(tmp,lidx);
|
Coordinate lcoor;
|
||||||
|
grid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||||
|
int o_idx=grid->oIndex(lcoor);
|
||||||
|
int i_idx=grid->iIndex(lcoor);
|
||||||
|
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
|
||||||
|
parallel_rng.SetState(tmp,gidx);
|
||||||
});
|
});
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
|
|
||||||
@ -723,7 +738,12 @@ class BinaryIO {
|
|||||||
std::vector<RNGstate> iodata(lsites);
|
std::vector<RNGstate> iodata(lsites);
|
||||||
thread_for(lidx,lsites,{
|
thread_for(lidx,lsites,{
|
||||||
std::vector<RngStateType> tmp(RngStateCount);
|
std::vector<RngStateType> tmp(RngStateCount);
|
||||||
parallel_rng.GetState(tmp,lidx);
|
Coordinate lcoor;
|
||||||
|
grid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||||
|
int o_idx=grid->oIndex(lcoor);
|
||||||
|
int i_idx=grid->iIndex(lcoor);
|
||||||
|
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
|
||||||
|
parallel_rng.GetState(tmp,gidx);
|
||||||
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
|
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
|
||||||
});
|
});
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
|
@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
|
|||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// Helper to fill out metadata
|
// Helper to fill out metadata
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
|
template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
|
||||||
FieldMetaData &header,
|
FieldMetaData &header,
|
||||||
scidacRecord & _scidacRecord,
|
scidacRecord & _scidacRecord,
|
||||||
scidacFile & _scidacFile)
|
scidacFile & _scidacFile)
|
||||||
@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter {
|
|||||||
// Don't require scidac records EXCEPT checksum
|
// Don't require scidac records EXCEPT checksum
|
||||||
// Use Grid MetaData object if present.
|
// Use Grid MetaData object if present.
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
template <class vsimd>
|
template <class stats = PeriodicGaugeStatistics>
|
||||||
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
|
void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description)
|
||||||
{
|
{
|
||||||
GridBase * grid = Umu.Grid();
|
GridBase * grid = Umu.Grid();
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
typedef Lattice<vLorentzColourMatrixD> GaugeField;
|
||||||
typedef iLorentzColourMatrix<vsimd> vobj;
|
typedef vLorentzColourMatrixD vobj;
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
@ -636,6 +636,9 @@ class IldgWriter : public ScidacWriter {
|
|||||||
|
|
||||||
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
|
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
|
||||||
|
|
||||||
|
stats Stats;
|
||||||
|
Stats(Umu,header);
|
||||||
|
|
||||||
std::string format = header.floating_point;
|
std::string format = header.floating_point;
|
||||||
header.ensemble_id = description;
|
header.ensemble_id = description;
|
||||||
header.ensemble_label = description;
|
header.ensemble_label = description;
|
||||||
@ -705,10 +708,10 @@ class IldgReader : public GridLimeReader {
|
|||||||
// Else use ILDG MetaData object if present.
|
// Else use ILDG MetaData object if present.
|
||||||
// Else use SciDAC MetaData object if present.
|
// Else use SciDAC MetaData object if present.
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
template <class vsimd>
|
template <class stats = PeriodicGaugeStatistics>
|
||||||
void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
|
void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) {
|
||||||
|
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
typedef Lattice<vLorentzColourMatrixD > GaugeField;
|
||||||
typedef typename GaugeField::vector_object vobj;
|
typedef typename GaugeField::vector_object vobj;
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
@ -921,7 +924,8 @@ class IldgReader : public GridLimeReader {
|
|||||||
|
|
||||||
if ( found_FieldMetaData || found_usqcdInfo ) {
|
if ( found_FieldMetaData || found_usqcdInfo ) {
|
||||||
FieldMetaData checker;
|
FieldMetaData checker;
|
||||||
GaugeStatistics(Umu,checker);
|
stats Stats;
|
||||||
|
Stats(Umu,checker);
|
||||||
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
|
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
|
||||||
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
|
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
|
||||||
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
|
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
|
||||||
|
@ -176,29 +176,18 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
|
|||||||
GridMetaData(grid,header);
|
GridMetaData(grid,header);
|
||||||
MachineCharacteristics(header);
|
MachineCharacteristics(header);
|
||||||
}
|
}
|
||||||
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
|
template<class Impl>
|
||||||
|
class GaugeStatistics
|
||||||
{
|
{
|
||||||
// How to convert data precision etc...
|
public:
|
||||||
header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
|
void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
|
||||||
header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
|
{
|
||||||
}
|
header.link_trace=WilsonLoops<Impl>::linkTrace(data);
|
||||||
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
|
header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
|
||||||
{
|
}
|
||||||
// How to convert data precision etc...
|
};
|
||||||
header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
|
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
|
||||||
header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
|
typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics;
|
||||||
}
|
|
||||||
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
|
|
||||||
{
|
|
||||||
|
|
||||||
GridBase *grid = field.Grid();
|
|
||||||
std::string format = getFormatString<vLorentzColourMatrixF>();
|
|
||||||
header.floating_point = format;
|
|
||||||
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
|
|
||||||
GridMetaData(grid,header);
|
|
||||||
GaugeStatistics(field,header);
|
|
||||||
MachineCharacteristics(header);
|
|
||||||
}
|
|
||||||
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
|
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
|
||||||
{
|
{
|
||||||
GridBase *grid = field.Grid();
|
GridBase *grid = field.Grid();
|
||||||
@ -206,7 +195,6 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
|
|||||||
header.floating_point = format;
|
header.floating_point = format;
|
||||||
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
|
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
|
||||||
GridMetaData(grid,header);
|
GridMetaData(grid,header);
|
||||||
GaugeStatistics(field,header);
|
|
||||||
MachineCharacteristics(header);
|
MachineCharacteristics(header);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -40,6 +40,8 @@ using namespace Grid;
|
|||||||
class NerscIO : public BinaryIO {
|
class NerscIO : public BinaryIO {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
typedef Lattice<vLorentzColourMatrixD> GaugeField;
|
||||||
|
|
||||||
static inline void truncate(std::string file){
|
static inline void truncate(std::string file){
|
||||||
std::ofstream fout(file,std::ios::out);
|
std::ofstream fout(file,std::ios::out);
|
||||||
}
|
}
|
||||||
@ -129,12 +131,12 @@ public:
|
|||||||
// Now the meat: the object readers
|
// Now the meat: the object readers
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<class vsimd>
|
template<class GaugeStats=PeriodicGaugeStatistics>
|
||||||
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
|
static inline void readConfiguration(GaugeField &Umu,
|
||||||
FieldMetaData& header,
|
FieldMetaData& header,
|
||||||
std::string file)
|
std::string file,
|
||||||
|
GaugeStats GaugeStatisticsCalculator=GaugeStats())
|
||||||
{
|
{
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
|
||||||
|
|
||||||
GridBase *grid = Umu.Grid();
|
GridBase *grid = Umu.Grid();
|
||||||
uint64_t offset = readHeader(file,Umu.Grid(),header);
|
uint64_t offset = readHeader(file,Umu.Grid(),header);
|
||||||
@ -153,23 +155,23 @@ public:
|
|||||||
// munger is a function of <floating point, Real, data_type>
|
// munger is a function of <floating point, Real, data_type>
|
||||||
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
|
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
|
||||||
if ( ieee32 || ieee32big ) {
|
if ( ieee32 || ieee32big ) {
|
||||||
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
|
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
|
||||||
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
|
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
}
|
}
|
||||||
if ( ieee64 || ieee64big ) {
|
if ( ieee64 || ieee64big ) {
|
||||||
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
|
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D>
|
||||||
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
|
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
}
|
}
|
||||||
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
|
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
|
||||||
if ( ieee32 || ieee32big ) {
|
if ( ieee32 || ieee32big ) {
|
||||||
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
|
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
|
||||||
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
|
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
}
|
}
|
||||||
if ( ieee64 || ieee64big ) {
|
if ( ieee64 || ieee64big ) {
|
||||||
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
|
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
|
||||||
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
|
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
}
|
}
|
||||||
@ -177,7 +179,7 @@ public:
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
GaugeStatistics(Umu,clone);
|
GaugeStats Stats; Stats(Umu,clone);
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
|
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
|
||||||
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
|
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
|
||||||
@ -203,15 +205,13 @@ public:
|
|||||||
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
|
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vsimd>
|
template<class GaugeStats=PeriodicGaugeStatistics>
|
||||||
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
|
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
|
||||||
std::string file,
|
std::string file,
|
||||||
int two_row,
|
int two_row,
|
||||||
int bits32)
|
int bits32)
|
||||||
{
|
{
|
||||||
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
|
typedef vLorentzColourMatrixD vobj;
|
||||||
|
|
||||||
typedef iLorentzColourMatrix<vsimd> vobj;
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
|
||||||
FieldMetaData header;
|
FieldMetaData header;
|
||||||
@ -229,7 +229,7 @@ public:
|
|||||||
|
|
||||||
GridMetaData(grid,header);
|
GridMetaData(grid,header);
|
||||||
assert(header.nd==4);
|
assert(header.nd==4);
|
||||||
GaugeStatistics(Umu,header);
|
GaugeStats Stats; Stats(Umu,header);
|
||||||
MachineCharacteristics(header);
|
MachineCharacteristics(header);
|
||||||
|
|
||||||
uint64_t offset;
|
uint64_t offset;
|
||||||
@ -238,19 +238,19 @@ public:
|
|||||||
header.floating_point = std::string("IEEE64BIG");
|
header.floating_point = std::string("IEEE64BIG");
|
||||||
header.data_type = std::string("4D_SU3_GAUGE_3x3");
|
header.data_type = std::string("4D_SU3_GAUGE_3x3");
|
||||||
GaugeSimpleUnmunger<fobj3D,sobj> munge;
|
GaugeSimpleUnmunger<fobj3D,sobj> munge;
|
||||||
if ( grid->IsBoss() ) {
|
if ( grid->IsBoss() ) {
|
||||||
truncate(file);
|
truncate(file);
|
||||||
offset = writeHeader(header,file);
|
offset = writeHeader(header,file);
|
||||||
}
|
}
|
||||||
grid->Broadcast(0,(void *)&offset,sizeof(offset));
|
grid->Broadcast(0,(void *)&offset,sizeof(offset));
|
||||||
|
|
||||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
|
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
header.checksum = nersc_csum;
|
header.checksum = nersc_csum;
|
||||||
if ( grid->IsBoss() ) {
|
if ( grid->IsBoss() ) {
|
||||||
writeHeader(header,file);
|
writeHeader(header,file);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
|
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
|
||||||
<<std::hex<<header.checksum
|
<<std::hex<<header.checksum
|
||||||
|
@ -154,7 +154,7 @@ public:
|
|||||||
grid->Barrier(); timer.Stop();
|
grid->Barrier(); timer.Stop();
|
||||||
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
|
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
|
||||||
|
|
||||||
GaugeStatistics(Umu, clone);
|
PeriodicGaugeStatistics Stats; Stats(Umu, clone);
|
||||||
|
|
||||||
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
||||||
|
|
||||||
|
@ -208,7 +208,7 @@ public:
|
|||||||
|
|
||||||
FieldMetaData clone(header);
|
FieldMetaData clone(header);
|
||||||
|
|
||||||
GaugeStatistics(Umu, clone);
|
PeriodicGaugeStatistics Stats; Stats(Umu, clone);
|
||||||
|
|
||||||
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ static constexpr int Ym = 5;
|
|||||||
static constexpr int Zm = 6;
|
static constexpr int Zm = 6;
|
||||||
static constexpr int Tm = 7;
|
static constexpr int Tm = 7;
|
||||||
|
|
||||||
static constexpr int Nc=3;
|
static constexpr int Nc=Config_Nc;
|
||||||
static constexpr int Ns=4;
|
static constexpr int Ns=4;
|
||||||
static constexpr int Nd=4;
|
static constexpr int Nd=4;
|
||||||
static constexpr int Nhs=2; // half spinor
|
static constexpr int Nhs=2; // half spinor
|
||||||
@ -80,6 +80,13 @@ template<typename T> struct isSpinor {
|
|||||||
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
|
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
|
||||||
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
|
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
|
||||||
|
|
||||||
|
const int CoarseIndex = 4;
|
||||||
|
template<typename T> struct isCoarsened {
|
||||||
|
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
|
||||||
|
};
|
||||||
|
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
|
||||||
|
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
|
||||||
|
|
||||||
// ChrisK very keen to add extra space for Gparity doubling.
|
// ChrisK very keen to add extra space for Gparity doubling.
|
||||||
//
|
//
|
||||||
// Also add domain wall index, in a way where Wilson operator
|
// Also add domain wall index, in a way where Wilson operator
|
||||||
|
@ -97,42 +97,30 @@ public:
|
|||||||
Coordinate icoor;
|
Coordinate icoor;
|
||||||
|
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
_Spinor tmp;
|
|
||||||
|
|
||||||
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
const int Nsimd =SiteDoubledGaugeField::Nsimd();
|
||||||
int s = acceleratorSIMTlane(Nsimd);
|
int s = acceleratorSIMTlane(Nsimd);
|
||||||
St.iCoorFromIindex(icoor,s);
|
St.iCoorFromIindex(icoor,s);
|
||||||
|
|
||||||
int mmu = mu % Nd;
|
int mmu = mu % Nd;
|
||||||
if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
|
|
||||||
|
|
||||||
int permute_lane = (sl==1)
|
auto UU0=coalescedRead(U(0)(mu));
|
||||||
|| ((distance== 1)&&(icoor[direction]==1))
|
auto UU1=coalescedRead(U(1)(mu));
|
||||||
|| ((distance==-1)&&(icoor[direction]==0));
|
|
||||||
|
|
||||||
if ( permute_lane ) {
|
//Decide whether we do a G-parity flavor twist
|
||||||
tmp(0) = chi(1);
|
//Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
|
||||||
tmp(1) = chi(0);
|
//It also assumes (but does not check) that abs(distance) == 1
|
||||||
} else {
|
int permute_lane = (sl==1)
|
||||||
tmp(0) = chi(0);
|
|| ((distance== 1)&&(icoor[direction]==1))
|
||||||
tmp(1) = chi(1);
|
|| ((distance==-1)&&(icoor[direction]==0));
|
||||||
}
|
|
||||||
|
|
||||||
auto UU0=coalescedRead(U(0)(mu));
|
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
|
||||||
auto UU1=coalescedRead(U(1)(mu));
|
|
||||||
|
|
||||||
mult(&phi(0),&UU0,&tmp(0));
|
//Apply the links
|
||||||
mult(&phi(1),&UU1,&tmp(1));
|
int f_upper = permute_lane ? 1 : 0;
|
||||||
|
int f_lower = !f_upper;
|
||||||
|
|
||||||
} else {
|
mult(&phi(0),&UU0,&chi(f_upper));
|
||||||
|
mult(&phi(1),&UU1,&chi(f_lower));
|
||||||
auto UU0=coalescedRead(U(0)(mu));
|
|
||||||
auto UU1=coalescedRead(U(1)(mu));
|
|
||||||
|
|
||||||
mult(&phi(0),&UU0,&chi(0));
|
|
||||||
mult(&phi(1),&UU1,&chi(1));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
typedef _Spinor vobj;
|
typedef _Spinor vobj;
|
||||||
|
@ -208,7 +208,7 @@ public:
|
|||||||
LebesgueOrder LebesgueEvenOdd;
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
// Comms buffer
|
// Comms buffer
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Conserved current utilities
|
// Conserved current utilities
|
||||||
|
@ -63,17 +63,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Generic Nc kernels
|
// Generic Nc kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<int Naik> accelerator_inline
|
template<int Naik>
|
||||||
|
static accelerator_inline
|
||||||
void DhopSiteGeneric(StencilView &st,
|
void DhopSiteGeneric(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteGenericInt(StencilView &st,
|
void DhopSiteGenericInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteGenericExt(StencilView &st,
|
void DhopSiteGenericExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
@ -82,17 +85,20 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Nc=3 specific kernels
|
// Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteHand(StencilView &st,
|
void DhopSiteHand(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteHandInt(StencilView &st,
|
void DhopSiteHandInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
const FermionFieldView &in, FermionFieldView &out,int dag);
|
const FermionFieldView &in, FermionFieldView &out,int dag);
|
||||||
template<int Naik> accelerator_inline
|
|
||||||
|
template<int Naik> static accelerator_inline
|
||||||
void DhopSiteHandExt(StencilView &st,
|
void DhopSiteHandExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
@ -101,6 +107,7 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Asm Nc=3 specific kernels
|
// Asm Nc=3 specific kernels
|
||||||
///////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
void DhopSiteAsm(StencilView &st,
|
void DhopSiteAsm(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor * buf, int LLs, int sU,
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
@ -74,6 +74,20 @@ public:
|
|||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
|
void Report(void);
|
||||||
|
void ZeroCounters(void);
|
||||||
|
double DhopCalls;
|
||||||
|
double DhopCommTime;
|
||||||
|
double DhopComputeTime;
|
||||||
|
double DhopComputeTime2;
|
||||||
|
double DhopFaceTime;
|
||||||
|
double DhopTotalTime;
|
||||||
|
|
||||||
|
double DerivCalls;
|
||||||
|
double DerivCommTime;
|
||||||
|
double DerivComputeTime;
|
||||||
|
double DerivDhopComputeTime;
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
// override multiply; cut number routines if pass dagger argument
|
// override multiply; cut number routines if pass dagger argument
|
||||||
// and also make interface more uniformly consistent
|
// and also make interface more uniformly consistent
|
||||||
@ -196,5 +210,3 @@ typedef WilsonFermion<WilsonImplF> WilsonFermionF;
|
|||||||
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
|
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
@ -215,7 +215,7 @@ public:
|
|||||||
LebesgueOrder LebesgueEvenOdd;
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
// Comms buffer
|
// Comms buffer
|
||||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
|
|||||||
Current curr_type,
|
Current curr_type,
|
||||||
unsigned int mu)
|
unsigned int mu)
|
||||||
{
|
{
|
||||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
#if (!defined(GRID_HIP))
|
||||||
Gamma::Algebra Gmu [] = {
|
Gamma::Algebra Gmu [] = {
|
||||||
Gamma::Algebra::GammaX,
|
Gamma::Algebra::GammaX,
|
||||||
Gamma::Algebra::GammaY,
|
Gamma::Algebra::GammaY,
|
||||||
@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
|
|
||||||
PropagatorField tmp(UGrid);
|
PropagatorField tmp(UGrid);
|
||||||
PropagatorField Utmp(UGrid);
|
PropagatorField Utmp(UGrid);
|
||||||
LatticeInteger zz (UGrid); zz=0.0;
|
PropagatorField zz (UGrid); zz=0.0;
|
||||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
for (int s=0;s<Ls;s++) {
|
for (int s=0;s<Ls;s++) {
|
||||||
|
|
||||||
@ -826,7 +826,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
|
#if (!defined(GRID_HIP))
|
||||||
int tshift = (mu == Nd-1) ? 1 : 0;
|
int tshift = (mu == Nd-1) ? 1 : 0;
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// GENERAL CAYLEY CASE
|
// GENERAL CAYLEY CASE
|
||||||
@ -850,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
PropagatorField tmp(UGrid);
|
PropagatorField tmp(UGrid);
|
||||||
PropagatorField Utmp(UGrid);
|
PropagatorField Utmp(UGrid);
|
||||||
|
|
||||||
LatticeInteger zz (UGrid); zz=0.0;
|
PropagatorField zz (UGrid); zz=0.0;
|
||||||
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
|
||||||
|
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
|
@ -146,7 +146,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -221,7 +221,7 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -300,7 +300,7 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
|
@ -78,7 +78,7 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
|||||||
// Int, Ext, Int+Ext cases for comms overlap
|
// Int, Ext, Int+Ext cases for comms overlap
|
||||||
////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -126,7 +126,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
|
|||||||
// Only contributions from interior of our node
|
// Only contributions from interior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
@ -174,7 +174,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
|
|||||||
// Only contributions from exterior of our node
|
// Only contributions from exterior of our node
|
||||||
///////////////////////////////////////////////////
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
template <int Naik>
|
template <int Naik> accelerator_inline
|
||||||
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
|
||||||
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
|
||||||
SiteSpinor *buf, int sF, int sU,
|
SiteSpinor *buf, int sF, int sU,
|
||||||
|
@ -92,20 +92,16 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
int lvol = _Umu.Grid()->lSites();
|
int lvol = _Umu.Grid()->lSites();
|
||||||
int DimRep = Impl::Dimension;
|
int DimRep = Impl::Dimension;
|
||||||
|
|
||||||
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
|
||||||
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
|
||||||
|
|
||||||
Coordinate lcoor;
|
|
||||||
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
|
||||||
|
|
||||||
{
|
{
|
||||||
autoView(CTv,CloverTerm,CpuRead);
|
autoView(CTv,CloverTerm,CpuRead);
|
||||||
autoView(CTIv,CloverTermInv,CpuWrite);
|
autoView(CTIv,CloverTermInv,CpuWrite);
|
||||||
for (int site = 0; site < lvol; site++) {
|
thread_for(site, lvol, {
|
||||||
|
Coordinate lcoor;
|
||||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||||
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||||
|
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
|
||||||
|
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
|
||||||
peekLocalSite(Qx, CTv, lcoor);
|
peekLocalSite(Qx, CTv, lcoor);
|
||||||
Qxinv = Zero();
|
|
||||||
//if (csw!=0){
|
//if (csw!=0){
|
||||||
for (int j = 0; j < Ns; j++)
|
for (int j = 0; j < Ns; j++)
|
||||||
for (int k = 0; k < Ns; k++)
|
for (int k = 0; k < Ns; k++)
|
||||||
@ -126,21 +122,21 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
|||||||
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
|
||||||
// }
|
// }
|
||||||
pokeLocalSite(Qxinv, CTIv, lcoor);
|
pokeLocalSite(Qxinv, CTIv, lcoor);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Separate the even and odd parts
|
// Separate the even and odd parts
|
||||||
pickCheckerboard(Even, CloverTermEven, CloverTerm);
|
pickCheckerboard(Even, CloverTermEven, CloverTerm);
|
||||||
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
|
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
|
||||||
|
|
||||||
pickCheckerboard(Even, CloverTermDagEven, closure(adj(CloverTerm)));
|
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
|
||||||
pickCheckerboard(Odd, CloverTermDagOdd, closure(adj(CloverTerm)));
|
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
|
||||||
|
|
||||||
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
|
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
|
||||||
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
|
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
|
||||||
|
|
||||||
pickCheckerboard(Even, CloverTermInvDagEven, closure(adj(CloverTermInv)));
|
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
|
||||||
pickCheckerboard(Odd, CloverTermInvDagOdd, closure(adj(CloverTermInv)));
|
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
|
@ -75,6 +75,91 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
|||||||
StencilOdd.BuildSurfaceList(1,vol4);
|
StencilOdd.BuildSurfaceList(1,vol4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonFermion<Impl>::Report(void)
|
||||||
|
{
|
||||||
|
RealD NP = _grid->_Nprocessors;
|
||||||
|
RealD NN = _grid->NodeCount();
|
||||||
|
RealD volume = 1;
|
||||||
|
Coordinate latt = _grid->GlobalDimensions();
|
||||||
|
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||||
|
|
||||||
|
if ( DhopCalls > 0 ) {
|
||||||
|
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
|
||||||
|
|
||||||
|
// Average the compute time
|
||||||
|
_grid->GlobalSum(DhopComputeTime);
|
||||||
|
DhopComputeTime/=NP;
|
||||||
|
RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||||
|
|
||||||
|
RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( DerivCalls > 0 ) {
|
||||||
|
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " <<DerivCalls <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
|
||||||
|
|
||||||
|
// how to count flops here?
|
||||||
|
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call ? : " << mflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node ? : " << mflops/NP << std::endl;
|
||||||
|
|
||||||
|
// how to count flops here?
|
||||||
|
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call (full) ? : " << Fullmflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl; }
|
||||||
|
|
||||||
|
if (DerivCalls > 0 || DhopCalls > 0){
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion Stencil" <<std::endl; Stencil.Report();
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl; StencilEven.Report();
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
||||||
|
}
|
||||||
|
if ( DhopCalls > 0){
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
|
||||||
|
std::cout << GridLogMessage << "WilsonFermion StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonFermion<Impl>::ZeroCounters(void) {
|
||||||
|
DhopCalls = 0; // ok
|
||||||
|
DhopCommTime = 0;
|
||||||
|
DhopComputeTime = 0;
|
||||||
|
DhopComputeTime2= 0;
|
||||||
|
DhopFaceTime = 0;
|
||||||
|
DhopTotalTime = 0;
|
||||||
|
|
||||||
|
DerivCalls = 0; // ok
|
||||||
|
DerivCommTime = 0;
|
||||||
|
DerivComputeTime = 0;
|
||||||
|
DerivDhopComputeTime = 0;
|
||||||
|
|
||||||
|
Stencil.ZeroCounters();
|
||||||
|
StencilEven.ZeroCounters();
|
||||||
|
StencilOdd.ZeroCounters();
|
||||||
|
Stencil.ZeroCountersi();
|
||||||
|
StencilEven.ZeroCountersi();
|
||||||
|
StencilOdd.ZeroCountersi();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
|
||||||
{
|
{
|
||||||
@ -234,6 +319,7 @@ template <class Impl>
|
|||||||
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||||
GaugeField &mat, const FermionField &A,
|
GaugeField &mat, const FermionField &A,
|
||||||
const FermionField &B, int dag) {
|
const FermionField &B, int dag) {
|
||||||
|
DerivCalls++;
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
@ -242,8 +328,11 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
|||||||
FermionField Atilde(B.Grid());
|
FermionField Atilde(B.Grid());
|
||||||
Atilde = A;
|
Atilde = A;
|
||||||
|
|
||||||
|
DerivCommTime-=usecond();
|
||||||
st.HaloExchange(B, compressor);
|
st.HaloExchange(B, compressor);
|
||||||
|
DerivCommTime+=usecond();
|
||||||
|
|
||||||
|
DerivComputeTime-=usecond();
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
for (int mu = 0; mu < Nd; mu++) {
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Flip gamma (1+g)<->(1-g) if dag
|
// Flip gamma (1+g)<->(1-g) if dag
|
||||||
@ -251,6 +340,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
|||||||
int gamma = mu;
|
int gamma = mu;
|
||||||
if (!dag) gamma += Nd;
|
if (!dag) gamma += Nd;
|
||||||
|
|
||||||
|
DerivDhopComputeTime -= usecond();
|
||||||
int Ls=1;
|
int Ls=1;
|
||||||
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
|
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
|
||||||
|
|
||||||
@ -258,7 +348,9 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
|
|||||||
// spin trace outer product
|
// spin trace outer product
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
|
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
|
||||||
|
DerivDhopComputeTime += usecond();
|
||||||
}
|
}
|
||||||
|
DerivComputeTime += usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -392,13 +484,14 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
|
DhopTotalTime-=usecond();
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
DhopInternalSerial(st,lo,U,in,out,dag);
|
DhopInternalSerial(st,lo,U,in,out,dag);
|
||||||
|
DhopTotalTime+=usecond();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -417,38 +510,53 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
std::vector<std::vector<CommsRequest_t> > requests;
|
std::vector<std::vector<CommsRequest_t> > requests;
|
||||||
st.Prepare();
|
st.Prepare();
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
st.HaloGather(in,compressor);
|
st.HaloGather(in,compressor);
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
|
DhopCommTime -=usecond();
|
||||||
st.CommunicateBegin(requests);
|
st.CommunicateBegin(requests);
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Overlap with comms
|
// Overlap with comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
st.CommsMergeSHM(compressor);
|
st.CommsMergeSHM(compressor);
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// do the compute interior
|
// do the compute interior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
int Opt = WilsonKernelsStatic::Opt;
|
int Opt = WilsonKernelsStatic::Opt;
|
||||||
|
DhopComputeTime-=usecond();
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
||||||
} else {
|
} else {
|
||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
|
||||||
}
|
}
|
||||||
|
DhopComputeTime+=usecond();
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Complete comms
|
// Complete comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
|
DhopCommTime +=usecond();
|
||||||
|
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// do the compute exterior
|
// do the compute exterior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
|
||||||
|
DhopComputeTime2-=usecond();
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
||||||
} else {
|
} else {
|
||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
|
||||||
}
|
}
|
||||||
|
DhopComputeTime2+=usecond();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -460,14 +568,18 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
{
|
{
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
|
DhopCommTime-=usecond();
|
||||||
st.HaloExchange(in, compressor);
|
st.HaloExchange(in, compressor);
|
||||||
|
DhopCommTime+=usecond();
|
||||||
|
|
||||||
|
DhopComputeTime-=usecond();
|
||||||
int Opt = WilsonKernelsStatic::Opt;
|
int Opt = WilsonKernelsStatic::Opt;
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
||||||
} else {
|
} else {
|
||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
|
||||||
}
|
}
|
||||||
|
DhopComputeTime+=usecond();
|
||||||
};
|
};
|
||||||
/*Change ends */
|
/*Change ends */
|
||||||
|
|
||||||
|
450
Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
Normal file
450
Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h
Normal file
@ -0,0 +1,450 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmA64FX.h
|
||||||
|
|
||||||
|
Copyright (C) 2020
|
||||||
|
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
//#if defined(A64FXASM)
|
||||||
|
#if defined(A64FX)
|
||||||
|
|
||||||
|
// safety include
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
// undefine everything related to kernels
|
||||||
|
#include <simd/Fujitsu_A64FX_undef.h>
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// If we are A64FX specialise the single precision routine
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
#if defined(DSLASHINTRIN)
|
||||||
|
//#pragma message ("A64FX Dslash: intrin")
|
||||||
|
#include <simd/Fujitsu_A64FX_intrin_single.h>
|
||||||
|
#else
|
||||||
|
#pragma message ("A64FX Dslash: asm")
|
||||||
|
#include <simd/Fujitsu_A64FX_asm_single.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Switch off the 5d vectorised code optimisations
|
||||||
|
#undef DWFVEC5D
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// XYZT vectorised, undag Kernel, single
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
#undef KERNEL_DAG
|
||||||
|
#define INTERIOR_AND_EXTERIOR
|
||||||
|
#undef INTERIOR
|
||||||
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
|
#define INTERIOR
|
||||||
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
|
#undef INTERIOR
|
||||||
|
#define EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// XYZT vectorised, dag Kernel, single
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
#define KERNEL_DAG
|
||||||
|
#define INTERIOR_AND_EXTERIOR
|
||||||
|
#undef INTERIOR
|
||||||
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
|
#define INTERIOR
|
||||||
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
|
#undef INTERIOR
|
||||||
|
#define EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// undefine
|
||||||
|
#include <simd/Fujitsu_A64FX_undef.h>
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
// If we are A64FX specialise the double precision routine
|
||||||
|
///////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#if defined(DSLASHINTRIN)
|
||||||
|
#include <simd/Fujitsu_A64FX_intrin_double.h>
|
||||||
|
#else
|
||||||
|
#include <simd/Fujitsu_A64FX_asm_double.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// former KNL
|
||||||
|
//#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||||
|
//#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
|
||||||
|
//#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
|
||||||
|
|
||||||
|
|
||||||
|
#define INTERIOR_AND_EXTERIOR
|
||||||
|
#undef INTERIOR
|
||||||
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// XYZT vectorised, undag Kernel, double
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
#undef KERNEL_DAG
|
||||||
|
#define INTERIOR_AND_EXTERIOR
|
||||||
|
#undef INTERIOR
|
||||||
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
|
#define INTERIOR
|
||||||
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
|
#undef INTERIOR
|
||||||
|
#define EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// XYZT vectorised, dag Kernel, double
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
#define KERNEL_DAG
|
||||||
|
#define INTERIOR_AND_EXTERIOR
|
||||||
|
#undef INTERIOR
|
||||||
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
|
#define INTERIOR
|
||||||
|
#undef EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
#undef INTERIOR_AND_EXTERIOR
|
||||||
|
#undef INTERIOR
|
||||||
|
#define EXTERIOR
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
|
||||||
|
template<> void
|
||||||
|
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
|
||||||
|
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// undefs
|
||||||
|
#include <simd/Fujitsu_A64FX_undef.h>
|
||||||
|
|
||||||
|
#endif //A64FXASM
|
@ -0,0 +1,395 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: WilsonKernelsAsmBodyA64FX.h
|
||||||
|
|
||||||
|
Copyright (C) 2020
|
||||||
|
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
// GCC 10 messes up SVE instruction scheduling using -O3, but
|
||||||
|
// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
|
||||||
|
// performance now is better than armclang 20.2
|
||||||
|
|
||||||
|
#ifdef KERNEL_DAG
|
||||||
|
#define DIR0_PROJ XP_PROJ
|
||||||
|
#define DIR1_PROJ YP_PROJ
|
||||||
|
#define DIR2_PROJ ZP_PROJ
|
||||||
|
#define DIR3_PROJ TP_PROJ
|
||||||
|
#define DIR4_PROJ XM_PROJ
|
||||||
|
#define DIR5_PROJ YM_PROJ
|
||||||
|
#define DIR6_PROJ ZM_PROJ
|
||||||
|
#define DIR7_PROJ TM_PROJ
|
||||||
|
#define DIR0_RECON XP_RECON
|
||||||
|
#define DIR1_RECON YP_RECON_ACCUM
|
||||||
|
#define DIR2_RECON ZP_RECON_ACCUM
|
||||||
|
#define DIR3_RECON TP_RECON_ACCUM
|
||||||
|
#define DIR4_RECON XM_RECON_ACCUM
|
||||||
|
#define DIR5_RECON YM_RECON_ACCUM
|
||||||
|
#define DIR6_RECON ZM_RECON_ACCUM
|
||||||
|
#define DIR7_RECON TM_RECON_ACCUM
|
||||||
|
#else
|
||||||
|
#define DIR0_PROJ XM_PROJ
|
||||||
|
#define DIR1_PROJ YM_PROJ
|
||||||
|
#define DIR2_PROJ ZM_PROJ
|
||||||
|
#define DIR3_PROJ TM_PROJ
|
||||||
|
#define DIR4_PROJ XP_PROJ
|
||||||
|
#define DIR5_PROJ YP_PROJ
|
||||||
|
#define DIR6_PROJ ZP_PROJ
|
||||||
|
#define DIR7_PROJ TP_PROJ
|
||||||
|
#define DIR0_RECON XM_RECON
|
||||||
|
#define DIR1_RECON YM_RECON_ACCUM
|
||||||
|
#define DIR2_RECON ZM_RECON_ACCUM
|
||||||
|
#define DIR3_RECON TM_RECON_ACCUM
|
||||||
|
#define DIR4_RECON XP_RECON_ACCUM
|
||||||
|
#define DIR5_RECON YP_RECON_ACCUM
|
||||||
|
#define DIR6_RECON ZP_RECON_ACCUM
|
||||||
|
#define DIR7_RECON TP_RECON_ACCUM
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//using namespace std;
|
||||||
|
|
||||||
|
#undef SHOW
|
||||||
|
//#define SHOW
|
||||||
|
|
||||||
|
#undef WHERE
|
||||||
|
|
||||||
|
#ifdef INTERIOR_AND_EXTERIOR
|
||||||
|
#define WHERE "INT_AND_EXT"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef INTERIOR
|
||||||
|
#define WHERE "INT"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef EXTERIOR
|
||||||
|
#define WHERE "EXT"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//#pragma message("here")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Comms then compute kernel
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#ifdef INTERIOR_AND_EXTERIOR
|
||||||
|
|
||||||
|
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHIMU(base); \
|
||||||
|
LOAD_TABLE(PERMUTE_DIR); \
|
||||||
|
PROJ; \
|
||||||
|
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||||
|
} else { \
|
||||||
|
LOAD_CHI(base); \
|
||||||
|
} \
|
||||||
|
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||||
|
MULT_2SPIN_1(Dir); \
|
||||||
|
PREFETCH_CHIMU(base); \
|
||||||
|
PREFETCH_CHIMU_L2(basep); \
|
||||||
|
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
||||||
|
MULT_2SPIN_2; \
|
||||||
|
if (s == 0) { \
|
||||||
|
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
||||||
|
} \
|
||||||
|
RECON; \
|
||||||
|
|
||||||
|
/*
|
||||||
|
NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
|
||||||
|
though I expected that it would improve on performance
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
|
PREFETCH1_CHIMU(base); \
|
||||||
|
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||||
|
|
||||||
|
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Pre comms kernel -- prefetch like normal because it is mostly right
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#ifdef INTERIOR
|
||||||
|
|
||||||
|
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
|
basep = st.GetPFInfo(nent,plocal); nent++; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHIMU(base); \
|
||||||
|
LOAD_TABLE(PERMUTE_DIR); \
|
||||||
|
PROJ; \
|
||||||
|
MAYBEPERM(PERMUTE_DIR,perm); \
|
||||||
|
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
|
||||||
|
if ( local || st.same_node[Dir] ) { \
|
||||||
|
MULT_2SPIN_1(Dir); \
|
||||||
|
MULT_2SPIN_2; \
|
||||||
|
RECON; \
|
||||||
|
} \
|
||||||
|
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
|
||||||
|
PREFETCH_CHIMU(base); \
|
||||||
|
PREFETCH_CHIMU_L2(basep); \
|
||||||
|
|
||||||
|
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
|
PREFETCH1_CHIMU(base); \
|
||||||
|
{ ZERO_PSI; } \
|
||||||
|
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
|
||||||
|
|
||||||
|
#define RESULT(base,basep) SAVE_RESULT(base,basep);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Post comms kernel
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#ifdef EXTERIOR
|
||||||
|
|
||||||
|
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
|
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||||
|
LOAD_CHI(base); \
|
||||||
|
MULT_2SPIN_1(Dir); \
|
||||||
|
MULT_2SPIN_2; \
|
||||||
|
RECON; \
|
||||||
|
nmu++; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
|
||||||
|
nmu=0; \
|
||||||
|
{ ZERO_PSI;} \
|
||||||
|
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
|
||||||
|
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||||
|
LOAD_CHI(base); \
|
||||||
|
MULT_2SPIN_1(Dir); \
|
||||||
|
MULT_2SPIN_2; \
|
||||||
|
RECON; \
|
||||||
|
nmu++; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
{
|
||||||
|
int nmu;
|
||||||
|
int local,perm, ptype;
|
||||||
|
uint64_t base;
|
||||||
|
uint64_t basep;
|
||||||
|
const uint64_t plocal =(uint64_t) & in[0];
|
||||||
|
|
||||||
|
MASK_REGS;
|
||||||
|
int nmax=U.oSites();
|
||||||
|
for(int site=0;site<Ns;site++) {
|
||||||
|
#ifndef EXTERIOR
|
||||||
|
// int sU =lo.Reorder(ssU);
|
||||||
|
int sU =ssU;
|
||||||
|
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||||
|
// int sUn=lo.Reorder(ssn);
|
||||||
|
int sUn=ssn;
|
||||||
|
#else
|
||||||
|
int sU =ssU;
|
||||||
|
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
|
||||||
|
int sUn=ssn;
|
||||||
|
#endif
|
||||||
|
for(int s=0;s<Ls;s++) {
|
||||||
|
ss =sU*Ls+s;
|
||||||
|
ssn=sUn*Ls+s;
|
||||||
|
int ent=ss*8;// 2*Ndim
|
||||||
|
int nent=ssn*8;
|
||||||
|
|
||||||
|
uint64_t delta_base, delta_base_p;
|
||||||
|
|
||||||
|
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJ,DIR0_RECON);
|
||||||
|
|
||||||
|
#ifdef SHOW
|
||||||
|
float rescale = 64. * 12.;
|
||||||
|
std::cout << "=================================================================" << std::endl;
|
||||||
|
std::cout << "ss = " << ss << " ssn = " << ssn << std::endl;
|
||||||
|
std::cout << "sU = " << sU << " ssU = " << ssU << std::endl;
|
||||||
|
std::cout << " " << std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
std::cout << "Dir = " << Xp << " " << WHERE<< std::endl;
|
||||||
|
|
||||||
|
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||||
|
std::cout << "st.same_node[Dir] = " << st.same_node[Xp] << std::endl;
|
||||||
|
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||||
|
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||||
|
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||||
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJ,DIR1_RECON);
|
||||||
|
|
||||||
|
#ifdef SHOW
|
||||||
|
std::cout << "Dir = " << Yp << " " << WHERE<< std::endl;
|
||||||
|
|
||||||
|
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||||
|
std::cout << "st.same_node[Dir] = " << st.same_node[Yp] << std::endl;
|
||||||
|
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||||
|
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||||
|
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||||
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJ,DIR2_RECON);
|
||||||
|
|
||||||
|
#ifdef SHOW
|
||||||
|
std::cout << "Dir = " << Zp << " " << WHERE<< std::endl;
|
||||||
|
|
||||||
|
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||||
|
std::cout << "st.same_node[Dir] = " << st.same_node[Zp] << std::endl;
|
||||||
|
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||||
|
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||||
|
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||||
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJ,DIR3_RECON);
|
||||||
|
|
||||||
|
#ifdef SHOW
|
||||||
|
std::cout << "Dir = " << Tp << " " << WHERE<< std::endl;
|
||||||
|
|
||||||
|
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||||
|
std::cout << "st.same_node[Dir] = " << st.same_node[Tp] << std::endl;
|
||||||
|
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||||
|
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||||
|
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||||
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJ,DIR4_RECON);
|
||||||
|
|
||||||
|
#ifdef SHOW
|
||||||
|
std::cout << "Dir = " << Xm << " " << WHERE<< std::endl;
|
||||||
|
|
||||||
|
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||||
|
std::cout << "st.same_node[Dir] = " << st.same_node[Xm] << std::endl;
|
||||||
|
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||||
|
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||||
|
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||||
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// DC ZVA test
|
||||||
|
// { uint64_t basestore = (uint64_t)&out[ss];
|
||||||
|
// PREFETCH_RESULT_L2_STORE(basestore); }
|
||||||
|
|
||||||
|
|
||||||
|
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
|
||||||
|
|
||||||
|
#ifdef SHOW
|
||||||
|
std::cout << "Dir = " << Ym << " " << WHERE<< std::endl;
|
||||||
|
|
||||||
|
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||||
|
std::cout << "st.same_node[Dir] = " << st.same_node[Ym] << std::endl;
|
||||||
|
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||||
|
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||||
|
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||||
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// DC ZVA test
|
||||||
|
//{ uint64_t basestore = (uint64_t)&out[ss];
|
||||||
|
// PREFETCH_RESULT_L2_STORE(basestore); }
|
||||||
|
|
||||||
|
|
||||||
|
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
|
||||||
|
|
||||||
|
#ifdef SHOW
|
||||||
|
std::cout << "Dir = " << Zm << " " << WHERE<< std::endl;
|
||||||
|
|
||||||
|
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||||
|
std::cout << "st.same_node[Dir] = " << st.same_node[Zm] << std::endl;
|
||||||
|
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||||
|
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||||
|
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||||
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// DC ZVA test
|
||||||
|
//{ uint64_t basestore = (uint64_t)&out[ss];
|
||||||
|
// PREFETCH_RESULT_L2_STORE(basestore); }
|
||||||
|
|
||||||
|
|
||||||
|
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
|
||||||
|
|
||||||
|
#ifdef SHOW
|
||||||
|
std::cout << "Dir = " << Tm << " " << WHERE<< std::endl;
|
||||||
|
|
||||||
|
std::cout << "ent nent local perm = " << ent << " " << nent << " " << local << " " << perm << std::endl;
|
||||||
|
std::cout << "st.same_node[Dir] = " << st.same_node[Tm] << std::endl;
|
||||||
|
std::cout << "base = " << (base - plocal)/rescale << std::endl;
|
||||||
|
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||||
|
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||||
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef EXTERIOR
|
||||||
|
if (nmu==0) break;
|
||||||
|
// if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
|
||||||
|
#endif
|
||||||
|
base = (uint64_t) &out[ss];
|
||||||
|
basep= st.GetPFInfo(nent,plocal); ent++;
|
||||||
|
basep = (uint64_t) &out[ssn];
|
||||||
|
//PREFETCH_RESULT_L1_STORE(base);
|
||||||
|
RESULT(base,basep);
|
||||||
|
|
||||||
|
#ifdef SHOW
|
||||||
|
std::cout << "Dir = FINAL " << WHERE<< std::endl;;
|
||||||
|
|
||||||
|
base_ss = base;
|
||||||
|
std::cout << "base = " << (base - (uint64_t) &out[0])/rescale << std::endl;
|
||||||
|
std::cout << "Basep = " << (basep - plocal)/rescale << std::endl;
|
||||||
|
//printf("U = %llu\n", (uint64_t)&[sU](Dir));
|
||||||
|
std::cout << "----------------------------------------------------" << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
ssU++;
|
||||||
|
UNLOCK_GAUGE(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef DIR0_PROJ
|
||||||
|
#undef DIR1_PROJ
|
||||||
|
#undef DIR2_PROJ
|
||||||
|
#undef DIR3_PROJ
|
||||||
|
#undef DIR4_PROJ
|
||||||
|
#undef DIR5_PROJ
|
||||||
|
#undef DIR6_PROJ
|
||||||
|
#undef DIR7_PROJ
|
||||||
|
#undef DIR0_RECON
|
||||||
|
#undef DIR1_RECON
|
||||||
|
#undef DIR2_RECON
|
||||||
|
#undef DIR3_RECON
|
||||||
|
#undef DIR4_RECON
|
||||||
|
#undef DIR5_RECON
|
||||||
|
#undef DIR6_RECON
|
||||||
|
#undef DIR7_RECON
|
||||||
|
#undef ASM_LEG
|
||||||
|
#undef ASM_LEG_XP
|
||||||
|
#undef RESULT
|
@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_RESULT_EXT(ss,F)
|
HAND_RESULT_EXT(ss,F)
|
||||||
|
|
||||||
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
nmu = 0; \
|
nmu = 0; \
|
||||||
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
} \
|
} \
|
||||||
template<> void \
|
template<> accelerator_inline void \
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -495,7 +495,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> accelerator_inline void
|
||||||
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -519,7 +519,7 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -542,7 +542,7 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> accelerator_inline void
|
||||||
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -566,7 +566,7 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -589,7 +589,7 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
|
|||||||
HAND_RESULT(ss);
|
HAND_RESULT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> accelerator_inline void
|
||||||
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
@ -614,7 +614,7 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
|
|||||||
HAND_RESULT_EXT(ss);
|
HAND_RESULT_EXT(ss);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
{
|
{
|
||||||
|
@ -0,0 +1,943 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
|
||||||
|
|
||||||
|
#undef LOAD_CHIMU
|
||||||
|
#undef LOAD_CHI
|
||||||
|
#undef MULT_2SPIN
|
||||||
|
#undef PERMUTE_DIR
|
||||||
|
#undef XP_PROJ
|
||||||
|
#undef YP_PROJ
|
||||||
|
#undef ZP_PROJ
|
||||||
|
#undef TP_PROJ
|
||||||
|
#undef XM_PROJ
|
||||||
|
#undef YM_PROJ
|
||||||
|
#undef ZM_PROJ
|
||||||
|
#undef TM_PROJ
|
||||||
|
#undef XP_RECON
|
||||||
|
#undef XP_RECON_ACCUM
|
||||||
|
#undef XM_RECON
|
||||||
|
#undef XM_RECON_ACCUM
|
||||||
|
#undef YP_RECON_ACCUM
|
||||||
|
#undef YM_RECON_ACCUM
|
||||||
|
#undef ZP_RECON_ACCUM
|
||||||
|
#undef ZM_RECON_ACCUM
|
||||||
|
#undef TP_RECON_ACCUM
|
||||||
|
#undef TM_RECON_ACCUM
|
||||||
|
#undef ZERO_RESULT
|
||||||
|
#undef Chimu_00
|
||||||
|
#undef Chimu_01
|
||||||
|
#undef Chimu_02
|
||||||
|
#undef Chimu_10
|
||||||
|
#undef Chimu_11
|
||||||
|
#undef Chimu_12
|
||||||
|
#undef Chimu_20
|
||||||
|
#undef Chimu_21
|
||||||
|
#undef Chimu_22
|
||||||
|
#undef Chimu_30
|
||||||
|
#undef Chimu_31
|
||||||
|
#undef Chimu_32
|
||||||
|
#undef HAND_STENCIL_LEG
|
||||||
|
#undef HAND_STENCIL_LEG_INT
|
||||||
|
#undef HAND_STENCIL_LEG_EXT
|
||||||
|
#undef HAND_RESULT
|
||||||
|
#undef HAND_RESULT_INT
|
||||||
|
#undef HAND_RESULT_EXT
|
||||||
|
|
||||||
|
#define REGISTER
|
||||||
|
|
||||||
|
#define LOAD_CHIMU \
|
||||||
|
{const SiteSpinor & ref (in[offset]); \
|
||||||
|
Chimu_00=ref()(0)(0);\
|
||||||
|
Chimu_01=ref()(0)(1);\
|
||||||
|
Chimu_02=ref()(0)(2);\
|
||||||
|
Chimu_10=ref()(1)(0);\
|
||||||
|
Chimu_11=ref()(1)(1);\
|
||||||
|
Chimu_12=ref()(1)(2);\
|
||||||
|
Chimu_20=ref()(2)(0);\
|
||||||
|
Chimu_21=ref()(2)(1);\
|
||||||
|
Chimu_22=ref()(2)(2);\
|
||||||
|
Chimu_30=ref()(3)(0);\
|
||||||
|
Chimu_31=ref()(3)(1);\
|
||||||
|
Chimu_32=ref()(3)(2);\
|
||||||
|
std::cout << std::endl << "DEBUG -- LOAD_CHIMU" << std::endl; \
|
||||||
|
std::cout << "Chimu_00 -- " << Chimu_00 << std::endl; \
|
||||||
|
std::cout << "Chimu_01 -- " << Chimu_01 << std::endl; \
|
||||||
|
std::cout << "Chimu_02 -- " << Chimu_02 << std::endl; \
|
||||||
|
std::cout << "Chimu_10 -- " << Chimu_10 << std::endl; \
|
||||||
|
std::cout << "Chimu_11 -- " << Chimu_11 << std::endl; \
|
||||||
|
std::cout << "Chimu_12 -- " << Chimu_12 << std::endl; \
|
||||||
|
std::cout << "Chimu_20 -- " << Chimu_20 << std::endl; \
|
||||||
|
std::cout << "Chimu_21 -- " << Chimu_21 << std::endl; \
|
||||||
|
std::cout << "Chimu_22 -- " << Chimu_22 << std::endl; \
|
||||||
|
std::cout << "Chimu_30 -- " << Chimu_30 << std::endl; \
|
||||||
|
std::cout << "Chimu_31 -- " << Chimu_31 << std::endl; \
|
||||||
|
std::cout << "Chimu_32 -- " << Chimu_32 << std::endl; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define LOAD_CHI\
|
||||||
|
{const SiteHalfSpinor &ref(buf[offset]); \
|
||||||
|
Chi_00 = ref()(0)(0);\
|
||||||
|
Chi_01 = ref()(0)(1);\
|
||||||
|
Chi_02 = ref()(0)(2);\
|
||||||
|
Chi_10 = ref()(1)(0);\
|
||||||
|
Chi_11 = ref()(1)(1);\
|
||||||
|
Chi_12 = ref()(1)(2);\
|
||||||
|
std::cout << std::endl << "DEBUG -- LOAD_CHI" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl; \
|
||||||
|
}
|
||||||
|
|
||||||
|
// To splat or not to splat depends on the implementation
|
||||||
|
#define MULT_2SPIN(A)\
|
||||||
|
{auto & ref(U[sU](A)); \
|
||||||
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
|
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||||
|
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||||
|
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||||
|
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||||
|
UChi_00 = U_00*Chi_00;\
|
||||||
|
UChi_10 = U_00*Chi_10;\
|
||||||
|
UChi_01 = U_10*Chi_00;\
|
||||||
|
UChi_11 = U_10*Chi_10;\
|
||||||
|
UChi_02 = U_20*Chi_00;\
|
||||||
|
UChi_12 = U_20*Chi_10;\
|
||||||
|
UChi_00+= U_01*Chi_01;\
|
||||||
|
UChi_10+= U_01*Chi_11;\
|
||||||
|
UChi_01+= U_11*Chi_01;\
|
||||||
|
UChi_11+= U_11*Chi_11;\
|
||||||
|
UChi_02+= U_21*Chi_01;\
|
||||||
|
UChi_12+= U_21*Chi_11;\
|
||||||
|
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||||
|
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||||
|
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||||
|
UChi_00+= U_00*Chi_02;\
|
||||||
|
UChi_10+= U_00*Chi_12;\
|
||||||
|
UChi_01+= U_10*Chi_02;\
|
||||||
|
UChi_11+= U_10*Chi_12;\
|
||||||
|
UChi_02+= U_20*Chi_02;\
|
||||||
|
UChi_12+= U_20*Chi_12;\
|
||||||
|
std::cout << std::endl << "DEBUG -- MULT_2SPIN" << std::endl; \
|
||||||
|
std::cout << "UChi_00 -- " << UChi_00 << std::endl; \
|
||||||
|
std::cout << "UChi_01 -- " << UChi_01 << std::endl; \
|
||||||
|
std::cout << "UChi_02 -- " << UChi_02 << std::endl; \
|
||||||
|
std::cout << "UChi_10 -- " << UChi_10 << std::endl; \
|
||||||
|
std::cout << "UChi_11 -- " << UChi_11 << std::endl; \
|
||||||
|
std::cout << "UChi_12 -- " << UChi_12 << std::endl; \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define PERMUTE_DIR(dir) \
|
||||||
|
std::cout << std::endl << "DEBUG -- PERM PRE" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl; \
|
||||||
|
permute##dir(Chi_00,Chi_00);\
|
||||||
|
permute##dir(Chi_01,Chi_01);\
|
||||||
|
permute##dir(Chi_02,Chi_02);\
|
||||||
|
permute##dir(Chi_10,Chi_10);\
|
||||||
|
permute##dir(Chi_11,Chi_11);\
|
||||||
|
permute##dir(Chi_12,Chi_12);\
|
||||||
|
std::cout << std::endl << "DEBUG -- PERM POST" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||||
|
|
||||||
|
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||||
|
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||||
|
#define XP_PROJ \
|
||||||
|
Chi_00 = Chimu_00+timesI(Chimu_30);\
|
||||||
|
Chi_01 = Chimu_01+timesI(Chimu_31);\
|
||||||
|
Chi_02 = Chimu_02+timesI(Chimu_32);\
|
||||||
|
Chi_10 = Chimu_10+timesI(Chimu_20);\
|
||||||
|
Chi_11 = Chimu_11+timesI(Chimu_21);\
|
||||||
|
Chi_12 = Chimu_12+timesI(Chimu_22);\
|
||||||
|
std::cout << std::endl << "DEBUG -- XP_PROJ" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||||
|
|
||||||
|
#define YP_PROJ \
|
||||||
|
Chi_00 = Chimu_00-Chimu_30;\
|
||||||
|
Chi_01 = Chimu_01-Chimu_31;\
|
||||||
|
Chi_02 = Chimu_02-Chimu_32;\
|
||||||
|
Chi_10 = Chimu_10+Chimu_20;\
|
||||||
|
Chi_11 = Chimu_11+Chimu_21;\
|
||||||
|
Chi_12 = Chimu_12+Chimu_22;\
|
||||||
|
std::cout << std::endl << "DEBUG -- YP_PROJ" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||||
|
|
||||||
|
#define ZP_PROJ \
|
||||||
|
Chi_00 = Chimu_00+timesI(Chimu_20); \
|
||||||
|
Chi_01 = Chimu_01+timesI(Chimu_21); \
|
||||||
|
Chi_02 = Chimu_02+timesI(Chimu_22); \
|
||||||
|
Chi_10 = Chimu_10-timesI(Chimu_30); \
|
||||||
|
Chi_11 = Chimu_11-timesI(Chimu_31); \
|
||||||
|
Chi_12 = Chimu_12-timesI(Chimu_32);\
|
||||||
|
std::cout << std::endl << "DEBUG -- ZP_PROJ" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||||
|
|
||||||
|
#define TP_PROJ \
|
||||||
|
Chi_00 = Chimu_00+Chimu_20; \
|
||||||
|
Chi_01 = Chimu_01+Chimu_21; \
|
||||||
|
Chi_02 = Chimu_02+Chimu_22; \
|
||||||
|
Chi_10 = Chimu_10+Chimu_30; \
|
||||||
|
Chi_11 = Chimu_11+Chimu_31; \
|
||||||
|
Chi_12 = Chimu_12+Chimu_32;\
|
||||||
|
std::cout << std::endl << "DEBUG -- TP_PROJ" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
// hspin(0)=fspin(0)-timesI(fspin(3));
|
||||||
|
// hspin(1)=fspin(1)-timesI(fspin(2));
|
||||||
|
#define XM_PROJ \
|
||||||
|
Chi_00 = Chimu_00-timesI(Chimu_30);\
|
||||||
|
Chi_01 = Chimu_01-timesI(Chimu_31);\
|
||||||
|
Chi_02 = Chimu_02-timesI(Chimu_32);\
|
||||||
|
Chi_10 = Chimu_10-timesI(Chimu_20);\
|
||||||
|
Chi_11 = Chimu_11-timesI(Chimu_21);\
|
||||||
|
Chi_12 = Chimu_12-timesI(Chimu_22);\
|
||||||
|
std::cout << std::endl << "DEBUG -- XM_PROJ" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||||
|
|
||||||
|
#define YM_PROJ \
|
||||||
|
Chi_00 = Chimu_00+Chimu_30;\
|
||||||
|
Chi_01 = Chimu_01+Chimu_31;\
|
||||||
|
Chi_02 = Chimu_02+Chimu_32;\
|
||||||
|
Chi_10 = Chimu_10-Chimu_20;\
|
||||||
|
Chi_11 = Chimu_11-Chimu_21;\
|
||||||
|
Chi_12 = Chimu_12-Chimu_22;\
|
||||||
|
std::cout << std::endl << "DEBUG -- YM_PROJ" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||||
|
|
||||||
|
#define ZM_PROJ \
|
||||||
|
Chi_00 = Chimu_00-timesI(Chimu_20); \
|
||||||
|
Chi_01 = Chimu_01-timesI(Chimu_21); \
|
||||||
|
Chi_02 = Chimu_02-timesI(Chimu_22); \
|
||||||
|
Chi_10 = Chimu_10+timesI(Chimu_30); \
|
||||||
|
Chi_11 = Chimu_11+timesI(Chimu_31); \
|
||||||
|
Chi_12 = Chimu_12+timesI(Chimu_32);\
|
||||||
|
std::cout << std::endl << "DEBUG -- ZM_PROJ" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||||
|
|
||||||
|
#define TM_PROJ \
|
||||||
|
Chi_00 = Chimu_00-Chimu_20; \
|
||||||
|
Chi_01 = Chimu_01-Chimu_21; \
|
||||||
|
Chi_02 = Chimu_02-Chimu_22; \
|
||||||
|
Chi_10 = Chimu_10-Chimu_30; \
|
||||||
|
Chi_11 = Chimu_11-Chimu_31; \
|
||||||
|
Chi_12 = Chimu_12-Chimu_32;\
|
||||||
|
std::cout << std::endl << "DEBUG -- TM_PROJ" << std::endl; \
|
||||||
|
std::cout << "Chi_00 -- " << Chi_00 << std::endl; \
|
||||||
|
std::cout << "Chi_01 -- " << Chi_01 << std::endl; \
|
||||||
|
std::cout << "Chi_02 -- " << Chi_02 << std::endl; \
|
||||||
|
std::cout << "Chi_10 -- " << Chi_10 << std::endl; \
|
||||||
|
std::cout << "Chi_11 -- " << Chi_11 << std::endl; \
|
||||||
|
std::cout << "Chi_12 -- " << Chi_12 << std::endl;
|
||||||
|
|
||||||
|
// fspin(0)=hspin(0);
|
||||||
|
// fspin(1)=hspin(1);
|
||||||
|
// fspin(2)=timesMinusI(hspin(1));
|
||||||
|
// fspin(3)=timesMinusI(hspin(0));
|
||||||
|
#define XP_RECON\
|
||||||
|
result_00 = UChi_00;\
|
||||||
|
result_01 = UChi_01;\
|
||||||
|
result_02 = UChi_02;\
|
||||||
|
result_10 = UChi_10;\
|
||||||
|
result_11 = UChi_11;\
|
||||||
|
result_12 = UChi_12;\
|
||||||
|
result_20 = timesMinusI(UChi_10);\
|
||||||
|
result_21 = timesMinusI(UChi_11);\
|
||||||
|
result_22 = timesMinusI(UChi_12);\
|
||||||
|
result_30 = timesMinusI(UChi_00);\
|
||||||
|
result_31 = timesMinusI(UChi_01);\
|
||||||
|
result_32 = timesMinusI(UChi_02);\
|
||||||
|
std::cout << std::endl << "DEBUG -- XP_RECON" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define XP_RECON_ACCUM\
|
||||||
|
result_00+=UChi_00;\
|
||||||
|
result_01+=UChi_01;\
|
||||||
|
result_02+=UChi_02;\
|
||||||
|
result_10+=UChi_10;\
|
||||||
|
result_11+=UChi_11;\
|
||||||
|
result_12+=UChi_12;\
|
||||||
|
result_20-=timesI(UChi_10);\
|
||||||
|
result_21-=timesI(UChi_11);\
|
||||||
|
result_22-=timesI(UChi_12);\
|
||||||
|
result_30-=timesI(UChi_00);\
|
||||||
|
result_31-=timesI(UChi_01);\
|
||||||
|
result_32-=timesI(UChi_02);\
|
||||||
|
std::cout << std::endl << "DEBUG -- XP_RECON_ACCUM" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define XM_RECON\
|
||||||
|
result_00 = UChi_00;\
|
||||||
|
result_01 = UChi_01;\
|
||||||
|
result_02 = UChi_02;\
|
||||||
|
result_10 = UChi_10;\
|
||||||
|
result_11 = UChi_11;\
|
||||||
|
result_12 = UChi_12;\
|
||||||
|
result_20 = timesI(UChi_10);\
|
||||||
|
result_21 = timesI(UChi_11);\
|
||||||
|
result_22 = timesI(UChi_12);\
|
||||||
|
result_30 = timesI(UChi_00);\
|
||||||
|
result_31 = timesI(UChi_01);\
|
||||||
|
result_32 = timesI(UChi_02);\
|
||||||
|
std::cout << std::endl << "DEBUG -- XM_RECON" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define XM_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20+= timesI(UChi_10);\
|
||||||
|
result_21+= timesI(UChi_11);\
|
||||||
|
result_22+= timesI(UChi_12);\
|
||||||
|
result_30+= timesI(UChi_00);\
|
||||||
|
result_31+= timesI(UChi_01);\
|
||||||
|
result_32+= timesI(UChi_02);\
|
||||||
|
std::cout << std::endl << "DEBUG -- XM_RECON_ACCUM" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define YP_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20+= UChi_10;\
|
||||||
|
result_21+= UChi_11;\
|
||||||
|
result_22+= UChi_12;\
|
||||||
|
result_30-= UChi_00;\
|
||||||
|
result_31-= UChi_01;\
|
||||||
|
result_32-= UChi_02;\
|
||||||
|
std::cout << std::endl << "DEBUG -- YP_RECON_ACCUM" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define YM_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20-= UChi_10;\
|
||||||
|
result_21-= UChi_11;\
|
||||||
|
result_22-= UChi_12;\
|
||||||
|
result_30+= UChi_00;\
|
||||||
|
result_31+= UChi_01;\
|
||||||
|
result_32+= UChi_02;\
|
||||||
|
std::cout << std::endl << "DEBUG -- YM_RECON_ACCUM" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define ZP_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20-= timesI(UChi_00); \
|
||||||
|
result_21-= timesI(UChi_01); \
|
||||||
|
result_22-= timesI(UChi_02); \
|
||||||
|
result_30+= timesI(UChi_10); \
|
||||||
|
result_31+= timesI(UChi_11); \
|
||||||
|
result_32+= timesI(UChi_12);\
|
||||||
|
std::cout << std::endl << "DEBUG -- ZP_RECON_ACCUM" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define ZM_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20+= timesI(UChi_00); \
|
||||||
|
result_21+= timesI(UChi_01); \
|
||||||
|
result_22+= timesI(UChi_02); \
|
||||||
|
result_30-= timesI(UChi_10); \
|
||||||
|
result_31-= timesI(UChi_11); \
|
||||||
|
result_32-= timesI(UChi_12);\
|
||||||
|
std::cout << std::endl << "DEBUG -- ZM_RECON_ACCUM" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define TP_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20+= UChi_00; \
|
||||||
|
result_21+= UChi_01; \
|
||||||
|
result_22+= UChi_02; \
|
||||||
|
result_30+= UChi_10; \
|
||||||
|
result_31+= UChi_11; \
|
||||||
|
result_32+= UChi_12;\
|
||||||
|
std::cout << std::endl << "DEBUG -- TP_RECON_ACCUM" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define TM_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20-= UChi_00; \
|
||||||
|
result_21-= UChi_01; \
|
||||||
|
result_22-= UChi_02; \
|
||||||
|
result_30-= UChi_10; \
|
||||||
|
result_31-= UChi_11; \
|
||||||
|
result_32-= UChi_12;\
|
||||||
|
std::cout << std::endl << "DEBUG -- TM_RECON_ACCUM" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
|
||||||
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHIMU; \
|
||||||
|
PROJ; \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(PERM); \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
|
LOAD_CHI; \
|
||||||
|
} \
|
||||||
|
MULT_2SPIN(DIR); \
|
||||||
|
RECON;
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
|
||||||
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHIMU; \
|
||||||
|
PROJ; \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(PERM); \
|
||||||
|
} \
|
||||||
|
} else if ( st.same_node[DIR] ) { \
|
||||||
|
LOAD_CHI; \
|
||||||
|
} \
|
||||||
|
if (local || st.same_node[DIR] ) { \
|
||||||
|
MULT_2SPIN(DIR); \
|
||||||
|
RECON; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
|
||||||
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
||||||
|
LOAD_CHI; \
|
||||||
|
MULT_2SPIN(DIR); \
|
||||||
|
RECON; \
|
||||||
|
nmu++; \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_RESULT(ss) \
|
||||||
|
{ \
|
||||||
|
SiteSpinor & ref (out[ss]); \
|
||||||
|
vstream(ref()(0)(0),result_00); \
|
||||||
|
vstream(ref()(0)(1),result_01); \
|
||||||
|
vstream(ref()(0)(2),result_02); \
|
||||||
|
vstream(ref()(1)(0),result_10); \
|
||||||
|
vstream(ref()(1)(1),result_11); \
|
||||||
|
vstream(ref()(1)(2),result_12); \
|
||||||
|
vstream(ref()(2)(0),result_20); \
|
||||||
|
vstream(ref()(2)(1),result_21); \
|
||||||
|
vstream(ref()(2)(2),result_22); \
|
||||||
|
vstream(ref()(3)(0),result_30); \
|
||||||
|
vstream(ref()(3)(1),result_31); \
|
||||||
|
vstream(ref()(3)(2),result_32); \
|
||||||
|
std::cout << std::endl << "DEBUG -- RESULT" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;\
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_RESULT_EXT(ss) \
|
||||||
|
if (nmu){ \
|
||||||
|
SiteSpinor & ref (out[ss]); \
|
||||||
|
ref()(0)(0)+=result_00; \
|
||||||
|
ref()(0)(1)+=result_01; \
|
||||||
|
ref()(0)(2)+=result_02; \
|
||||||
|
ref()(1)(0)+=result_10; \
|
||||||
|
ref()(1)(1)+=result_11; \
|
||||||
|
ref()(1)(2)+=result_12; \
|
||||||
|
ref()(2)(0)+=result_20; \
|
||||||
|
ref()(2)(1)+=result_21; \
|
||||||
|
ref()(2)(2)+=result_22; \
|
||||||
|
ref()(3)(0)+=result_30; \
|
||||||
|
ref()(3)(1)+=result_31; \
|
||||||
|
ref()(3)(2)+=result_32; \
|
||||||
|
std::cout << std::endl << "DEBUG -- RESULT EXT" << std::endl; \
|
||||||
|
std::cout << "result_00 -- " << result_00 << std::endl; \
|
||||||
|
std::cout << "result_01 -- " << result_01 << std::endl; \
|
||||||
|
std::cout << "result_02 -- " << result_02 << std::endl; \
|
||||||
|
std::cout << "result_10 -- " << result_10 << std::endl; \
|
||||||
|
std::cout << "result_11 -- " << result_11 << std::endl; \
|
||||||
|
std::cout << "result_12 -- " << result_12 << std::endl; \
|
||||||
|
std::cout << "result_20 -- " << result_20 << std::endl; \
|
||||||
|
std::cout << "result_21 -- " << result_21 << std::endl; \
|
||||||
|
std::cout << "result_22 -- " << result_22 << std::endl; \
|
||||||
|
std::cout << "result_30 -- " << result_30 << std::endl; \
|
||||||
|
std::cout << "result_31 -- " << result_31 << std::endl; \
|
||||||
|
std::cout << "result_32 -- " << result_32 << std::endl;\
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define HAND_DECLARATIONS(a) \
|
||||||
|
Simd result_00; \
|
||||||
|
Simd result_01; \
|
||||||
|
Simd result_02; \
|
||||||
|
Simd result_10; \
|
||||||
|
Simd result_11; \
|
||||||
|
Simd result_12; \
|
||||||
|
Simd result_20; \
|
||||||
|
Simd result_21; \
|
||||||
|
Simd result_22; \
|
||||||
|
Simd result_30; \
|
||||||
|
Simd result_31; \
|
||||||
|
Simd result_32; \
|
||||||
|
Simd Chi_00; \
|
||||||
|
Simd Chi_01; \
|
||||||
|
Simd Chi_02; \
|
||||||
|
Simd Chi_10; \
|
||||||
|
Simd Chi_11; \
|
||||||
|
Simd Chi_12; \
|
||||||
|
Simd UChi_00; \
|
||||||
|
Simd UChi_01; \
|
||||||
|
Simd UChi_02; \
|
||||||
|
Simd UChi_10; \
|
||||||
|
Simd UChi_11; \
|
||||||
|
Simd UChi_12; \
|
||||||
|
Simd U_00; \
|
||||||
|
Simd U_10; \
|
||||||
|
Simd U_20; \
|
||||||
|
Simd U_01; \
|
||||||
|
Simd U_11; \
|
||||||
|
Simd U_21;\
|
||||||
|
Simd debugreg;\
|
||||||
|
svbool_t pg1; \
|
||||||
|
pg1 = svptrue_b64(); \
|
||||||
|
|
||||||
|
#define ZERO_RESULT \
|
||||||
|
result_00=Zero(); \
|
||||||
|
result_01=Zero(); \
|
||||||
|
result_02=Zero(); \
|
||||||
|
result_10=Zero(); \
|
||||||
|
result_11=Zero(); \
|
||||||
|
result_12=Zero(); \
|
||||||
|
result_20=Zero(); \
|
||||||
|
result_21=Zero(); \
|
||||||
|
result_22=Zero(); \
|
||||||
|
result_30=Zero(); \
|
||||||
|
result_31=Zero(); \
|
||||||
|
result_32=Zero();
|
||||||
|
|
||||||
|
#define Chimu_00 Chi_00
|
||||||
|
#define Chimu_01 Chi_01
|
||||||
|
#define Chimu_02 Chi_02
|
||||||
|
#define Chimu_10 Chi_10
|
||||||
|
#define Chimu_11 Chi_11
|
||||||
|
#define Chimu_12 Chi_12
|
||||||
|
#define Chimu_20 UChi_00
|
||||||
|
#define Chimu_21 UChi_01
|
||||||
|
#define Chimu_22 UChi_02
|
||||||
|
#define Chimu_30 UChi_10
|
||||||
|
#define Chimu_31 UChi_11
|
||||||
|
#define Chimu_32 UChi_12
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
template<class Impl> void
|
||||||
|
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
{
|
||||||
|
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
|
||||||
|
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||||
|
HAND_RESULT(ss);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
|
||||||
|
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||||
|
HAND_RESULT(ss);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl> void
|
||||||
|
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
{
|
||||||
|
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
ZERO_RESULT;
|
||||||
|
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||||
|
HAND_RESULT(ss);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
ZERO_RESULT;
|
||||||
|
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||||
|
HAND_RESULT(ss);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl> void
|
||||||
|
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
{
|
||||||
|
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
int offset, ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
int nmu=0;
|
||||||
|
ZERO_RESULT;
|
||||||
|
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||||
|
HAND_RESULT_EXT(ss);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int offset, ptype;
|
||||||
|
int nmu=0;
|
||||||
|
ZERO_RESULT;
|
||||||
|
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||||
|
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||||
|
HAND_RESULT_EXT(ss);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////// Wilson ; uses this implementation /////////////////////
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
#undef LOAD_CHIMU
|
||||||
|
#undef LOAD_CHI
|
||||||
|
#undef MULT_2SPIN
|
||||||
|
#undef PERMUTE_DIR
|
||||||
|
#undef XP_PROJ
|
||||||
|
#undef YP_PROJ
|
||||||
|
#undef ZP_PROJ
|
||||||
|
#undef TP_PROJ
|
||||||
|
#undef XM_PROJ
|
||||||
|
#undef YM_PROJ
|
||||||
|
#undef ZM_PROJ
|
||||||
|
#undef TM_PROJ
|
||||||
|
#undef XP_RECON
|
||||||
|
#undef XP_RECON_ACCUM
|
||||||
|
#undef XM_RECON
|
||||||
|
#undef XM_RECON_ACCUM
|
||||||
|
#undef YP_RECON_ACCUM
|
||||||
|
#undef YM_RECON_ACCUM
|
||||||
|
#undef ZP_RECON_ACCUM
|
||||||
|
#undef ZM_RECON_ACCUM
|
||||||
|
#undef TP_RECON_ACCUM
|
||||||
|
#undef TM_RECON_ACCUM
|
||||||
|
#undef ZERO_RESULT
|
||||||
|
#undef Chimu_00
|
||||||
|
#undef Chimu_01
|
||||||
|
#undef Chimu_02
|
||||||
|
#undef Chimu_10
|
||||||
|
#undef Chimu_11
|
||||||
|
#undef Chimu_12
|
||||||
|
#undef Chimu_20
|
||||||
|
#undef Chimu_21
|
||||||
|
#undef Chimu_22
|
||||||
|
#undef Chimu_30
|
||||||
|
#undef Chimu_31
|
||||||
|
#undef Chimu_32
|
||||||
|
#undef HAND_STENCIL_LEG
|
||||||
|
#undef HAND_STENCIL_LEG_INT
|
||||||
|
#undef HAND_STENCIL_LEG_EXT
|
||||||
|
#undef HAND_RESULT
|
||||||
|
#undef HAND_RESULT_INT
|
||||||
|
#undef HAND_RESULT_EXT
|
@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// All legs kernels ; comms then compute
|
// All legs kernels ; comms then compute
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
|
|||||||
coalescedWrite(out[sF],result,lane);
|
coalescedWrite(out[sF],result,lane);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Interior kernels
|
// Interior kernels
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
|
|||||||
coalescedWrite(out[sF], result,lane);
|
coalescedWrite(out[sF], result,lane);
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
|
|||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Exterior kernels
|
// Exterior kernels
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
|
||||||
SiteHalfSpinor *buf, int sF,
|
SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out)
|
int sU, const FermionFieldView &in, FermionFieldView &out)
|
||||||
@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
|
|||||||
};
|
};
|
||||||
|
|
||||||
#define DhopDirMacro(Dir,spProj,spRecon) \
|
#define DhopDirMacro(Dir,spProj,spRecon) \
|
||||||
template <class Impl> \
|
template <class Impl> accelerator_inline \
|
||||||
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
|
||||||
{ \
|
{ \
|
||||||
@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
|
|||||||
DhopDirMacro(Zm,spProjZm,spReconZm);
|
DhopDirMacro(Zm,spProjZm,spReconZm);
|
||||||
DhopDirMacro(Tm,spProjTm,spReconTm);
|
DhopDirMacro(Tm,spProjTm,spReconTm);
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl> accelerator_inline
|
||||||
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
|
||||||
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
|
||||||
{
|
{
|
||||||
@ -501,4 +501,3 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
#undef ASM_CALL
|
#undef ASM_CALL
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -4,11 +4,12 @@ Grid physics library, www.github.com/paboyle/Grid
|
|||||||
|
|
||||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
Copyright (C) 2015
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -34,9 +35,13 @@ directory
|
|||||||
|
|
||||||
#ifndef AVX512
|
#ifndef AVX512
|
||||||
#ifndef QPX
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
@ -44,4 +49,3 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
template class WilsonKernels<IMPLEMENTATION>;
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -37,6 +37,7 @@ directory
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h>
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmAvx512.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmA64FX.h>
|
||||||
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h>
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmQPX.h>
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
@ -1 +0,0 @@
|
|||||||
../WilsonKernelsInstantiation.cc.master
|
|
@ -0,0 +1,51 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015, 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsImplementation.h>
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsHandImplementation.h>
|
||||||
|
|
||||||
|
#ifndef AVX512
|
||||||
|
#ifndef QPX
|
||||||
|
#ifndef A64FX
|
||||||
|
#ifndef A64FXFIXEDSIZE
|
||||||
|
#include <Grid/qcd/action/fermion/implementation/WilsonKernelsAsmImplementation.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
#include "impl.h"
|
||||||
|
template class WilsonKernels<IMPLEMENTATION>;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
38
Grid/qcd/action/gauge/Gauge.cc
Normal file
38
Grid/qcd/action/gauge/Gauge.cc
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/gauge/Gauge.cc
|
||||||
|
|
||||||
|
Copyright (C) 2020
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
std::vector<int> ConjugateGaugeImplBase::_conjDirs;
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
@ -154,6 +154,10 @@ public:
|
|||||||
return Hsum.real();
|
return Hsum.real();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void Project(Field &U) {
|
||||||
|
ProjectSUn(U);
|
||||||
|
}
|
||||||
|
|
||||||
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
SU<Nc>::HotConfiguration(pRNG, U);
|
SU<Nc>::HotConfiguration(pRNG, U);
|
||||||
}
|
}
|
||||||
|
@ -59,14 +59,14 @@ public:
|
|||||||
}
|
}
|
||||||
static inline GaugeLinkField
|
static inline GaugeLinkField
|
||||||
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
|
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
|
||||||
return Cshift(closure(adj(Link)), mu, -1);
|
return PeriodicBC::CovShiftIdentityBackward(Link, mu);
|
||||||
}
|
}
|
||||||
static inline GaugeLinkField
|
static inline GaugeLinkField
|
||||||
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
|
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
|
||||||
return Link;
|
return PeriodicBC::CovShiftIdentityForward(Link,mu);
|
||||||
}
|
}
|
||||||
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
|
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
|
||||||
return Cshift(Link, mu, 1);
|
return PeriodicBC::ShiftStaple(Link,mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline bool isPeriodicGaugeField(void) { return true; }
|
static inline bool isPeriodicGaugeField(void) { return true; }
|
||||||
@ -74,7 +74,13 @@ public:
|
|||||||
|
|
||||||
// Composition with smeared link, bc's etc.. probably need multiple inheritance
|
// Composition with smeared link, bc's etc.. probably need multiple inheritance
|
||||||
// Variable precision "S" and variable Nc
|
// Variable precision "S" and variable Nc
|
||||||
template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes {
|
class ConjugateGaugeImplBase {
|
||||||
|
protected:
|
||||||
|
static std::vector<int> _conjDirs;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class GimplTypes> class ConjugateGaugeImpl : public GimplTypes, ConjugateGaugeImplBase {
|
||||||
|
private:
|
||||||
public:
|
public:
|
||||||
INHERIT_GIMPL_TYPES(GimplTypes);
|
INHERIT_GIMPL_TYPES(GimplTypes);
|
||||||
|
|
||||||
@ -84,47 +90,56 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class covariant>
|
template <class covariant>
|
||||||
static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
|
static Lattice<covariant> CovShiftForward(const GaugeLinkField &Link, int mu,
|
||||||
const Lattice<covariant> &field) {
|
const Lattice<covariant> &field)
|
||||||
return ConjugateBC::CovShiftForward(Link, mu, field);
|
{
|
||||||
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
|
return ConjugateBC::CovShiftForward(Link, mu, field);
|
||||||
|
else
|
||||||
|
return PeriodicBC::CovShiftForward(Link, mu, field);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class covariant>
|
template <class covariant>
|
||||||
static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
|
static Lattice<covariant> CovShiftBackward(const GaugeLinkField &Link, int mu,
|
||||||
const Lattice<covariant> &field) {
|
const Lattice<covariant> &field)
|
||||||
return ConjugateBC::CovShiftBackward(Link, mu, field);
|
{
|
||||||
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
|
return ConjugateBC::CovShiftBackward(Link, mu, field);
|
||||||
|
else
|
||||||
|
return PeriodicBC::CovShiftBackward(Link, mu, field);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline GaugeLinkField
|
static inline GaugeLinkField
|
||||||
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu) {
|
CovShiftIdentityBackward(const GaugeLinkField &Link, int mu)
|
||||||
GridBase *grid = Link.Grid();
|
{
|
||||||
int Lmu = grid->GlobalDimensions()[mu] - 1;
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
Lattice<iScalar<vInteger>> coor(grid);
|
return ConjugateBC::CovShiftIdentityBackward(Link, mu);
|
||||||
LatticeCoordinate(coor, mu);
|
else
|
||||||
|
return PeriodicBC::CovShiftIdentityBackward(Link, mu);
|
||||||
GaugeLinkField tmp(grid);
|
|
||||||
tmp = adj(Link);
|
|
||||||
tmp = where(coor == Lmu, conjugate(tmp), tmp);
|
|
||||||
return Cshift(tmp, mu, -1); // moves towards positive mu
|
|
||||||
}
|
}
|
||||||
static inline GaugeLinkField
|
static inline GaugeLinkField
|
||||||
CovShiftIdentityForward(const GaugeLinkField &Link, int mu) {
|
CovShiftIdentityForward(const GaugeLinkField &Link, int mu)
|
||||||
return Link;
|
{
|
||||||
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
|
return ConjugateBC::CovShiftIdentityForward(Link,mu);
|
||||||
|
else
|
||||||
|
return PeriodicBC::CovShiftIdentityForward(Link,mu);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu) {
|
static inline GaugeLinkField ShiftStaple(const GaugeLinkField &Link, int mu)
|
||||||
GridBase *grid = Link.Grid();
|
{
|
||||||
int Lmu = grid->GlobalDimensions()[mu] - 1;
|
assert(_conjDirs.size() == Nd);
|
||||||
|
if(_conjDirs[mu])
|
||||||
Lattice<iScalar<vInteger>> coor(grid);
|
return ConjugateBC::ShiftStaple(Link,mu);
|
||||||
LatticeCoordinate(coor, mu);
|
else
|
||||||
|
return PeriodicBC::ShiftStaple(Link,mu);
|
||||||
GaugeLinkField tmp(grid);
|
|
||||||
tmp = Cshift(Link, mu, 1);
|
|
||||||
tmp = where(coor == Lmu, conjugate(tmp), tmp);
|
|
||||||
return tmp;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
|
||||||
|
static inline std::vector<int> getDirections(void) { return _conjDirs; }
|
||||||
static inline bool isPeriodicGaugeField(void) { return false; }
|
static inline bool isPeriodicGaugeField(void) { return false; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,5 +1,13 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#define CPS_MD_TIME
|
||||||
|
|
||||||
|
#ifdef CPS_MD_TIME
|
||||||
|
#define HMC_MOMENTUM_DENOMINATOR (2.0)
|
||||||
|
#else
|
||||||
|
#define HMC_MOMENTUM_DENOMINATOR (1.0)
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template <class S>
|
template <class S>
|
||||||
@ -20,7 +28,9 @@ public:
|
|||||||
typedef Field PropagatorField;
|
typedef Field PropagatorField;
|
||||||
|
|
||||||
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
|
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
|
||||||
|
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
|
||||||
gaussian(pRNG, P);
|
gaussian(pRNG, P);
|
||||||
|
P *= scale;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline Field projectForce(Field& P){return P;}
|
static inline Field projectForce(Field& P){return P;}
|
||||||
@ -45,6 +55,10 @@ public:
|
|||||||
U = 1.0;
|
U = 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void Project(Field &U) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
static void MomentumSpacePropagator(Field &out, RealD m)
|
static void MomentumSpacePropagator(Field &out, RealD m)
|
||||||
{
|
{
|
||||||
GridBase *grid = out.Grid();
|
GridBase *grid = out.Grid();
|
||||||
@ -66,7 +80,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void FreePropagator(const Field &in, Field &out,
|
static void FreePropagator(const Field &in, Field &out,
|
||||||
const Field &momKernel)
|
const Field &momKernel)
|
||||||
{
|
{
|
||||||
FFT fft((GridCartesian *)in.Grid());
|
FFT fft((GridCartesian *)in.Grid());
|
||||||
Field inFT(in.Grid());
|
Field inFT(in.Grid());
|
||||||
@ -139,14 +153,17 @@ public:
|
|||||||
|
|
||||||
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
|
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
|
||||||
{
|
{
|
||||||
|
RealD scale = ::sqrt(HMC_MOMENTUM_DENOMINATOR); // CPS/UKQCD momentum rescaling
|
||||||
#ifndef USE_FFT_ACCELERATION
|
#ifndef USE_FFT_ACCELERATION
|
||||||
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
Field Pgaussian(P.Grid()), Pp(P.Grid());
|
Field Pgaussian(P.Grid()), Pp(P.Grid());
|
||||||
ComplexField p2(P.Grid()); p2 = zero;
|
ComplexField p2(P.Grid()); p2 = zero;
|
||||||
RealD M = FFT_MASS;
|
RealD M = FFT_MASS;
|
||||||
|
|
||||||
|
|
||||||
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
|
||||||
|
|
||||||
FFT theFFT((GridCartesian*)P.Grid());
|
FFT theFFT((GridCartesian*)P.Grid());
|
||||||
@ -156,17 +173,17 @@ public:
|
|||||||
p2 = sqrt(p2);
|
p2 = sqrt(p2);
|
||||||
Pp *= p2;
|
Pp *= p2;
|
||||||
theFFT.FFT_all_dim(P, Pp, FFT::backward);
|
theFFT.FFT_all_dim(P, Pp, FFT::backward);
|
||||||
|
|
||||||
#endif //USE_FFT_ACCELERATION
|
#endif //USE_FFT_ACCELERATION
|
||||||
|
P *= scale;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline Field projectForce(Field& P) {return P;}
|
static inline Field projectForce(Field& P) {return Ta(P);}
|
||||||
|
|
||||||
static inline void update_field(Field &P, Field &U, double ep)
|
static inline void update_field(Field &P, Field &U, double ep)
|
||||||
{
|
{
|
||||||
#ifndef USE_FFT_ACCELERATION
|
#ifndef USE_FFT_ACCELERATION
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
U += P*ep;
|
U += P*ep;
|
||||||
double t1=usecond();
|
double t1=usecond();
|
||||||
double total_time = (t1-t0)/1e6;
|
double total_time = (t1-t0)/1e6;
|
||||||
std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl;
|
std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl;
|
||||||
@ -221,6 +238,10 @@ public:
|
|||||||
#endif //USE_FFT_ACCELERATION
|
#endif //USE_FFT_ACCELERATION
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void Project(Field &U) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
|
||||||
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
|
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
|
||||||
}
|
}
|
||||||
|
@ -159,6 +159,13 @@ private:
|
|||||||
Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
|
Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
|
||||||
Resources.GetSerialRNG(),
|
Resources.GetSerialRNG(),
|
||||||
Resources.GetParallelRNG());
|
Resources.GetParallelRNG());
|
||||||
|
} else {
|
||||||
|
// others
|
||||||
|
std::cout << GridLogError << "Unrecognized StartingType\n";
|
||||||
|
std::cout
|
||||||
|
<< GridLogError
|
||||||
|
<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
|
||||||
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
Smearing.set_Field(U);
|
Smearing.set_Field(U);
|
||||||
|
@ -74,7 +74,7 @@ public:
|
|||||||
conf_file = os.str();
|
conf_file = os.str();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
virtual ~BaseHmcCheckpointer(){};
|
||||||
void check_filename(const std::string &filename){
|
void check_filename(const std::string &filename){
|
||||||
std::ifstream f(filename.c_str());
|
std::ifstream f(filename.c_str());
|
||||||
if(!f.good()){
|
if(!f.good()){
|
||||||
@ -82,7 +82,6 @@ public:
|
|||||||
abort();
|
abort();
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void initialize(const CheckpointerParameters &Params) = 0;
|
virtual void initialize(const CheckpointerParameters &Params) = 0;
|
||||||
|
|
||||||
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
|
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
|
||||||
|
@ -45,6 +45,7 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
INHERIT_GIMPL_TYPES(Implementation);
|
INHERIT_GIMPL_TYPES(Implementation);
|
||||||
|
typedef GaugeStatistics<Implementation> GaugeStats;
|
||||||
|
|
||||||
ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
|
ILDGHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
|
||||||
|
|
||||||
@ -78,7 +79,7 @@ public:
|
|||||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
IldgWriter _IldgWriter(grid->IsBoss());
|
IldgWriter _IldgWriter(grid->IsBoss());
|
||||||
_IldgWriter.open(config);
|
_IldgWriter.open(config);
|
||||||
_IldgWriter.writeConfiguration(U, traj, config, config);
|
_IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
|
||||||
_IldgWriter.close();
|
_IldgWriter.close();
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
|
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
|
||||||
@ -105,7 +106,7 @@ public:
|
|||||||
FieldMetaData header;
|
FieldMetaData header;
|
||||||
IldgReader _IldgReader;
|
IldgReader _IldgReader;
|
||||||
_IldgReader.open(config);
|
_IldgReader.open(config);
|
||||||
_IldgReader.readConfiguration(U,header); // format from the header
|
_IldgReader.readConfiguration<GaugeStats>(U,header); // format from the header
|
||||||
_IldgReader.close();
|
_IldgReader.close();
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Read ILDG Configuration from " << config
|
std::cout << GridLogMessage << "Read ILDG Configuration from " << config
|
||||||
|
@ -43,6 +43,7 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
INHERIT_GIMPL_TYPES(Gimpl); // only for gauge configurations
|
INHERIT_GIMPL_TYPES(Gimpl); // only for gauge configurations
|
||||||
|
typedef GaugeStatistics<Gimpl> GaugeStats;
|
||||||
|
|
||||||
NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
|
NerscHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
|
||||||
|
|
||||||
@ -60,7 +61,7 @@ public:
|
|||||||
int precision32 = 1;
|
int precision32 = 1;
|
||||||
int tworow = 0;
|
int tworow = 0;
|
||||||
NerscIO::writeRNGState(sRNG, pRNG, rng);
|
NerscIO::writeRNGState(sRNG, pRNG, rng);
|
||||||
NerscIO::writeConfiguration(U, config, tworow, precision32);
|
NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -74,7 +75,7 @@ public:
|
|||||||
|
|
||||||
FieldMetaData header;
|
FieldMetaData header;
|
||||||
NerscIO::readRNGState(sRNG, pRNG, header, rng);
|
NerscIO::readRNGState(sRNG, pRNG, header, rng);
|
||||||
NerscIO::readConfiguration(U, header, config);
|
NerscIO::readConfiguration<GaugeStats>(U, header, config);
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -301,9 +301,9 @@ public:
|
|||||||
t_P[level] = 0;
|
t_P[level] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int step = 0; step < Params.MDsteps; ++step) { // MD step
|
for (int stp = 0; stp < Params.MDsteps; ++stp) { // MD step
|
||||||
int first_step = (step == 0);
|
int first_step = (stp == 0);
|
||||||
int last_step = (step == Params.MDsteps - 1);
|
int last_step = (stp == Params.MDsteps - 1);
|
||||||
this->step(U, 0, first_step, last_step);
|
this->step(U, 0, first_step, last_step);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -313,6 +313,8 @@ public:
|
|||||||
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
|
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FieldImplementation::Project(U);
|
||||||
|
|
||||||
// and that we indeed got to the end of the trajectory
|
// and that we indeed got to the end of the trajectory
|
||||||
assert(fabs(t_U - Params.trajL) < 1.0e-6);
|
assert(fabs(t_U - Params.trajL) < 1.0e-6);
|
||||||
|
|
||||||
|
@ -99,7 +99,7 @@ public:
|
|||||||
virtual Prod* getPtr() = 0;
|
virtual Prod* getPtr() = 0;
|
||||||
|
|
||||||
// add a getReference?
|
// add a getReference?
|
||||||
|
virtual ~HMCModuleBase(){};
|
||||||
virtual void print_parameters(){}; // default to nothing
|
virtual void print_parameters(){}; // default to nothing
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -128,7 +128,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjTm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
hspin(0)=fspin(0)-fspin(2);
|
hspin(0)=fspin(0)-fspin(2);
|
||||||
hspin(1)=fspin(1)-fspin(3);
|
hspin(1)=fspin(1)-fspin(3);
|
||||||
}
|
}
|
||||||
@ -138,40 +137,50 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
* 0 0 -1 0
|
* 0 0 -1 0
|
||||||
* 0 0 0 -1
|
* 0 0 0 -1
|
||||||
*/
|
*/
|
||||||
|
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
hspin(0)=fspin(0);
|
hspin(0)=fspin(0);
|
||||||
hspin(1)=fspin(1);
|
hspin(1)=fspin(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
hspin(0)=fspin(2);
|
hspin(0)=fspin(2);
|
||||||
hspin(1)=fspin(3);
|
hspin(1)=fspin(3);
|
||||||
}
|
}
|
||||||
|
|
||||||
// template<class vtype> accelerator_inline void fspProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
|
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5p (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
rfspin(0)=fspin(0);
|
rfspin(0)=fspin(0);
|
||||||
rfspin(1)=fspin(1);
|
rfspin(1)=fspin(1);
|
||||||
rfspin(2)=Zero();
|
rfspin(2)=Zero();
|
||||||
rfspin(3)=Zero();
|
rfspin(3)=Zero();
|
||||||
}
|
}
|
||||||
// template<class vtype> accelerator_inline void fspProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
|
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProj5m (iVector<vtype,Ns> &rfspin,const iVector<vtype,Ns> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
rfspin(0)=Zero();
|
rfspin(0)=Zero();
|
||||||
rfspin(1)=Zero();
|
rfspin(1)=Zero();
|
||||||
rfspin(2)=fspin(2);
|
rfspin(2)=fspin(2);
|
||||||
rfspin(3)=fspin(3);
|
rfspin(3)=fspin(3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
|
||||||
|
{
|
||||||
|
const int hN = N>>1;
|
||||||
|
for(int s=0;s<hN;s++){
|
||||||
|
rfspin(s)=fspin(s);
|
||||||
|
rfspin(s+hN)=Zero();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
template<class vtype,int N,IfCoarsened<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &rfspin,const iVector<vtype,N> &fspin)
|
||||||
|
{
|
||||||
|
const int hN = N>>1;
|
||||||
|
for(int s=0;s<hN;s++){
|
||||||
|
rfspin(s)=Zero();
|
||||||
|
rfspin(s+hN)=fspin(s+hN);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Reconstruction routines to move back again to four spin
|
// Reconstruction routines to move back again to four spin
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -183,7 +192,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
*/
|
*/
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=hspin(0);
|
fspin(0)=hspin(0);
|
||||||
fspin(1)=hspin(1);
|
fspin(1)=hspin(1);
|
||||||
fspin(2)=timesMinusI(hspin(1));
|
fspin(2)=timesMinusI(hspin(1));
|
||||||
@ -191,7 +199,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=hspin(0);
|
fspin(0)=hspin(0);
|
||||||
fspin(1)=hspin(1);
|
fspin(1)=hspin(1);
|
||||||
fspin(2)=timesI(hspin(1));
|
fspin(2)=timesI(hspin(1));
|
||||||
@ -199,7 +206,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)+=hspin(0);
|
fspin(0)+=hspin(0);
|
||||||
fspin(1)+=hspin(1);
|
fspin(1)+=hspin(1);
|
||||||
fspin(2)-=timesI(hspin(1));
|
fspin(2)-=timesI(hspin(1));
|
||||||
@ -207,7 +213,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)+=hspin(0);
|
fspin(0)+=hspin(0);
|
||||||
fspin(1)+=hspin(1);
|
fspin(1)+=hspin(1);
|
||||||
fspin(2)+=timesI(hspin(1));
|
fspin(2)+=timesI(hspin(1));
|
||||||
@ -221,7 +226,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
|
|||||||
|
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=hspin(0);
|
fspin(0)=hspin(0);
|
||||||
fspin(1)=hspin(1);
|
fspin(1)=hspin(1);
|
||||||
fspin(2)= hspin(1);
|
fspin(2)= hspin(1);
|
||||||
@ -229,7 +233,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=hspin(0);
|
fspin(0)=hspin(0);
|
||||||
fspin(1)=hspin(1);
|
fspin(1)=hspin(1);
|
||||||
fspin(2)=-hspin(1);
|
fspin(2)=-hspin(1);
|
||||||
@ -237,7 +240,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)+=hspin(0);
|
fspin(0)+=hspin(0);
|
||||||
fspin(1)+=hspin(1);
|
fspin(1)+=hspin(1);
|
||||||
fspin(2)+=hspin(1);
|
fspin(2)+=hspin(1);
|
||||||
@ -245,7 +247,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconYm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)+=hspin(0);
|
fspin(0)+=hspin(0);
|
||||||
fspin(1)+=hspin(1);
|
fspin(1)+=hspin(1);
|
||||||
fspin(2)-=hspin(1);
|
fspin(2)-=hspin(1);
|
||||||
@ -260,7 +261,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
|
|||||||
*/
|
*/
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=hspin(0);
|
fspin(0)=hspin(0);
|
||||||
fspin(1)=hspin(1);
|
fspin(1)=hspin(1);
|
||||||
fspin(2)=timesMinusI(hspin(0));
|
fspin(2)=timesMinusI(hspin(0));
|
||||||
@ -268,7 +268,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=hspin(0);
|
fspin(0)=hspin(0);
|
||||||
fspin(1)=hspin(1);
|
fspin(1)=hspin(1);
|
||||||
fspin(2)= timesI(hspin(0));
|
fspin(2)= timesI(hspin(0));
|
||||||
@ -276,7 +275,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)+=hspin(0);
|
fspin(0)+=hspin(0);
|
||||||
fspin(1)+=hspin(1);
|
fspin(1)+=hspin(1);
|
||||||
fspin(2)-=timesI(hspin(0));
|
fspin(2)-=timesI(hspin(0));
|
||||||
@ -284,7 +282,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)+=hspin(0);
|
fspin(0)+=hspin(0);
|
||||||
fspin(1)+=hspin(1);
|
fspin(1)+=hspin(1);
|
||||||
fspin(2)+=timesI(hspin(0));
|
fspin(2)+=timesI(hspin(0));
|
||||||
@ -298,7 +295,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
|
|||||||
*/
|
*/
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=hspin(0);
|
fspin(0)=hspin(0);
|
||||||
fspin(1)=hspin(1);
|
fspin(1)=hspin(1);
|
||||||
fspin(2)=hspin(0);
|
fspin(2)=hspin(0);
|
||||||
@ -306,7 +302,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=hspin(0);
|
fspin(0)=hspin(0);
|
||||||
fspin(1)=hspin(1);
|
fspin(1)=hspin(1);
|
||||||
fspin(2)=-hspin(0);
|
fspin(2)=-hspin(0);
|
||||||
@ -314,7 +309,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTp (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)+=hspin(0);
|
fspin(0)+=hspin(0);
|
||||||
fspin(1)+=hspin(1);
|
fspin(1)+=hspin(1);
|
||||||
fspin(2)+=hspin(0);
|
fspin(2)+=hspin(0);
|
||||||
@ -322,7 +316,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconTm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)+=hspin(0);
|
fspin(0)+=hspin(0);
|
||||||
fspin(1)+=hspin(1);
|
fspin(1)+=hspin(1);
|
||||||
fspin(2)-=hspin(0);
|
fspin(2)-=hspin(0);
|
||||||
@ -336,7 +329,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
|
|||||||
*/
|
*/
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul
|
fspin(0)=hspin(0)+hspin(0); // add is lower latency than mul
|
||||||
fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though
|
fspin(1)=hspin(1)+hspin(1); // probably no measurable diffence though
|
||||||
fspin(2)=Zero();
|
fspin(2)=Zero();
|
||||||
@ -344,7 +336,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spRecon5m (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)=Zero();
|
fspin(0)=Zero();
|
||||||
fspin(1)=Zero();
|
fspin(1)=Zero();
|
||||||
fspin(2)=hspin(0)+hspin(0);
|
fspin(2)=hspin(0)+hspin(0);
|
||||||
@ -352,7 +343,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
|
|||||||
}
|
}
|
||||||
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumRecon5p (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
|
||||||
fspin(0)+=hspin(0)+hspin(0);
|
fspin(0)+=hspin(0)+hspin(0);
|
||||||
fspin(1)+=hspin(1)+hspin(1);
|
fspin(1)+=hspin(1)+hspin(1);
|
||||||
}
|
}
|
||||||
@ -372,7 +362,6 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
|
|||||||
//////////
|
//////////
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProjXp(hspin._internal[i],fspin._internal[i]);
|
spProjXp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
@ -426,26 +415,21 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconXp (iM
|
|||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
////////
|
////////
|
||||||
// Xm
|
// Xm
|
||||||
////////
|
////////
|
||||||
template<class rtype,class vtype> accelerator_inline void spProjXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spProjXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProjXm(hspin._internal,fspin._internal);
|
spProjXm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProjXm(hspin._internal[i],fspin._internal[i]);
|
spProjXm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProjXm(hspin._internal[i][j],fspin._internal[i][j]);
|
spProjXm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -455,19 +439,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjXm (iMatri
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spReconXm(hspin._internal,fspin._internal);
|
spReconXm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spReconXm(hspin._internal[i],fspin._internal[i]);
|
spReconXm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spReconXm(hspin._internal[i][j],fspin._internal[i][j]);
|
spReconXm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -476,45 +457,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconXm (iMatr
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void accumReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void accumReconXm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
accumReconXm(hspin._internal,fspin._internal);
|
accumReconXm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconXm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
accumReconXm(hspin._internal[i],fspin._internal[i]);
|
accumReconXm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void accumReconXm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
accumReconXm(hspin._internal[i][j],fspin._internal[i][j]);
|
accumReconXm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
////////
|
////////
|
||||||
// Yp
|
// Yp
|
||||||
////////
|
////////
|
||||||
template<class rtype,class vtype> accelerator_inline void spProjYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spProjYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProjYp(hspin._internal,fspin._internal);
|
spProjYp(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProjYp(hspin._internal[i],fspin._internal[i]);
|
spProjYp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProjYp(hspin._internal[i][j],fspin._internal[i][j]);
|
spProjYp(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -524,19 +497,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjYp (iMatri
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spReconYp(hspin._internal,fspin._internal);
|
spReconYp(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spReconYp(hspin._internal[i],fspin._internal[i]);
|
spReconYp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spReconYp(hspin._internal[i][j],fspin._internal[i][j]);
|
spReconYp(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -545,66 +515,55 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYp (iMatr
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void accumReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void accumReconYp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
accumReconYp(hspin._internal,fspin._internal);
|
accumReconYp(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
accumReconYp(hspin._internal[i],fspin._internal[i]);
|
accumReconYp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void accumReconYp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
accumReconYp(hspin._internal[i][j],fspin._internal[i][j]);
|
accumReconYp(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
////////
|
////////
|
||||||
// Ym
|
// Ym
|
||||||
////////
|
////////
|
||||||
template<class rtype,class vtype> accelerator_inline void spProjYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spProjYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProjYm(hspin._internal,fspin._internal);
|
spProjYm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProjYm(hspin._internal[i],fspin._internal[i]);
|
spProjYm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spProjYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProjYm(hspin._internal[i][j],fspin._internal[i][j]);
|
spProjYm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spReconYm(hspin._internal,fspin._internal);
|
spReconYm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,const iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spReconYm(hspin._internal[i],fspin._internal[i]);
|
spReconYm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spReconYm(hspin._internal[i][j],fspin._internal[i][j]);
|
spReconYm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -613,19 +572,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconYm (iMatr
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void accumReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void accumReconYm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
accumReconYm(hspin._internal,fspin._internal);
|
accumReconYm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconYm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
accumReconYm(hspin._internal[i],fspin._internal[i]);
|
accumReconYm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
accumReconYm(hspin._internal[i][j],fspin._internal[i][j]);
|
accumReconYm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -638,66 +594,57 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconYm (iM
|
|||||||
////////
|
////////
|
||||||
template<class rtype,class vtype> accelerator_inline void spProjZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spProjZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProjZp(hspin._internal,fspin._internal);
|
spProjZp(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProjZp(hspin._internal[i],fspin._internal[i]);
|
spProjZp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spProjZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProjZp(hspin._internal[i][j],fspin._internal[i][j]);
|
spProjZp(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spReconZp(hspin._internal,fspin._internal);
|
spReconZp(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spReconZp(hspin._internal[i],fspin._internal[i]);
|
spReconZp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spReconZp(hspin._internal[i][j],fspin._internal[i][j]);
|
spReconZp(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void accumReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void accumReconZp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
accumReconZp(hspin._internal,fspin._internal);
|
accumReconZp(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
accumReconZp(hspin._internal[i],fspin._internal[i]);
|
accumReconZp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
accumReconZp(hspin._internal[i][j],fspin._internal[i][j]);
|
accumReconZp(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -706,62 +653,53 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZp (iM
|
|||||||
////////
|
////////
|
||||||
template<class rtype,class vtype> accelerator_inline void spProjZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spProjZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProjZm(hspin._internal,fspin._internal);
|
spProjZm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProjZm(hspin._internal[i],fspin._internal[i]);
|
spProjZm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spProjZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProjZm(hspin._internal[i][j],fspin._internal[i][j]);
|
spProjZm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spReconZm(hspin._internal,fspin._internal);
|
spReconZm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spReconZm(hspin._internal[i],fspin._internal[i]);
|
spReconZm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spReconZm(hspin._internal[i][j],fspin._internal[i][j]);
|
spReconZm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void accumReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void accumReconZm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
accumReconZm(hspin._internal,fspin._internal);
|
accumReconZm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconZm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
accumReconZm(hspin._internal[i],fspin._internal[i]);
|
accumReconZm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
accumReconZm(hspin._internal[i][j],fspin._internal[i][j]);
|
accumReconZm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -774,41 +712,35 @@ template<class rtype,class vtype,int N> accelerator_inline void accumReconZm (iM
|
|||||||
////////
|
////////
|
||||||
template<class rtype,class vtype> accelerator_inline void spProjTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spProjTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProjTp(hspin._internal,fspin._internal);
|
spProjTp(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProjTp(hspin._internal[i],fspin._internal[i]);
|
spProjTp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spProjTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProjTp(hspin._internal[i][j],fspin._internal[i][j]);
|
spProjTp(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spReconTp (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spReconTp(hspin._internal,fspin._internal);
|
spReconTp(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTp (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spReconTp(hspin._internal[i],fspin._internal[i]);
|
spReconTp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spReconTp(hspin._internal[i][j],fspin._internal[i][j]);
|
spReconTp(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -817,44 +749,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTp (iMatr
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void accumReconTp (iScalar<rtype> &hspin, iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void accumReconTp (iScalar<rtype> &hspin, iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
accumReconTp(hspin._internal,fspin._internal);
|
accumReconTp(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTp (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTp (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
accumReconTp(hspin._internal[i],fspin._internal[i]);
|
accumReconTp(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void accumReconTp (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
accumReconTp(hspin._internal[i][j],fspin._internal[i][j]);
|
accumReconTp(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
////////
|
////////
|
||||||
// Tm
|
// Tm
|
||||||
////////
|
////////
|
||||||
template<class rtype,class vtype> accelerator_inline void spProjTm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spProjTm (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProjTm(hspin._internal,fspin._internal);
|
spProjTm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProjTm (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProjTm(hspin._internal[i],fspin._internal[i]);
|
spProjTm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProjTm(hspin._internal[i][j],fspin._internal[i][j]);
|
spProjTm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -864,19 +789,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProjTm (iMatri
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spReconTm(hspin._internal,fspin._internal);
|
spReconTm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spReconTm(hspin._internal[i],fspin._internal[i]);
|
spReconTm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spReconTm(hspin._internal[i][j],fspin._internal[i][j]);
|
spReconTm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -885,44 +807,37 @@ template<class rtype,class vtype,int N> accelerator_inline void spReconTm (iMatr
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void accumReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void accumReconTm (iScalar<rtype> &hspin, const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
accumReconTm(hspin._internal,fspin._internal);
|
accumReconTm(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumReconTm (iVector<rtype,N> &hspin, const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
accumReconTm(hspin._internal[i],fspin._internal[i]);
|
accumReconTm(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void accumReconTm (iMatrix<rtype,N> &hspin, const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
accumReconTm(hspin._internal[i][j],fspin._internal[i][j]);
|
accumReconTm(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
////////
|
////////
|
||||||
// 5p
|
// 5p
|
||||||
////////
|
////////
|
||||||
template<class rtype,class vtype> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProj5p(hspin._internal,fspin._internal);
|
spProj5p(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProj5p(hspin._internal[i],fspin._internal[i]);
|
spProj5p(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
|
spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -931,19 +846,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spProj5p (iMatri
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spRecon5p(hspin._internal,fspin._internal);
|
spRecon5p(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spRecon5p(hspin._internal[i],fspin._internal[i]);
|
spRecon5p(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
|
spRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -952,19 +864,16 @@ template<class rtype,class vtype,int N> accelerator_inline void spRecon5p (iMatr
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void accumRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void accumRecon5p (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
accumRecon5p(hspin._internal,fspin._internal);
|
accumRecon5p(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5p (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
accumRecon5p(hspin._internal[i],fspin._internal[i]);
|
accumRecon5p(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
|
accumRecon5p(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -972,24 +881,18 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5p (iM
|
|||||||
}
|
}
|
||||||
|
|
||||||
// four spinor projectors for chiral proj
|
// four spinor projectors for chiral proj
|
||||||
// template<class vtype> accelerator_inline void fspProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
|
template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
template<class vtype> accelerator_inline void spProj5p (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
|
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProj5p(hspin._internal,fspin._internal);
|
spProj5p(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
// template<class vtype,int N> accelerator_inline void fspProj5p (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
|
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5p (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
|
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProj5p(hspin._internal[i],fspin._internal[i]);
|
spProj5p(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// template<class vtype,int N> accelerator_inline void fspProj5p (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
|
template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
|
spProj5p(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -1001,17 +904,17 @@ template<class vtype,int N> accelerator_inline void spProj5p (iMatrix<vtype,N> &
|
|||||||
// 5m
|
// 5m
|
||||||
////////
|
////////
|
||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
spProj5m(hspin._internal,fspin._internal);
|
spProj5m(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<rtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProj5m(hspin._internal[i],fspin._internal[i]);
|
spProj5m(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
@ -1021,40 +924,34 @@ template<class rtype,class vtype,int N> accelerator_inline void spProj5m (iMatri
|
|||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void spRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void spRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spRecon5m(hspin._internal,fspin._internal);
|
spRecon5m(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spRecon5m(hspin._internal[i],fspin._internal[i]);
|
spRecon5m(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void spRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
|
spRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class rtype,class vtype> accelerator_inline void accumRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
template<class rtype,class vtype> accelerator_inline void accumRecon5m (iScalar<rtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
accumRecon5m(hspin._internal,fspin._internal);
|
accumRecon5m(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
template<class rtype,class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void accumRecon5m (iVector<rtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
accumRecon5m(hspin._internal[i],fspin._internal[i]);
|
accumRecon5m(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iMatrix<rtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
|
accumRecon5m(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
@ -1063,24 +960,18 @@ template<class rtype,class vtype,int N> accelerator_inline void accumRecon5m (iM
|
|||||||
|
|
||||||
|
|
||||||
// four spinor projectors for chiral proj
|
// four spinor projectors for chiral proj
|
||||||
// template<class vtype> accelerator_inline void fspProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
|
template<class vtype,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
|
||||||
template<class vtype> accelerator_inline void spProj5m (iScalar<vtype> &hspin,const iScalar<vtype> &fspin)
|
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,SpinorIndex>::notvalue,iScalar<vtype> >::type *temp;
|
|
||||||
spProj5m(hspin._internal,fspin._internal);
|
spProj5m(hspin._internal,fspin._internal);
|
||||||
}
|
}
|
||||||
// template<class vtype,int N> accelerator_inline void fspProj5m (iVector<vtype,N> &hspin,iVector<vtype,N> &fspin)
|
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
|
||||||
template<class vtype,int N,IfNotSpinor<iVector<vtype,N> > = 0> accelerator_inline void spProj5m (iVector<vtype,N> &hspin,const iVector<vtype,N> &fspin)
|
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,SpinorIndex>::notvalue,iVector<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++) {
|
for(int i=0;i<N;i++) {
|
||||||
spProj5m(hspin._internal[i],fspin._internal[i]);
|
spProj5m(hspin._internal[i],fspin._internal[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// template<class vtype,int N> accelerator_inline void fspProj5m (iMatrix<vtype,N> &hspin,iMatrix<vtype,N> &fspin)
|
template<class vtype,int N,IfNotCoarsened<iScalar<vtype> > = 0> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
||||||
template<class vtype,int N> accelerator_inline void spProj5m (iMatrix<vtype,N> &hspin,const iMatrix<vtype,N> &fspin)
|
|
||||||
{
|
{
|
||||||
//typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,SpinorIndex>::notvalue,iMatrix<vtype,N> >::type *temp;
|
|
||||||
for(int i=0;i<N;i++){
|
for(int i=0;i<N;i++){
|
||||||
for(int j=0;j<N;j++){
|
for(int j=0;j<N;j++){
|
||||||
spProj5m(hspin._internal[i][j],fspin._internal[i][j]);
|
spProj5m(hspin._internal[i][j],fspin._internal[i][j]);
|
||||||
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user