1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 05:07:05 +01:00

Compare commits

...

164 Commits

Author SHA1 Message Date
5a112feac3 Merge branch 'release/ISC-freeze-1' 2018-06-04 18:49:40 +01:00
c2e8d0aa88 Solve g++ problem on the lanczos test 2018-06-04 18:34:15 +01:00
bf96a4bdbf Merge branch 'master' into develop 2018-06-04 14:03:11 +01:00
84685c9bc3 Overflow fix 2018-06-04 13:42:07 +01:00
013ea4e8d1 Merge branch 'feature/staggered-comms-compute' into develop 2018-05-21 13:11:56 +01:00
7fbbb31a50 Merge branch 'develop' into feature/staggered-comms-compute
Conflicts:
	lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
2018-05-21 13:07:29 +01:00
0e127b1fc7 New file single prec test 2018-05-21 12:57:13 +01:00
68c028b0a6 Comment 2018-05-21 12:54:25 +01:00
a61e0df54b Travis fix for Lime 2018-05-14 19:56:12 +01:00
f871fb0c6d check file is opened correctly in the Lime reader 2018-05-11 18:06:28 +01:00
25d1cadd3b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-05-07 18:55:09 +01:00
c24d53bbd1 Further debug of RNG I/O 2018-05-07 18:55:05 +01:00
3c7a4106ed Trap for deadly empty comm thread option 2018-05-07 17:26:39 +01:00
6eed167f0c Merge branch 'release/0.8.1' 2018-05-04 17:34:11 +01:00
4ad0df6fde Bump volume for Gerardo 2018-05-04 17:33:23 +01:00
68a5079f33 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-05-04 14:13:54 +01:00
8634e19f1b Update 2018-05-04 14:13:35 +01:00
9ada378e38 Add timing 2018-05-04 10:58:01 +01:00
bfbf2f1fa0 no threaded stencil benchmark if OpenMP is not supported 2018-05-03 16:20:01 +01:00
587bfcc0f4 Add Timing 2018-05-03 12:10:31 +01:00
8c658de179 Compressor speed up (a little); streaming stores 2018-05-02 17:52:16 +01:00
ba37d51ee9 Debugging the RNG IO 2018-05-02 15:32:06 +01:00
4f4181c54a Merge branch 'feature/staggered-comms-compute' of https://github.com/paboyle/Grid into feature/staggered-comms-compute 2018-05-02 14:59:13 +01:00
4d4ac2517b Adding Scalar field theory example for Scidac format 2018-05-02 14:36:32 +01:00
e568c24d1d Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-05-02 14:29:25 +01:00
b458326744 Checkpointer module update 2018-05-02 14:29:22 +01:00
6e7d5e2243 HMC: added Scidac checkpointer and support for metadata 2018-05-02 14:28:59 +01:00
b35169f1dd MultiShift for Staggered 2018-05-02 14:22:37 +01:00
441ad7498d add Iterative counter 2018-05-02 14:21:30 +01:00
6f6c5c549a Split off gparity 2018-05-02 14:11:23 +01:00
1584e17b54 Revert to fast version 2018-05-02 14:10:55 +01:00
12982a4455 Hypercube optimisation 2018-05-02 14:10:21 +01:00
172f412102 shmget reintroduce 2018-05-02 14:07:41 +01:00
a64497265d Timing 2018-05-02 14:07:28 +01:00
c45f24a1b5 Improvements for tesseract 2018-04-30 21:50:00 +01:00
aaf37ee4d7 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-27 11:45:13 +01:00
1dddd17e3c Benchmark improvements from tesseract 2018-04-27 11:44:46 +01:00
661f1d3e8e Merge branch 'release/0.8.0' into develop 2018-04-27 11:22:33 +01:00
edcf9b9293 Merge branch 'release/0.8.0' 2018-04-27 11:13:19 +01:00
fe6860b4dd Update with LIME library guard 2018-04-27 08:57:34 +01:00
d6406b13e1 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-27 07:52:56 +01:00
e369d7306d Rename 2018-04-27 07:51:44 +01:00
9f8d63e104 Roll over version 2018-04-27 07:51:12 +01:00
9b0240d101 Hot start test 2018-04-27 07:50:51 +01:00
b27f0e5a53 Control over IO 2018-04-27 07:50:15 +01:00
75e4483407 Stronger convergence test 2018-04-27 07:49:57 +01:00
0734e9ddd4 Debugging Scatter_plane_simple 2018-04-27 14:39:01 +09:00
809b1cdd58 Bug fix for MPI running ; introduced last night 2018-04-27 05:19:10 +01:00
1be8089604 Clean compile 2018-04-26 23:42:45 +01:00
3e0eff6468 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-26 23:00:46 +01:00
7ecc47ac89 Quenched test compile 2018-04-26 23:00:28 +01:00
e9f1ac09de static 2018-04-26 23:00:08 +01:00
fa0d8feff4 Performance of CovariantCshift now non-embarrassing. 2018-04-26 17:56:27 +01:00
05b44aef6b Merge branch 'develop' of https://github.com/paboyle/Grid into develop
Conflicts:
	benchmarks/Benchmark_su3.cc
2018-04-26 15:38:49 +01:00
03e9832efa Use macros for bare openmp 2018-04-26 14:50:02 +01:00
28a375d35d Force static 2018-04-26 14:49:42 +01:00
3b06381745 Guard bare openmp statement with ifdef 2018-04-26 14:48:57 +01:00
91a0a3f820 Improvement 2018-04-26 14:48:35 +01:00
8f44c799a6 Saving the benchmarking tests for Cshift 2018-04-26 14:48:03 +01:00
96272f3841 Merge staggered fix linear operator and reduction 2018-04-26 10:33:19 +01:00
5c936d88a0 Merge branch 'feature/staggered-comms-compute' of https://github.com/paboyle/Grid into feature/staggered-comms-compute 2018-04-26 10:18:37 +01:00
1c64ee926e Faster staggered operator with m^2 term trivial used 2018-04-26 10:17:49 +01:00
2cbb72a81c Provide info if EE term is trivial (m^2 factor)
Better timing in staggered 4d case
2018-04-26 10:10:07 +01:00
31d83ee046 Enable special treatment of constEE cases 2018-04-26 10:08:46 +01:00
a9e8758a01 Improvements to staggered tests timings 2018-04-26 10:08:05 +01:00
3e125c5b61 Faster linalg on CG optimised against staggered
Sum overhead is bigger for staggered
2018-04-26 10:07:19 +01:00
eac6ec4b5e Faster reductions, important on single node staggered 2018-04-26 10:03:57 +01:00
213f8db6a2 Microsecond resolution 2018-04-26 10:01:39 +01:00
6358f35b7e Debug of previous commit 2018-04-26 14:18:11 +09:00
43f5a0df50 More timers in the integrator 2018-04-26 12:01:56 +09:00
c897878776 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-26 11:31:57 +09:00
2baf193031 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-25 00:14:03 +01:00
362ba0443a Cshift updates 2018-04-25 00:12:11 +01:00
276a2353df Move constructor 2018-04-25 00:11:07 +01:00
c5b9147b53 Correction of a minor bug in the su3 benchmark 2018-04-24 08:03:57 -07:00
64ac815fd9 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-24 17:27:38 +09:00
a1be533329 Corrected Flop count in Benchmark su3 and expanded the Wilson flow output 2018-04-24 01:19:53 -07:00
141da3ae71 function to get tensor dimensions 2018-04-20 17:13:34 +01:00
94edf9cf8b HDF5: direct access to group for custom operations 2018-04-20 17:13:21 +01:00
c11a3ca0a7 vectorise/unvectorise in reverse order 2018-04-20 17:13:04 +01:00
870b1a85ae Think I have the physical prop interface to CF and PF overlap right, but need a strong check/regression.
Only support Hw overlap, not Ht for now. Ht needs a new Dminus implemented.
2018-04-18 14:17:49 +01:00
b5510427f9 physical fermion interface, cshift benchmark in SU3. 2018-04-18 01:43:29 +01:00
26ed65c8f8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-17 12:03:32 +01:00
f7f043d8cf Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-17 10:57:18 +01:00
ddcaa6ad29 Master does header on Nersc 2018-04-17 10:48:33 +01:00
c8d4d184ee XML push fragment fix 2018-04-06 22:53:01 +01:00
1569a374a9 XML interface polish, XML fragments can be pushed into a writer 2018-04-06 18:32:14 +01:00
eddf023b8a pugixml 1.9 update 2018-04-06 16:17:22 +01:00
f089bf5629 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-30 16:17:26 +01:00
276f113f28 IO uses master boss node for metadata. 2018-03-30 16:17:05 +01:00
a13c109111 deterministic initialisation of field metadata 2018-03-30 16:03:01 +01:00
ab6afd18ac Still compile if no LIME 2018-03-30 13:39:20 +01:00
5bde64d48b Barrier required in parallel when we use ftell 2018-03-30 12:41:30 +01:00
2f5add4d5f Creation of file 2018-03-30 12:30:58 +01:00
c5a885dcd6 I/O benchmark 2018-03-29 19:57:41 +01:00
c9c073eee4 Changes in messages in test dwf mixedprec 2018-03-23 11:27:56 +00:00
f290b2e908 Fix to pass CI tests 2018-03-23 11:14:23 +00:00
5f8225461b Fencing mixedcg test propagator write. LIME is still optional in Grid 2018-03-23 10:37:58 +00:00
20e186a1e0 Merge pull request #158 from goracle/dev-pull
Make compilation faster by moving print of git hash.
2018-03-22 10:45:17 +00:00
6ef4af989b Merge pull request #159 from goracle/dev-precsafe
Add dimension check to precisionChange.
2018-03-22 10:41:53 +00:00
ccde8b817f Add dimension check to precisionChange. 2018-03-21 20:58:04 -04:00
68168bf72d Revert "Add dimension match check to precisionChange."
This reverts commit 8f601d9b39.
2018-03-21 20:51:38 -04:00
e93d0feaa7 Merge branch 'dev-pull' of github.com:goracle/Grid into dev-pull 2018-03-21 20:39:30 -04:00
8f601d9b39 Add dimension match check to precisionChange. 2018-03-21 20:38:19 -04:00
5436308e4a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-21 14:26:29 +00:00
07fe7d0cbe Save file in current dir; print checksums 2018-03-21 14:26:04 +00:00
60b57706c4 Small bug fix in the shm file names 2018-03-21 13:57:30 +00:00
954e38bebe Put a username in the path 2018-03-20 18:16:15 +00:00
b1a38bde7a Extra test for Gparity with plaquette action 2018-03-20 18:01:32 +00:00
2581875edc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-19 18:00:08 +00:00
6c6d43eb4e Drop RB on coarse space ; that was a mistake 2018-03-17 09:35:01 +00:00
e1dcfd3553 typo fix 2018-03-16 23:10:47 +00:00
888838473a 4GB clean the offsets in parallel IO for multifile records 2018-03-16 21:54:56 +00:00
01568b0e62 Add a new SHM option 2018-03-16 21:54:28 +00:00
d5ce66f6ab Extra SHM option 2018-03-16 21:37:03 +00:00
d86936a3de Eliminating deprecated lex_sites 2018-03-16 12:26:39 +00:00
0fb84fa34b Make compilation faster by moving print of git hash. 2018-03-12 17:03:48 -04:00
0880747edb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-09 20:44:42 +00:00
b801e1fcd6 fclose should be called through a call to close() 2018-03-09 20:44:10 +00:00
360cface33 Grid tensor serialisation fully implemented and tested 2018-03-08 19:12:03 +00:00
80302e95a8 MILC Interface 2018-03-08 15:34:03 +00:00
caf2f6b274 Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-03-08 09:52:25 +00:00
c49be8988b Grid tensor serialisation 2018-03-08 09:51:22 +00:00
971c2379bd std::vector to tensor conversion + test units 2018-03-08 09:50:39 +00:00
94b0d66e4c Merge pull request #157 from goracle/dev-pull
Add print of the current git hash on Grid init.
2018-03-08 16:09:28 +09:00
5e8af396fd Add print of the current git hash on Grid init. 2018-03-07 13:11:51 -05:00
a7d19dbb64 Merge branch 'develop' of github.com:paboyle/Grid into develop
# Conflicts:
#	lib/serialisation/BaseIO.h
2018-03-07 15:13:54 +00:00
90dbe03e17 Conversion of Grid tensors to std::vector made more elegant, also pair syntax changed to (x y) to avoid issues with JSON/XML 2018-03-07 15:12:32 +00:00
8b14096990 Conversion of Grid tensors to std::vector made more elegant, also pair syntax changed to (x y) to avoid issues with JSON/XML 2018-03-07 15:12:18 +00:00
b938202081 Overlapped Comm for Wilson DhopInternal 2018-03-07 14:08:43 +00:00
485c5db0fe conversion of Grid tensors to nested std::vector in preparation for tensor serialisation 2018-03-06 19:22:03 +00:00
c399c2b44d Guido broke the charge conjugate plaquette action with premature optimisation.
This sector of the code does not matter for anything other than Guido's quenched HMC
studies, and any plaq specific optimisations should be retained in a private branch
instead of destroying the code simplicity.
2018-03-05 12:55:41 +00:00
af7de7a294 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-05 12:22:41 +00:00
1dc86efd26 Finalize protection 2018-03-05 12:22:18 +00:00
30391cb2eb Merge pull request #155 from fionnoh/develop
Some changes needed for deflation interface
2018-03-03 13:43:59 +00:00
2e88408f5c Some changes needed for deflation interface 2018-03-02 22:27:41 +00:00
0f468e2179 OverlappedComm for Staggered 5D and 4D. 2018-02-22 12:50:09 +00:00
4790e99817 Extra communicator free that I had missed.
Hard to audit them all as this is complex
2018-02-20 15:12:31 +00:00
2dd63aa7a4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-02-20 14:29:26 +00:00
559a501140 Deflation interface for solvers 2018-02-20 14:29:08 +00:00
945684c470 updates for deflation in the RB solver 2018-02-20 14:28:38 +00:00
e30a80a234 Relaxed constraints on MPI thread mode when not using multiple comms threads 2018-02-15 17:13:36 +00:00
c96483e3bd Whitespace only change 2018-02-13 11:39:07 +00:00
ae31a6a760 Move deflate to right class 2018-02-13 02:11:37 +00:00
dd8f2a64fe Interface to suit hadrons on Lanczos 2018-02-13 02:08:49 +00:00
7b8b2731e7 Conj error for complex coeffs 2018-02-12 16:06:31 +00:00
237a8ec918 Communicator leak fixed (I think) 2018-02-12 13:27:20 +00:00
896f3a8002 Fix to MPI for Hokusai system 2018-02-01 18:51:51 +00:00
f0fcdf75b5 Update README.md 2018-01-30 12:44:20 +01:00
53bffb83d4 Updating README with new SKL target 2018-01-30 12:42:36 +01:00
cd44e851f1 Fixing compilation error in FundtoHirep 2018-01-30 06:04:30 +01:00
fb24e3a7d2 Adding utilities for perf profiling 2018-01-29 11:11:45 +01:00
655a69259a Added support for GCC compilation for Skylake AVX512 2018-01-28 17:02:46 +01:00
507c4e9efc Correcting a missing semicolon in avx512 2018-01-27 10:59:55 +01:00
f8a5194c70 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-01-25 13:46:37 +01:00
cff3bae155 Adding support for general Nc in the benchmark outputs 2018-01-25 13:46:31 +01:00
6e3ce7423e Hadrons: don't display module list at startup (too long) 2018-01-22 20:04:05 +00:00
15f15a7cfd Merge branch 'develop' into feature/hadrons
# Conflicts:
#	extras/Hadrons/Modules.hpp
#	extras/Hadrons/modules.inc
2018-01-22 20:03:36 +00:00
0e5f626226 Hadrons: module for scalar operator divergence 2018-01-22 19:38:19 +00:00
97b9c6f03d No option for interior/exterior split of asm kernels since different directions get interleaved 2018-01-22 11:04:19 +00:00
63982819c6 No option to overlap comms and compute for asm implementation since different directions are interleaved
in the kernels, introducing if else structure would be too painful
2018-01-22 11:03:39 +00:00
b00d2d2c39 Correction of Representations compilation and small compilation error for Intel 17 2018-01-17 13:46:12 +00:00
f1b3e21830 Merge branch 'feature/clover' into develop 2018-01-17 10:07:42 +00:00
24162c9ead Staggered overlap comms compute 2018-01-09 13:02:52 +00:00
134 changed files with 8299 additions and 3665 deletions

1
.gitignore vendored
View File

@ -123,6 +123,7 @@ make-bin-BUCK.sh
#####################
lib/qcd/spin/gamma-gen/*.h
lib/qcd/spin/gamma-gen/*.cc
lib/version.h
# vs code editor files #
########################

View File

@ -19,6 +19,8 @@ before_install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
install:
- export CWD=`pwd`
- echo $CWD
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
@ -36,11 +38,22 @@ script:
- ./bootstrap.sh
- mkdir build
- cd build
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
- mkdir lime
- cd lime
- mkdir build
- cd build
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
- tar xf lime-1.3.2.tar.gz
- cd lime-1.3.2
- ./configure --prefix=$CWD/build/lime/install
- make -j4
- make install
- cd $CWD/build
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- echo make clean
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check

View File

@ -5,6 +5,10 @@ include $(top_srcdir)/doxygen.inc
bin_SCRIPTS=grid-config
BUILT_SOURCES = version.h
version.h:
echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d\\"%n" HEAD`" > $(srcdir)/lib/version.h
.PHONY: bench check tests doxygen-run doxygen-doc $(DX_PS_GOAL) $(DX_PDF_GOAL)

View File

@ -187,10 +187,11 @@ Alternatively, some CPU codenames can be directly used:
| `<code>` | Description |
| ----------- | -------------------------------------- |
| `KNL` | [Intel Xeon Phi codename Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |
| `SKL` | [Intel Skylake with AVX512 extensions](https://ark.intel.com/products/codename/37572/Skylake#@server) |
| `BGQ` | Blue Gene/Q |
#### Notes:
- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
- We currently support AVX512 for the Intel compiler and GCC (KNL and SKL targets). Support for clang will appear in a future version of Grid, once AVX512 support in that compiler is more advanced.
- For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
- BG/Q performance is currently rather poor. This is being investigated for future versions.
- The vector size for the `GEN` target can be specified with the `configure` script option `--enable-gen-simd-width`.

View File

@ -1,4 +1,4 @@
Version : 0.7.0
Version : 0.8.0
- Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
- MPI and MPI3 comms optimisations for KNL and OPA finished

108
benchmarks/Benchmark_IO.cc Normal file
View File

@ -0,0 +1,108 @@
#include <Grid/Grid.h>
#ifdef HAVE_LIME
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
// Shorthand for a log line: streams into cout, starting with GridLogMessage.
#define MSG cout << GridLogMessage
// Horizontal rule printed around the benchmark section titles.
#define SEP "============================================================================="
// Upper bound of the extent loop in main(); define BENCH_IO_LMAX on the
// compiler command line to override the default of 40.
#if !defined(BENCH_IO_LMAX)
#define BENCH_IO_LMAX 40
#endif
// Callback signatures used by the benchmark drivers.
// WriterFn: called as write(file stem, field to store).
using WriterFn = function<void(const string, LatticeFermion &)>;
// ReaderFn: called as read(field to fill, file stem).
using ReaderFn = function<void(LatticeFermion &, const string)>;
// Returns the extension-less file name for the benchmark data at extent l:
// the fixed prefix "iobench_l" followed by l written in decimal.
string filestem(const int l)
{
  string stem("iobench_l");
  stem += to_string(l);
  return stem;
}
void limeWrite(const string filestem, LatticeFermion &vec)
{
emptyUserRecord record;
ScidacWriter binWriter(vec._grid->IsBoss());
binWriter.open(filestem + ".bin");
binWriter.writeScidacFieldRecord(vec, record);
binWriter.close();
}
// Reads one SciDAC field record from the file "<filestem>.bin" into `vec`.
//
// vec      : field that receives the data.
// filestem : input file name without the ".bin" extension.
void limeRead(LatticeFermion &vec, const string filestem)
{
  const string path = filestem + ".bin";
  emptyUserRecord userRecord;   // receives the (empty) user record of the file
  ScidacReader    reader;

  reader.open(path);
  reader.readScidacFieldRecord(vec, userRecord);
  reader.close();
}
// Runs one write step of the benchmark for extent l.
//
// Builds a four-dimensional grid whose extent in direction d is
// l * GridDefaultMpi()[d], fills a LatticeFermion on it with random numbers
// and passes it to `write` together with filestem(l).
//
// l     : extent multiplier; reported in the log as the "local volume" l^4.
// write : callback performing the output, invoked as write(filestem(l), field).
void writeBenchmark(const int l, const WriterFn &write)
{
  auto mpi  = GridDefaultMpi();
  auto simd = GridDefaultSimd(Nd, vComplex::Nsimd());
  vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
  // The unique_ptr owns the grid returned by the factory and frees it on return.
  unique_ptr<GridCartesian> gPt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
  GridCartesian *g = gPt.get();
  GridParallelRNG rng(g);
  LatticeFermion  vec(g);

  // No user record or writer is created here: the callback constructs
  // whatever it needs, so such locals would be dead objects.
  cout << "-- Local volume " << l << "^4" << endl;
  random(rng, vec);
  write(filestem(l), vec);
}
// Runs one read step of the benchmark for extent l.
//
// Builds a four-dimensional grid whose extent in direction d is
// l * GridDefaultMpi()[d], allocates a LatticeFermion on it and passes it to
// `read` together with filestem(l).
//
// l    : extent multiplier; reported in the log as the "local volume" l^4.
// read : callback performing the input, invoked as read(field, filestem(l)).
void readBenchmark(const int l, const ReaderFn &read)
{
  auto mpi  = GridDefaultMpi();
  auto simd = GridDefaultSimd(Nd, vComplex::Nsimd());
  vector<int> latt = {l*mpi[0], l*mpi[1], l*mpi[2], l*mpi[3]};
  // The unique_ptr owns the grid returned by the factory and frees it on return.
  unique_ptr<GridCartesian> gPt(SpaceTimeGrid::makeFourDimGrid(latt, simd, mpi));
  GridCartesian *g = gPt.get();
  LatticeFermion vec(g);

  // No user record or reader is created here: the callback constructs
  // whatever it needs, so such locals would be dead objects.
  cout << "-- Local volume " << l << "^4" << endl;
  read(vec, filestem(l));
}
// Benchmark driver: for every even extent l from 4 up to BENCH_IO_LMAX,
// first writes a random fermion field through limeWrite, then reads each
// file back through limeRead.
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  int64_t threads = GridThread::GetThreads();

  MSG << "Grid is setup to use " << threads << " threads" << endl;
  MSG << SEP << endl;
  MSG << "Benchmark Lime write" << endl;
  MSG << SEP << endl;
  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
  {
    writeBenchmark(l, limeWrite);
  }

  // Frame the read section with the same rule as the write section.
  MSG << SEP << endl;
  MSG << "Benchmark Lime read" << endl;
  MSG << SEP << endl;
  // Same sequence of l as above, so each file read was produced by the write pass.
  for (int l = 4; l <= BENCH_IO_LMAX; l += 2)
  {
    readBenchmark(l, limeRead);
  }

  Grid_finalize();

  return EXIT_SUCCESS;
}
#else
// Stub entry point compiled when HAVE_LIME is not defined: there is nothing
// to benchmark, so the program exits successfully without doing any work.
int main (int, char **)
{
  return EXIT_SUCCESS;
}
#endif

View File

@ -158,8 +158,10 @@ public:
dbytes=0;
ncomm=0;
parallel_for(int dir=0;dir<8;dir++){
#ifdef GRID_OMP
#pragma omp parallel for num_threads(Grid::CartesianCommunicator::nCommThreads)
#endif
for(int dir=0;dir<8;dir++){
double tbytes;
int mu =dir % 4;
@ -175,9 +177,14 @@ public:
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
#ifdef GRID_OMP
int tid = omp_get_thread_num();
#else
int tid = dir;
#endif
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank,
bytes,dir);
bytes,tid);
#ifdef GRID_OMP
#pragma omp atomic

View File

@ -169,7 +169,11 @@ int main (int argc, char ** argv)
for(int lat=4;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
std::vector<int> latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
@ -446,7 +450,7 @@ int main (int argc, char ** argv)
}
#ifdef GRID_OMP
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
@ -485,7 +489,8 @@ int main (int argc, char ** argv)
dbytes=0;
ncomm=0;
parallel_for(int dir=0;dir<8;dir++){
#pragma omp parallel for num_threads(Grid::CartesianCommunicator::nCommThreads)
for(int dir=0;dir<8;dir++){
double tbytes;
int mu =dir % 4;
@ -502,9 +507,9 @@ int main (int argc, char ** argv)
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
int tid = omp_get_thread_num();
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
(void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
(void *)&rbuf[dir][0], recv_from_rank, bytes,tid);
#pragma omp atomic
dbytes+=tbytes;
@ -532,7 +537,7 @@ int main (int argc, char ** argv)
}
}
#endif
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;

View File

@ -48,7 +48,6 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
int Ls=16;
@ -57,6 +56,10 @@ int main (int argc, char ** argv)
std::stringstream ss(argv[i+1]); ss >> Ls;
}
GridLogLayout();
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -187,7 +190,7 @@ int main (int argc, char ** argv)
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
double flops=single_site_flops*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@ -226,7 +229,7 @@ int main (int argc, char ** argv)
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
double flops=single_site_flops*volume*ncall;
std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@ -277,7 +280,7 @@ int main (int argc, char ** argv)
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
double flops=single_site_flops*volume*ncall;
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@ -355,7 +358,7 @@ int main (int argc, char ** argv)
// sDw.stat.print();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
@ -478,7 +481,7 @@ int main (int argc, char ** argv)
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;

View File

@ -51,6 +51,7 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
@ -107,6 +108,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
@ -196,7 +198,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
if ( ! report ) {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
double flops=single_site_flops*volume*ncall;
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
}
@ -228,7 +230,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
if(!report){
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<< flops/(t1-t0);
}
}
@ -237,6 +239,7 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
#define CHECK_SDW
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
{
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@ -321,7 +324,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
Counter.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
double flops=single_site_flops*volume*ncall;
std::cout<<"\t"<< flops/(t1-t0);
}
@ -358,7 +361,7 @@ void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
CounterSdw.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<"\t"<< flops/(t1-t0);
}
}

View File

@ -107,7 +107,7 @@ int main (int argc, char ** argv)
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=2*1344*volume*ncall;
double flops=2*1320*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
@ -134,7 +134,7 @@ int main (int argc, char ** argv)
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=2*1344*volume*ncall;
double flops=2*1320*volume*ncall;
std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
@ -174,7 +174,7 @@ int main (int argc, char ** argv)
FGrid_d->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=2*1344*volume*ncall;
double flops=2*1320*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;

View File

@ -55,7 +55,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
uint64_t lmax=96;
uint64_t lmax=64;
#define NLOOP (10*lmax*lmax*lmax*lmax/vol)
for(int lat=8;lat<=lmax;lat+=8){

View File

@ -35,9 +35,11 @@ using namespace Grid::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
#define LMAX (64)
#define LMAX (32)
#define LMIN (16)
#define LINC (4)
int64_t Nloop=20;
int64_t Nloop=2000;
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
std::vector<int> mpi_layout = GridDefaultMpi();
@ -51,7 +53,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=2;lat<=LMAX;lat+=2){
for(int lat=LMIN;lat<=LMAX;lat+=LINC){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@ -83,7 +85,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=2;lat<=LMAX;lat+=2){
for(int lat=LMIN;lat<=LMAX;lat+=LINC){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@ -114,7 +116,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=2;lat<=LMAX;lat+=2){
for(int lat=LMIN;lat<=LMAX;lat+=LINC){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@ -145,7 +147,38 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=2;lat<=LMAX;lat+=2){
for(int lat=LMIN;lat<=LMAX;lat+=LINC){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeColourMatrix z(&Grid); random(pRNG,z);
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
mac(z,x,y);
}
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;
double bytes=3*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 CovShiftForward(z,x,y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=LMIN;lat<=LMAX;lat+=LINC){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
@ -157,18 +190,64 @@ int main (int argc, char ** argv)
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
mac(z,x,y);
for(int mu=0;mu<4;mu++){
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
z = PeriodicBC::CovShiftForward(x,mu,y);
}
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;
double bytes=3*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
double stop=usecond();
double time = (stop-start)/Nloop*1000.0;
double bytes=3*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(8+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
#if 1
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking SU3xSU3 z= x * Cshift(y)"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s\t\t GFlop/s"<<std::endl;
std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
for(int lat=LMIN;lat<=LMAX;lat+=LINC){
std::vector<int> latt_size ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
LatticeColourMatrix z(&Grid); random(pRNG,z);
LatticeColourMatrix x(&Grid); random(pRNG,x);
LatticeColourMatrix y(&Grid); random(pRNG,y);
LatticeColourMatrix tmp(&Grid);
for(int mu=0;mu<4;mu++){
double tshift=0;
double tmult =0;
double start=usecond();
for(int64_t i=0;i<Nloop;i++){
tshift-=usecond();
tmp = Cshift(y,mu,-1);
tshift+=usecond();
tmult-=usecond();
z = x*tmp;
tmult+=usecond();
}
double stop=usecond();
double time = (stop-start)/Nloop;
tshift = tshift/Nloop;
tmult = tmult /Nloop;
double bytes=3*vol*Nc*Nc*sizeof(Complex);
double flops=Nc*Nc*(6+8+8)*vol;
std::cout<<GridLogMessage<<std::setprecision(3) << "total us "<<time<<" shift "<<tshift <<" mult "<<tmult<<std::endl;
time = time * 1000; // convert to NS for GB/s
std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<" \t\t"<<bytes/time<<"\t\t" << flops/time<<std::endl;
}
}
#endif
Grid_finalize();
}

View File

@ -4,7 +4,7 @@
Source file: ./benchmarks/Benchmark_wilson.cc
Copyright (C) 2015
Copyright (C) 2018
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
@ -32,6 +32,9 @@ using namespace std;
using namespace Grid;
using namespace Grid::QCD;
#include "Grid/util/Profiling.h"
template<class d>
struct scal {
d internal;
@ -45,6 +48,7 @@ struct scal {
};
bool overlapComms = false;
bool perfProfiling = false;
int main (int argc, char ** argv)
{
@ -53,6 +57,12 @@ int main (int argc, char ** argv)
if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
overlapComms = true;
}
if( GridCmdOptionExists(argv,argv+argc,"--perf") ){
perfProfiling = true;
}
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
std::vector<int> latt_size = GridDefaultLatt();
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
@ -61,10 +71,15 @@ int main (int argc, char ** argv)
GridRedBlackCartesian RBGrid(&Grid);
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
GridLogLayout();
std::cout<<GridLogMessage << "Grid floating point word size is REALF"<< sizeof(RealF)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REALD"<< sizeof(RealD)<<std::endl;
std::cout<<GridLogMessage << "Grid floating point word size is REAL"<< sizeof(Real)<<std::endl;
std::cout<<GridLogMessage << "Grid number of colours : "<< QCD::Nc <<std::endl;
std::cout<<GridLogMessage << "Benchmarking Wilson operator in the fundamental representation" << std::endl;
std::vector<int> seeds({1,2,3,4});
GridParallelRNG pRNG(&Grid);
@ -134,9 +149,25 @@ int main (int argc, char ** argv)
Dw.Dhop(src,result,0);
}
double t1=usecond();
double flops=1344*volume*ncall;
double flops=single_site_flops*volume*ncall;
if (perfProfiling){
std::cout<<GridLogMessage << "Profiling Dw with perf"<<std::endl;
System::profile("kernel", [&]() {
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
});
std::cout<<GridLogMessage << "Generated kernel.data"<<std::endl;
std::cout<<GridLogMessage << "Use with: perf report -i kernel.data"<<std::endl;
}
std::cout<<GridLogMessage << "Called Dw"<<std::endl;
std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;

View File

@ -62,6 +62,7 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Number of colours "<< QCD::Nc <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
@ -69,13 +70,15 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage << "* OpenMP threads : "<< GridThread::GetThreads() <<std::endl;
std::cout << GridLogMessage << "* MPI tasks : "<< GridCmdVectorIntToString(mpi_layout) << std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
std::cout<<GridLogMessage << "============================================================================="<< std::endl;
std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
std::cout<<GridLogMessage << "= Benchmarking Wilson operator in the fundamental representation" << std::endl;
std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs\tWilsonEO/MFLOPs\tWilsonDagEO/MFLOPs" << std::endl;
std::cout<<GridLogMessage << "================================================================================================="<< std::endl;
int Lmax = 32;
int dmin = 0;
@ -97,13 +100,20 @@ int main (int argc, char ** argv)
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
LatticeGaugeField Umu(&Grid); random(pRNG,Umu);
LatticeFermion src(&Grid); random(pRNG,src);
LatticeFermion result(&Grid); result=zero;
LatticeFermion src(&Grid); random(pRNG,src);
LatticeFermion src_o(&RBGrid); pickCheckerboard(Odd,src_o,src);
LatticeFermion result(&Grid); result=zero;
LatticeFermion result_e(&RBGrid); result_e=zero;
double volume = std::accumulate(latt_size.begin(),latt_size.end(),1,std::multiplies<int>());
WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);
// Full operator
bench_wilson(src,result,Dw,volume,DaggerNo);
bench_wilson(src,result,Dw,volume,DaggerYes);
std::cout << "\t";
// EO
bench_wilson(src,result,Dw,volume,DaggerNo);
bench_wilson(src,result,Dw,volume,DaggerYes);
std::cout << std::endl;
@ -122,9 +132,26 @@ void bench_wilson (
int const dag )
{
int ncall = 1000;
long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);
double t0 = usecond();
for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
double t1 = usecond();
double flops = 1344 * volume * ncall;
double flops = single_site_flops * volume * ncall;
std::cout << flops/(t1-t0) << "\t\t";
}
// Times `ncall` back-to-back applications of Dw.DhopEO(src,result,dag) and
// prints the measured rate (flops divided by elapsed usecond() ticks) followed
// by two tabs, without a newline, so that the caller can build a table row.
//
// src, result : input / output fermion fields handed straight to DhopEO
// Dw          : Wilson operator being benchmarked
// volume      : number of lattice sites used to scale the flop count
// dag         : dagger flag forwarded to DhopEO
void bench_wilson_eo (
    LatticeFermion & src,
    LatticeFermion & result,
    WilsonFermionR & Dw,
    double const     volume,
    int const        dag )
{
  const int ncall = 1000;
  // Flop count attributed to one site for a single operator application.
  const long unsigned int single_site_flops = 8*QCD::Nc*(7+16*QCD::Nc);

  const double t_begin = usecond();
  for(int call=0; call<ncall; ++call) {
    Dw.DhopEO(src,result,dag);
  }
  const double t_end = usecond();

  // NOTE(review): the division by two presumably reflects that the even-odd
  // hop only touches one checkerboard (half of `volume`) - confirm.
  const double flops = (single_site_flops * volume * ncall)/2.0;
  std::cout << flops/(t_end-t_begin) << "\t\t";
}

View File

@ -249,6 +249,9 @@ case ${ax_cv_cxx_compiler_vendor} in
AVX512)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
SKL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for SkyLake Xeon])
SIMD_FLAGS='-march=skylake-avx512';;
KNC)
AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
SIMD_FLAGS='';;
@ -337,7 +340,7 @@ case ${ac_PRECISION} in
esac
###################### Shared memory allocation technique under MPI3
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|hugetlbfs],
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone],
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
case ${ac_SHM} in
@ -346,6 +349,14 @@ case ${ac_SHM} in
AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
;;
shmget)
AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
;;
shmnone)
AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
;;
hugetlbfs)
AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
;;
@ -359,7 +370,7 @@ esac
AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
[Select SHM mmap base path for hugetlbfs])],
[ac_SHMPATH=${enable_shmpath}],
[ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
[ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/])
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
############### communication type selection

View File

@ -43,12 +43,6 @@ using namespace Hadrons;
Application::Application(void)
{
initLogger();
LOG(Message) << "Modules available:" << std::endl;
auto list = ModuleFactory::getInstance().getBuilderList();
for (auto &m: list)
{
LOG(Message) << " " << m << std::endl;
}
auto dim = GridDefaultLatt(), mpi = GridDefaultMpi(), loc(dim);
locVol_ = 1;
for (unsigned int d = 0; d < dim.size(); ++d)

View File

@ -57,6 +57,7 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
#include <Grid/Hadrons/Modules/MAction/WilsonClover.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/Div.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TrMag.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TwoPoint.hpp>
#include <Grid/Hadrons/Modules/MScalarSUN/TrPhi.hpp>

View File

@ -171,7 +171,7 @@ void TMeson<FImpl1, FImpl2>::execute(void)
LOG(Message) << "Computing meson contractions '" << getName() << "' using"
<< " quarks '" << par().q1 << "' and '" << par().q2 << "'"
<< std::endl;
ResultWriter writer(RESULT_FILE_NAME(par().output));
std::vector<TComplex> buf;
std::vector<Result> result;

View File

@ -57,7 +57,7 @@ std::vector<std::string> TFundtoHirep<Rep>::getOutput(void)
template <typename Rep>
void TFundtoHirep<Rep>::setup(void)
{
env().template registerLattice<typename Rep::LatticeField>(getName());
envCreateLat(typename Rep::LatticeField, getName());
}
// execution ///////////////////////////////////////////////////////////////////
@ -70,6 +70,6 @@ void TFundtoHirep<Rep>::execute(void)
Rep TargetRepresentation(U._grid);
TargetRepresentation.update_representation(U);
typename Rep::LatticeField &URep = *env().template createLattice<typename Rep::LatticeField>(getName());
auto &URep = envGet(typename Rep::LatticeField, getName());
URep = TargetRepresentation.U;
}

View File

@ -0,0 +1,166 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: extras/Hadrons/Modules/MScalarSUN/Div.hpp
Copyright (C) 2015-2018
Author: Antonin Portelli <antonin.portelli@me.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef Hadrons_MScalarSUN_Div_hpp_
#define Hadrons_MScalarSUN_Div_hpp_
#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>
#include <Grid/Hadrons/ModuleFactory.hpp>
BEGIN_HADRONS_NAMESPACE
/******************************************************************************
* Div *
******************************************************************************/
BEGIN_MODULE_NAMESPACE(MScalarSUN)
// Parameter set of the Div module. Both the enum and the members are declared
// through Grid's serialisation macros.
class DivPar: Serializable
{
public:
// Selector for the finite-difference scheme used by TDiv::execute.
// NOTE(review): the arguments after `undef` look like (name, value) pairs,
// i.e. forward=1, backward=2, central=3 - confirm against the macro definition.
GRID_SERIALIZABLE_ENUM(DiffType, undef, forward, 1, backward, 2, central, 3);
// op     : names of the input objects (TDiv::setup requires one per dimension)
// type   : difference scheme, see DiffType
// output : forwarded to RESULT_FILE_NAME; no result file is written when empty
GRID_SERIALIZABLE_CLASS_MEMBERS(DivPar,
std::vector<std::string>, op,
DiffType, type,
std::string, output);
};
// Hadrons module that accumulates a finite-difference divergence of the
// complex fields named in DivPar::op into a ComplexField registered under the
// module's own name. SImpl supplies the field types.
template <typename SImpl>
class TDiv: public Module<DivPar>
{
public:
    using Field        = typename SImpl::Field;
    using ComplexField = typename SImpl::ComplexField;

    // Record serialised to the result file by execute(): the difference
    // scheme that was used and the lattice sum of the divergence.
    class Result: Serializable
    {
    public:
        GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                        DivPar::DiffType, type,
                                        Complex,          value);
    };

    // construction / destruction
    TDiv(const std::string name);
    virtual ~TDiv() = default;

    // names of the objects this module reads and produces
    virtual std::vector<std::string> getInput();
    virtual std::vector<std::string> getOutput();

    // allocate the output field
    virtual void setup();

    // compute the divergence
    virtual void execute();
};
// Register one TDiv instantiation per ScalarNxNAdjImplR<N>, N = 2..6, under
// the names DivSU2 .. DivSU6 in the MScalarSUN module namespace.
MODULE_REGISTER_NS(DivSU2, TDiv<ScalarNxNAdjImplR<2>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU3, TDiv<ScalarNxNAdjImplR<3>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU4, TDiv<ScalarNxNAdjImplR<4>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU5, TDiv<ScalarNxNAdjImplR<5>>, MScalarSUN);
MODULE_REGISTER_NS(DivSU6, TDiv<ScalarNxNAdjImplR<6>>, MScalarSUN);
/******************************************************************************
* TDiv implementation *
******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
// Forwards the module name to the Module<DivPar> base; no other state is set.
template <typename SImpl>
TDiv<SImpl>::TDiv(const std::string name) : Module<DivPar>(name)
{
}
// dependencies/products ///////////////////////////////////////////////////////
// The module's inputs are exactly the object names listed in DivPar::op.
template <typename SImpl>
std::vector<std::string> TDiv<SImpl>::getInput(void)
{
    std::vector<std::string> in = par().op;

    return in;
}
// The module produces a single object, stored under the module's own name.
template <typename SImpl>
std::vector<std::string> TDiv<SImpl>::getOutput(void)
{
    return {getName()};
}
// setup ///////////////////////////////////////////////////////////////////////
// Checks that one operator name was supplied per dimension, then creates the
// ComplexField that execute() fills, registered under the module name.
template <typename SImpl>
void TDiv<SImpl>::setup(void)
{
    const bool sizeMismatch = (par().op.size() != env().getNd());

    if (sizeMismatch)
    {
        HADRON_ERROR(Size, "the number of components differs from number of dimensions");
    }
    envCreateLat(ComplexField, getName());
}
// execution ///////////////////////////////////////////////////////////////////
// Computes the divergence of the fields named in par().op and stores it in
// the ComplexField registered under the module name.
//
// For every direction mu the selected finite difference of op[mu] along mu is
// added to the output:
//   backward : op(x) - Cshift(op, mu, -1)
//   forward  : Cshift(op, mu, 1) - op(x)
//   central  : 0.5*(Cshift(op, mu, 1) - Cshift(op, mu, -1))
// Any other value of par().type (e.g. `undef`) is rejected with an error
// instead of silently leaving the output at zero.
//
// When par().output is non-empty, the scheme and the lattice sum of the
// divergence are written to RESULT_FILE_NAME(par().output) under the key "div".
template <typename SImpl>
void TDiv<SImpl>::execute(void)
{
    const auto nd = env().getNd();

    LOG(Message) << "Computing the " << par().type << " divergence of [";
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
        std::cout << par().op[mu] << ((mu == nd - 1) ? "]" : ", ");
    }
    std::cout << std::endl;

    auto &div = envGet(ComplexField, getName());
    div = zero;
    for (unsigned int mu = 0; mu < nd; ++mu)
    {
        auto &op = envGet(ComplexField, par().op[mu]);
        switch(par().type)
        {
            case DivPar::DiffType::backward:
                div += op - Cshift(op, mu, -1);
                break;
            case DivPar::DiffType::forward:
                div += Cshift(op, mu, 1) - op;
                break;
            case DivPar::DiffType::central:
                div += 0.5*(Cshift(op, mu, 1) - Cshift(op, mu, -1));
                break;
            default:
                // An unset or unknown scheme would otherwise yield div == 0
                // and a meaningless result file.
                HADRON_ERROR(Argument, "derivative type invalid (expected forward, backward or central)");
                break;
        }
    }
    if (!par().output.empty())
    {
        Result       r;
        ResultWriter writer(RESULT_FILE_NAME(par().output));

        r.type  = par().type;
        r.value = TensorRemove(sum(div));
        write(writer, "div", r);
    }
}
END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
#endif // Hadrons_MScalarSUN_Div_hpp_

View File

@ -44,6 +44,7 @@ modules_hpp =\
Modules/MAction/DWF.hpp \
Modules/MAction/Wilson.hpp \
Modules/MAction/WilsonClover.hpp \
Modules/MScalarSUN/Div.hpp \
Modules/MScalarSUN/TrMag.hpp \
Modules/MScalarSUN/TwoPoint.hpp \
Modules/MScalarSUN/TrPhi.hpp \

View File

@ -39,6 +39,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>

View File

@ -51,7 +51,7 @@ namespace Grid {
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
virtual void HermOp(const Field &in, Field &out)=0;
};
@ -309,36 +309,59 @@ namespace Grid {
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
Field tmp;
RealD mass;
double tMpc;
double tIP;
double tMeo;
double taxpby_norm;
uint64_t ncall;
public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
void Report(void)
{
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
}
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
assert( _Mat.isTrivialEE() );
mass = _Mat.Mass();
tMpc=0;
tIP =0;
tMeo=0;
taxpby_norm=0;
ncall=0;
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
GridLogIterative.TimingMode(1);
std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
ncall++;
tMpc-=usecond();
n2 = Mpc(in,out);
std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
tMpc+=usecond();
tIP-=usecond();
ComplexD dot= innerProduct(in,out);
std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
tIP+=usecond();
n1 = real(dot);
}
virtual void HermOp(const Field &in, Field &out){
std::cout << GridLogIterative << " HermOp "<<std::endl;
Mpc(in,out);
ncall++;
tMpc-=usecond();
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp);
tMpc+=usecond();
taxpby_norm-=usecond();
axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
}
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
Field tmp2(in._grid);
std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
_Mat.Mooee(in,out);
_Mat.Mooee(out,tmp);
std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
tMeo-=usecond();
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp2);
std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;
RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
_Mat.Meooe(out,tmp);
tMeo+=usecond();
taxpby_norm-=usecond();
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
return nn;
}
virtual RealD MpcDag (const Field &in, Field &out){

View File

@ -54,6 +54,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.checkerboard = src.checkerboard;
conformable(psi, src);
@ -69,7 +70,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
@ -96,38 +96,44 @@ class ConjugateGradient : public OperatorFunction<Field> {
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
GridStopWatch LinearCombTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
for (k = 1; k <= MaxIterations*1000; k++) {
c = cp;
MatrixTimer.Start();
Linop.HermOpAndNorm(p, mmp, d, qq);
Linop.HermOp(p, mmp);
MatrixTimer.Stop();
LinalgTimer.Start();
// RealD qqck = norm2(mmp);
// ComplexD dck = innerProduct(p,mmp);
InnerTimer.Start();
ComplexD dc = innerProduct(p,mmp);
InnerTimer.Stop();
d = dc.real();
a = c / d;
b_pred = a * (a * qq - d) / c;
AxpyNormTimer.Start();
cp = axpy_norm(r, -a, mmp, r);
AxpyNormTimer.Stop();
b = cp / c;
// Fuse these loops ; should be really easy
psi = a * p + psi;
p = p * b + r;
LinearCombTimer.Start();
parallel_for(int ss=0;ss<src._grid->oSites();ss++){
vstream(psi[ss], a * p[ss] + psi[ss]);
vstream(p [ss], b * p[ss] + r[ss]);
}
LinearCombTimer.Stop();
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
// Stopping condition
if (cp <= rsq) {
@ -148,6 +154,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

View File

@ -43,6 +43,7 @@ namespace Grid {
public:
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
int verbose;
MultiShiftFunction shifts;
@ -163,7 +164,16 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer;
GridStopWatch ShiftTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
// Iteration loop
int k;
@ -171,7 +181,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
for (k=1;k<=MaxIterations;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p,a,p,r);
AXPYTimer.Stop();
// Note to self - direction ps is iterated separately
// for each shift. Does not appear to have any scope
@ -180,6 +192,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
// However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
@ -190,22 +203,34 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
}
}
}
AXPYTimer.Stop();
cp=c;
MatrixTimer.Start();
//Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
// The below is faster on KNL
Linop.HermOp(p,mmp);
d=real(innerProduct(p,mmp));
Linop.HermOpAndNorm(p,mmp,d,qq);
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp,mass[0],p,mmp);
AXPYTimer.Stop();
RealD rn = norm2(p);
d += rn*mass[0];
bp=b;
b=-cp/d;
AXPYTimer.Start();
c=axpy_norm(r,b,mmp,r);
AXPYTimer.Stop();
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
@ -215,6 +240,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
for(int s=0;s<nshift;s++){
int ss = s;
@ -257,6 +283,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
if ( all_converged ){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
@ -269,8 +298,19 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
}
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;

View File

@ -0,0 +1,101 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/Deflation.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DEFLATION_H
#define GRID_DEFLATION_H
namespace Grid {
// Initial-guess functor that ignores the source and sets the guess to Zero().
struct ZeroGuesser {
  template<class Field>
  void operator()(const Field &src, Field &guess)
  {
    guess = Zero();
  }
};
// Initial-guess functor that copies the source into the guess.
struct SourceGuesser {
  template<class Field>
  void operator()(const Field &src, Field &guess)
  {
    guess = src;
  }
};
////////////////////////////////
// Fine grid deflation
////////////////////////////////
template<class Field>
struct DeflatedGuesser {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
void operator()(const Field &src,Field &guess) {
guess = zero;
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
}
};
template<class FineField, class CoarseField>
class LocalCoherenceDeflatedGuesser {
private:
const std::vector<FineField> &subspace;
const std::vector<CoarseField> &evec_coarse;
const std::vector<RealD> &eval_coarse;
public:
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
: subspace(_subspace),
evec_coarse(_evec_coarse),
eval_coarse(_eval_coarse)
{
}
void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid);
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
}
blockPromote(guess_coarse,guess,subspace);
};
};
}
#endif

View File

@ -57,8 +57,10 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
parallel_region
{
std::vector < vobj > B(Nm); // Thread private
Vector < vobj > B; // Thread private
PARALLEL_CRITICAL { B.resize(Nm); }
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
for(int j=j0; j<j1; ++j) B[j]=0.;
@ -149,19 +151,6 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = zero;
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
@ -181,6 +170,7 @@ enum IRLdiagonalisation {
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
{
public:
LinearFunction<Field> &_HermOp;
ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp) { };
int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
@ -243,6 +233,7 @@ class ImplicitlyRestartedLanczos {
/////////////////////////
public:
//////////////////////////////////////////////////////////////////
// PAB:
//////////////////////////////////////////////////////////////////
@ -490,15 +481,13 @@ until convergence
Field B(grid); B.checkerboard = evec[0].checkerboard;
// power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1;
for(int jj = 1; jj<=Nstop; jj*=2){
int j = Nstop-jj;
RealD e = eval2_copy[j]; // Discard the evalue
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
if( _Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
if ( j > Nconv ) {
Nconv=j+1;
jj=Nstop; // Terminate the scan
}
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
allconv=0;
}
}
// Do evec[0] for good measure
@ -506,8 +495,10 @@ until convergence
int j=0;
RealD e = eval2_copy[0];
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox);
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
}
if ( allconv ) Nconv = Nstop;
// test if we converged, if so, terminate
std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
// if( Nconv>=Nstop || beta_k < betastp){

View File

@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid {
struct LanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
@ -45,6 +48,7 @@ struct LanczosParams : Serializable {
struct LocalCoherenceLanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
bool, saveEvecs,
bool, doFine,
bool, doFineRead,
bool, doCoarse,
@ -70,21 +74,24 @@ public:
typedef Lattice<Fobj> FineField;
LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
std::vector<FineField> &subspace;
ProjectedHermOp(LinearOperatorBase<FineField>& linop, Aggregation<Fobj,CComplex,nbasis> &aggregate) :
_Linop(linop),
_Aggregate(aggregate) { };
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), subspace(_subspace)
{
assert(subspace.size() >0);
};
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard= checkerboard;
FineField fout(FineGrid); fout.checkerboard = checkerboard;
GridBase *FineGrid = _Aggregate.FineGrid;
FineField fin(FineGrid);
FineField fout(FineGrid);
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
}
};
@ -99,24 +106,27 @@ public:
OperatorFunction<FineField> & _poly;
LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
std::vector<FineField> &subspace;
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,LinearOperatorBase<FineField>& linop,
Aggregation<Fobj,CComplex,nbasis> &aggregate) :
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
LinearOperatorBase<FineField>& linop,
std::vector<FineField> & _subspace) :
_poly(poly),
_Linop(linop),
_Aggregate(aggregate) { };
subspace(_subspace)
{ };
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = _Aggregate.FineGrid;
FineField fin(FineGrid) ;fin.checkerboard =_Aggregate.checkerboard;
FineField fout(FineGrid);fout.checkerboard =_Aggregate.checkerboard;
_Aggregate.PromoteFromSubspace(in,fin); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard =checkerboard;
FineField fout(FineGrid);fout.checkerboard =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
_Aggregate.ProjectToSubspace(out,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
}
};
@ -132,19 +142,23 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop;
Aggregation<Fobj,CComplex,nbasis> &_Aggregate;
RealD _coarse_relax_tol;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
Aggregation<Fobj,CComplex,nbasis> &Aggregate,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop),_Aggregate(Aggregate), _Poly(Poly), _coarse_relax_tol(coarse_relax_tol) { };
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
{ };
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
CoarseField v(B);
RealD eval_poly = eval;
// Apply operator
_Poly(B,v);
@ -168,14 +182,13 @@ class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanc
}
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
GridBase *FineGrid = _Aggregate.FineGrid;
int checkerboard = _Aggregate.checkerboard;
GridBase *FineGrid = _subspace[0]._grid;
int checkerboard = _subspace[0].checkerboard;
FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard;
_Aggregate.PromoteFromSubspace(B,fv);
blockPromote(B,fv,_subspace);
_smoother(_Linop,fv,fB);
RealD eval_poly = eval;
@ -217,27 +230,65 @@ protected:
int _checkerboard;
LinearOperatorBase<FineField> & _FineOp;
// FIXME replace Aggregation with vector of fine; the code reuse is too small for
// the hassle and complexity of cross coupling.
Aggregation<Fobj,CComplex,nbasis> _Aggregate;
std::vector<RealD> evals_fine;
std::vector<RealD> evals_coarse;
std::vector<CoarseField> evec_coarse;
std::vector<RealD> &evals_fine;
std::vector<RealD> &evals_coarse;
std::vector<FineField> &subspace;
std::vector<CoarseField> &evec_coarse;
private:
std::vector<RealD> _evals_fine;
std::vector<RealD> _evals_coarse;
std::vector<FineField> _subspace;
std::vector<CoarseField> _evec_coarse;
public:
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard) :
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_Aggregate(CoarseGrid,FineGrid,checkerboard),
_FineOp(FineOp),
_checkerboard(checkerboard)
_checkerboard(checkerboard),
evals_fine (_evals_fine),
evals_coarse(_evals_coarse),
subspace (_subspace),
evec_coarse(_evec_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
void Orthogonalise(void ) { _Aggregate.Orthogonalise(); }
//////////////////////////////////////////////////////////////////////////
// Alternate constructor, external storage for use by Hadrons module
//
// The reference members evals_fine, evals_coarse, subspace and evec_coarse
// are bound to the caller supplied ext_* vectors instead of the private
// _evals_fine/_evals_coarse/_subspace/_evec_coarse members.
//
// FineGrid, CoarseGrid : grids stored in _FineGrid / _CoarseGrid
// FineOp               : fine operator stored by reference in _FineOp
// checkerboard         : stored in _checkerboard
// ext_subspace         : caller storage bound to subspace
// ext_coarse           : caller storage bound to evec_coarse
// ext_eval_fine        : caller storage bound to evals_fine
// ext_eval_coarse      : caller storage bound to evals_coarse
//////////////////////////////////////////////////////////////////////////
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard,
std::vector<FineField> &ext_subspace,
std::vector<CoarseField> &ext_coarse,
std::vector<RealD> &ext_eval_fine,
std::vector<RealD> &ext_eval_coarse
) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (ext_eval_fine),
evals_coarse(ext_eval_coarse),
subspace (ext_subspace),
evec_coarse (ext_coarse)
{
// Both caller supplied eigenvalue vectors are emptied here; the subspace and
// coarse eigenvector vectors are left as passed in.
evals_fine.resize(0);
evals_coarse.resize(0);
};
void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid);
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
};
template<typename T> static RealD normalise(T& v)
{
@ -246,43 +297,44 @@ public:
v = v * (1.0/nn);
return nn;
}
/*
void fakeFine(void)
{
int Nk = nbasis;
_Aggregate.subspace.resize(Nk,_FineGrid);
_Aggregate.subspace[0]=1.0;
_Aggregate.subspace[0].checkerboard=_checkerboard;
normalise(_Aggregate.subspace[0]);
subspace.resize(Nk,_FineGrid);
subspace[0]=1.0;
subspace[0].checkerboard=_checkerboard;
normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){
_Aggregate.subspace[k].checkerboard=_checkerboard;
Op(_Aggregate.subspace[k-1],_Aggregate.subspace[k]);
normalise(_Aggregate.subspace[k]);
subspace[k].checkerboard=_checkerboard;
Op(subspace[k-1],subspace[k]);
normalise(subspace[k]);
}
}
*/
void testFine(RealD resid)
{
assert(evals_fine.size() == nbasis);
assert(_Aggregate.subspace.size() == nbasis);
assert(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){
assert(SimpleTester.ReconstructEval(k,resid,_Aggregate.subspace[k],evals_fine[k],1.0)==1);
assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
}
}
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{
assert(evals_fine.size() == nbasis);
assert(_Aggregate.subspace.size() == nbasis);
assert(subspace.size() == nbasis);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_Aggregate);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,_subspace);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
for(int k=0;k<evec_coarse.size();k++){
if ( k < nbasis ) {
@ -302,34 +354,34 @@ public:
PlainHermOp<FineField> Op(_FineOp);
evals_fine.resize(Nm);
_Aggregate.subspace.resize(Nm,_FineGrid);
subspace.resize(Nm,_FineGrid);
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
int Nconv;
IRL.calc(evals_fine,_Aggregate.subspace,src,Nconv,false);
IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved
assert(Nstop>=nbasis);
assert(Nconv>=nbasis);
evals_fine.resize(nbasis);
_Aggregate.subspace.resize(nbasis,_FineGrid);
subspace.resize(nbasis,_FineGrid);
}
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,_Aggregate);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_Aggregate);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,_subspace);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,_subspace);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_Aggregate,relax);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,_subspace,relax);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);

View File

@ -107,7 +107,12 @@ namespace Grid {
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
@ -129,7 +134,6 @@ namespace Grid {
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
/////////////////////////////////////////////////////
@ -146,6 +150,7 @@ namespace Grid {
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
guess(src_o,sol_o);
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl;
@ -189,7 +194,12 @@ namespace Grid {
CBfactorise=cb;
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
@ -225,6 +235,7 @@ namespace Grid {
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
guess(src_o,sol_o);
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
///////////////////////////////////////////////////
@ -268,7 +279,12 @@ namespace Grid {
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix,class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
@ -305,6 +321,7 @@ namespace Grid {
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
guess(src_o,tmp);
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
@ -347,7 +364,12 @@ namespace Grid {
};
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
@ -385,6 +407,7 @@ namespace Grid {
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
guess(src_o,tmp);
_HermitianRBSolver(src_o,tmp); assert(tmp.checkerboard==Odd);
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);

View File

@ -277,7 +277,9 @@ public:
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
#ifdef GRID_OMP
#pragma omp parallel for schedule(static)
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}

View File

@ -44,11 +44,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
assert (provided == MPI_THREAD_MULTIPLE);
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
assert(0);
}
Grid_quiesce_nodes();
// Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
GlobalSharedMemory::Init(communicator_world);
@ -85,9 +89,17 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
MPI_Comm optimal_comm;
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm); // Remap using the shared memory optimising routine
////////////////////////////////////////////////////
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////////
MPI_Comm_free(&optimal_comm);
}
//////////////////////////////////
@ -183,8 +195,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
} else {
srank = 0;
comm_split = parent.communicator;
// std::cout << " Inherited communicator " <<comm_split <<std::endl;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
assert(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -196,6 +208,11 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
// Take the right SHM buffers
//////////////////////////////////////////////////////////////////////////////////////////////////////
SetCommunicator(comm_split);
///////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////
MPI_Comm_free(&comm_split);
if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
@ -210,6 +227,9 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo
////////////////////////////////////////////////////
_ndimension = processors.size();
_processor_coor.resize(_ndimension);

View File

@ -133,6 +133,7 @@ class SharedMemory
public:
SharedMemory() {};
~SharedMemory();
///////////////////////////////////////////////////////////////////////////////////////
// set the buffers & sizes
///////////////////////////////////////////////////////////////////////////////////////

View File

@ -27,6 +27,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
#include <pwd.h>
namespace Grid {
@ -113,19 +114,151 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
assert(WorldNode!=-1);
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
// Gray encode support
// Convert a binary coded integer to reflected Gray code: every output bit is
// the XOR of the matching input bit with the next more significant input bit.
int BinaryToGray (int binary) {
  return binary ^ (binary >> 1);
}
int Log2Size(int TwoToPower,int MAXLOG2)
{
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = -1;
for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
if ( (0x1<<i) == WorldShmSize ) {
for(int i=0;i<=MAXLOG2;i++){
if ( (0x1<<i) == TwoToPower ) {
log2size = i;
break;
}
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
#undef HYPERCUBE
#ifdef HYPERCUBE
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify the hypercube coordinate of this node using hostname
////////////////////////////////////////////////////////////////
// n runs 0...7 9...16 18...25 27...34 (8*4) 5 bits
// i runs 0..7 3 bits
// r runs 0..3 2 bits
// 2^10 = 1024 nodes
const int maxhdim = 10;
std::vector<int> HyperCubeCoords(maxhdim,0);
std::vector<int> RootHyperCubeCoords(maxhdim,0);
int R;
int I;
int N;
const int namelen = _POSIX_HOST_NAME_MAX;
char name[namelen];
// Parse ICE-XA hostname to get hypercube location
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
assert(nscan==3);
int nlo = N%9;
int nhi = N/9;
uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
uint32_t rootcoor = hypercoor;
//////////////////////////////////////////////////////////////////
// Print debug info
//////////////////////////////////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
std::string hname(name);
std::cout << "hostname "<<hname<<std::endl;
std::cout << "R " << R << " I " << I << " N "<< N<<
<< " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
//////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition.
//////////////////////////////////////////////////////////////////
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
hypercoor=hypercoor-rootcoor;
assert(hypercoor<WorldSize);
assert(hypercoor>=0);
//////////////////////////////////////
// Printing
//////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<ndimension;d++){
NodeDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Map Hcube according to physical lattice
// must partition. Loop over dims and find out who would join.
////////////////////////////////////////////////////////////////
int hcoor = hypercoor;
for(int d=0;d<ndimension;d++){
int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
int msk = (0x1<<bits)-1;
HyperCoor[d]=hcoor & msk;
HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
hcoor = hcoor >> bits;
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
int Nprocessors=1;
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
int rank;
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
/////////////////////////////////////////////////////////////////
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#else
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
@ -174,14 +307,77 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
// shmget based allocation of the per-node shared communication buffers.
//
// bytes : size of each of the WorldShmSize segments
// flags : not read by this implementation; huge page use is taken from
//         Hugepages instead.
//         NOTE(review): other variants of this routine test `flags` for huge
//         pages - confirm this difference is intended.
//
// WorldShmRank 0 creates one segment per rank in WorldShmComm, the ids are
// broadcast from rank 0, every rank attaches all segments, and the segments
// are then marked for removal with IPC_RMID. Exits the process if shmget or
// shmat fails.
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);

  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared windows for our group
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  MPI_Barrier(WorldShmComm);
  WorldShmCommBufs.resize(WorldShmSize);
  std::vector<int> shmids(WorldShmSize);

  if ( WorldShmRank == 0 ) {
    for(int r=0;r<WorldShmSize;r++){
      size_t size = bytes;
      key_t key   = IPC_PRIVATE;
      // Named shm_flags so that it no longer shadows the `flags` parameter
      int shm_flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
      if (Hugepages) shm_flags|=SHM_HUGETLB;
#endif
      if ((shmids[r]= shmget(key,size, shm_flags)) ==-1) {
        int errsv = errno;
        printf("Errno %d\n",errsv);
        printf("key   %d\n",(int)key);
        printf("size  %zu\n",size);   // size_t requires %zu, not %lld
        printf("flags %d\n",shm_flags);
        errno = errsv;                // printf may have modified errno; restore it for perror
        perror("shmget");
        exit(1);
      }
    }
  }
  MPI_Barrier(WorldShmComm);
  MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
  MPI_Barrier(WorldShmComm);

  // Every rank attaches every segment
  for(int r=0;r<WorldShmSize;r++){
    WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
    if (WorldShmCommBufs[r] == (uint64_t *)-1) {
      perror("Shared memory attach failure");
      shmctl(shmids[r], IPC_RMID, NULL);
      exit(2);
    }
  }
  MPI_Barrier(WorldShmComm);

  ///////////////////////////////////
  // Mark for clean up
  ///////////////////////////////////
  for(int r=0;r<WorldShmSize;r++){
    shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
  }
  MPI_Barrier(WorldShmComm);

  _ShmAlloc=1;
  _ShmAllocBytes = bytes;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -191,7 +387,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbf and others map filesystems as mappable huge pages
// Hugetlbfs and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
for(int r=0;r<WorldShmSize;r++){
@ -218,6 +414,49 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
int fd=-1;
int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@ -232,6 +471,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
@ -243,7 +483,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
size_t size = bytes;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
@ -259,7 +500,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
if ( ptr == (void * )MAP_FAILED ) { perror("failed mmap"); assert(0); }
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
@ -274,7 +519,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
size_t size = bytes ;
sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldNode,r);
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
int fd=shm_open(shm_name,O_RDWR,0666);
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
@ -292,6 +538,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
}
#endif
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@ -318,11 +567,12 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
heap_size = GlobalSharedMemory::ShmAllocBytes();
for(int r=0;r<ShmSize;r++){
uint32_t sr = (r==ShmRank) ? GlobalSharedMemory::WorldRank : 0 ;
uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
MPI_Allreduce(MPI_IN_PLACE,&sr,1,MPI_UINT32_T,MPI_SUM,comm);
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[sr];
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
}
ShmBufferFreeAll();
@ -391,5 +641,12 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
return (void *) remote;
}
}
// Frees the ShmComm communicator held by this object.
// The free is skipped once MPI reports that it has been finalised, so that
// destructors running after MPI_Finalize do not make further MPI calls.
SharedMemory::~SharedMemory()
{
  int finalised = 0;
  MPI_Finalized(&finalised);
  if ( finalised ) return;
  MPI_Comm_free(&ShmComm);
}
}

View File

@ -122,5 +122,7 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
return NULL;
}
// Empty destructor: this variant of SharedMemory releases nothing here.
SharedMemory::~SharedMemory() {}
}

View File

@ -45,31 +45,33 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int ent = 0;
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int bo = n*e2;
buffer[off+bo+b]=rhs._odata[so+o+b];
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
}
}
} else {
int bo=0;
std::vector<std::pair<int,int> > table;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) {
table.push_back(std::pair<int,int> (bo++,o+b));
table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
}
}
}
parallel_for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=rhs._odata[so+table[i].second];
}
}
parallel_for(int i=0;i<ent;i++){
buffer[table[i].first]=rhs._odata[table[i].second];
}
}
@ -140,31 +142,35 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent =0;
if ( cbmask ==0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
rhs._odata[so+o+b]=buffer[bo+b];
table[ent++] = std::pair<int,int>(so+o+b,bo+b);
}
}
} else {
std::vector<std::pair<int,int> > table;
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
table.push_back(std::pair<int,int> (so+o+b,bo++));
table[ent++]=std::pair<int,int> (so+o+b,bo++);
}
}
}
parallel_for(int i=0;i<table.size();i++){
// std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
rhs._odata[table[i].first]=buffer[table[i].second];
}
}
parallel_for(int i=0;i<ent;i++){
rhs._odata[table[i].first]=buffer[table[i].second];
}
}
@ -228,29 +234,32 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension];
int stride = rhs._grid->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
if(cbmask == 0x3 ){
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
} else {
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
}
}
parallel_for(int i=0;i<ent;i++){
lhs._odata[table[i].first]=rhs._odata[table[i].second];
}
}
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
@ -269,16 +278,28 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int e2=rhs._grid->_slice_block [dimension];
int stride = rhs._grid->_slice_stride[dimension];
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
double t_tab,t_perm;
if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
}
}}
parallel_for(int i=0;i<ent;i++){
permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
}
}
//////////////////////////////////////////////////////
@ -291,6 +312,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
double t_local;
if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3);
} else {
@ -299,7 +322,7 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
}
}
template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid = rhs._grid;
int fd = grid->_fdimensions[dimension];
@ -325,11 +348,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
// FIXME : This must change where we have a
// Rotate slice.
// Document how this works ; why didn't I do this when I first wrote it...
// wrap is whether sshift > rd.
// num is sshift mod rd.
//
@ -365,10 +384,8 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
return ret;
}
}
#endif

View File

@ -54,13 +54,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
if ( !comm_dim ) {
// std::cout << "Cshift_local" <<std::endl;
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
// std::cout << "Cshift_comms_simd" <<std::endl;
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
// std::cout << "Cshift_comms" <<std::endl;
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
return ret;
@ -91,9 +91,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
//std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
//std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
@ -175,6 +178,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);

View File

@ -244,19 +244,11 @@ namespace Grid {
template<class sobj,class vobj> strong_inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
axpy(ret,a,x,y);
return norm2(ret);
return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> strong_inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.checkerboard = x.checkerboard;
conformable(ret,x);
conformable(x,y);
axpby(ret,a,b,x,y);
return norm2(ret); // FIXME implement parallel norm in ss loop
return axpby_norm_fast(ret,a,b,x,y);
}
}

View File

@ -256,9 +256,42 @@ public:
_odata[ss]=r._odata[ss];
}
}
// Move constructor: takes over r's site storage with std::move (no
// element-wise copy) and adopts r's grid pointer and checkerboard tag.
Lattice(Lattice&& r)
{
  _odata       = std::move(r._odata);
  checkerboard = r.checkerboard;
  _grid        = r._grid;
}
// Move assignment: takes over r's site storage with std::move and adopts
// r's grid pointer and checkerboard tag. Returns *this.
inline Lattice<vobj> & operator = (Lattice<vobj> && r)
{
  _odata       = std::move(r._odata);
  checkerboard = r.checkerboard;
  _grid        = r._grid;
  return *this;
}
// Copy assignment from a lattice of the same site type.
// Rebinds this lattice to r's grid and checkerboard, sizes the local storage
// to that grid's oSites(), then copies every outer site from r.
// Returns *this.
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
_grid = r._grid;
checkerboard = r.checkerboard;
// The storage is resized before the copy loop indexes it up to oSites().
_odata.resize(_grid->oSites());// essential
// parallel_for is a project macro (presumably a threaded for loop — confirm);
// each iteration writes a distinct element of _odata.
parallel_for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
return *this;
}
// Assignment from a lattice whose site type (robj) may differ from vobj.
// Unlike the same-type copy assignment, this neither rebinds _grid nor
// resizes _odata: it copies the checkerboard tag, calls conformable(*this,r)
// (presumably a grid-compatibility check — confirm), and then assigns site by
// site using whatever vobj = robj conversion exists. Returns *this.
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
// parallel_for is a project macro (presumably a threaded for loop — confirm).
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
}
virtual ~Lattice(void) = default;
void reset(GridBase* grid) {
@ -277,15 +310,6 @@ public:
return *this;
}
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard;
conformable(*this,r);
parallel_for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss];
}
return *this;
}
// *=,+=,-= operators inherit behaviour from the corresponding */+/- operation
template<class T> strong_inline Lattice<vobj> &operator *=(const T &r) {

View File

@ -179,7 +179,7 @@ namespace Grid {
return ret;
}
#define DECLARE_RELATIONAL(op,functor) \
#define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\
@ -198,11 +198,6 @@ namespace Grid {
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
} \
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \
@ -212,14 +207,21 @@ namespace Grid {
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \
return lhs op rhs._internal; \
}
} \
#define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
}
DECLARE_RELATIONAL(<,slt);
DECLARE_RELATIONAL(<=,sle);
DECLARE_RELATIONAL(>,sgt);
DECLARE_RELATIONAL(>=,sge);
DECLARE_RELATIONAL(==,seq);
DECLARE_RELATIONAL_EQ(==,seq);
DECLARE_RELATIONAL(!=,sne);
#undef DECLARE_RELATIONAL

View File

@ -52,23 +52,5 @@ namespace Grid {
}
};
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obsolete by LatticeCoordinate()
//
// Debug helper: overwrites the site data of l with a pattern derived from the
// outer-site index. The storage is addressed as a flat array of Real through
// v_ptr; for outer site i, every word j and every pair of lanes (vv, vv+1)
// receives the value i + vv*500.
// NOTE(review): the pointer type is Real* while the word count and lane count
// are computed from vRealF — presumably this assumes Real and vRealF have
// matching scalar width; confirm before reuse.
template<class vobj> void lex_sites(Lattice<vobj> &l){
Real *v_ptr = (Real *)&l._odata[0];
size_t o_len = l._grid->oSites();            // number of outer sites
size_t v_len = sizeof(vobj)/sizeof(vRealF);  // vRealF-sized words per site object
size_t vec_len = vRealF::Nsimd();            // lanes per vRealF word
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
// Lanes are filled two at a time with the same value.
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
}
#endif

View File

@ -33,7 +33,7 @@ namespace Grid {
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
auto nrm = innerProduct(arg,arg);
return std::real(nrm);
}
@ -43,31 +43,84 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
scalar_type nrm;
GridBase *grid = left._grid;
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
const int pad = 8;
ComplexD inner;
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
}
sumarray[thr]=TensorRemove(vnrm) ;
// All threads sum across SIMD; reduce serial work at end
// one write per cacheline with streaming store
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
vstream(sumarray[thr*pad],tmp);
}
vector_type vvnrm; vvnrm=zero; // sum across threads
inner=0.0;
for(int i=0;i<grid->SumArraySize();i++){
vvnrm = vvnrm+sumarray[i];
inner = inner+sumarray[i*pad];
}
nrm = Reduce(vvnrm);// sum across simd
right._grid->GlobalSum(nrm);
return nrm;
right._grid->GlobalSum(inner);
return inner;
}
/////////////////////////
// Fast axpby_norm
// z = a x + b y
// return norm z
/////////////////////////
// z = a*x + y, returning the value produced by axpby_norm_fast.
// Implemented by forwarding to axpby_norm_fast with the coefficient of y
// fixed to sobj(1.0).
template<class sobj,class vobj> strong_inline RealD
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  sobj unit(1.0);
  return axpby_norm_fast(z,a,unit,x,y);
}
// Fused z = a*x + b*y and norm accumulation in a single pass over the sites.
//
//   z     : output lattice; its checkerboard is set to x's and every outer
//           site is overwritten with a*x + b*y
//   a, b  : scalar coefficients
//   x, y  : input lattices (conformable(...) is called on z/x and x/y)
//   return: sum over sites of real(Reduce(innerProductD(tmp,tmp))) for
//           tmp = a*x + b*y, after z._grid->GlobalSum (presumably a sum
//           across ranks — confirm)
template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
// Stride, in RealD elements, between the per-thread slots of sumarray
// (presumably so that each slot sits on its own cache line — confirm).
const int pad = 8;
z.checkerboard = x.checkerboard;
conformable(z,x);
conformable(x,y);
// NOTE(review): scalar_type and vector_type are not used below.
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x._grid;
// One padded slot per work partition; only index thr*pad is written and read.
Vector<RealD> sumarray(grid->SumArraySize()*pad);
// parallel_for is a project macro (presumably a threaded for loop — confirm).
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
// nwork is declared but unused; GetWork fills mywork/myoff so that this
// partition handles the contiguous site range [myoff, myoff+mywork).
int nwork, mywork, myoff;
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
// private to thread; sub summation
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
// Compute the result once, accumulate its inner product with itself,
// then store it into z with vstream.
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vnrm = vnrm + innerProductD(tmp,tmp);
vstream(z._odata[ss],tmp);
}
// Collapse the partial sum with Reduce(TensorRemove(...)), keep the real
// part, and write it to this partition's slot.
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
}
nrm = 0.0; // sum across threads; linear in thread count but fast
for(int i=0;i<grid->SumArraySize();i++){
nrm = nrm+sumarray[i*pad];
}
z._grid->GlobalSum(nrm);
return nrm;
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)

View File

@ -158,10 +158,19 @@ namespace Grid {
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
uint64_t skip = site;
skip = skip<<40;
skip = skip<<shift;
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}

View File

@ -599,6 +599,51 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
extract1(in_vobj, out_ptrs, 0);
}
}
// Unpack a SIMD-vectorized lattice into a flat array of scalar objects.
//
//   out : resized to in._grid->lSites(); element [lex] receives the scalar
//         object of the local site whose index lex is computed by
//         Lexicographic::IndexFromCoorReversed over _ldimensions
//         (presumably lexicographic order with the dimension order
//         reversed — confirm against that helper)
//   in  : source lattice
//
// Enabled only when vobj is SIMD-vectorized and sobj is not.
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type
unvectorizeToRevLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
{
typedef typename vobj::vector_type vtype;
GridBase* in_grid = in._grid;
out.resize(in_grid->lSites());
int ndim = in_grid->Nd();
int in_nsimd = vtype::Nsimd();
// Precompute the inner coordinate of every SIMD lane once, outside the loop.
std::vector<std::vector<int> > in_icoor(in_nsimd);
for(int lane=0; lane < in_nsimd; lane++){
in_icoor[lane].resize(ndim);
in_grid->iCoorFromIindex(in_icoor[lane], lane);
}
parallel_for(int in_oidx = 0; in_oidx < in_grid->oSites(); in_oidx++){ //loop over outer index
//Assemble vector of pointers to output elements
std::vector<sobj*> out_ptrs(in_nsimd);
std::vector<int> in_ocoor(ndim);
in_grid->oCoorFromOindex(in_ocoor, in_oidx);
std::vector<int> lcoor(in_grid->Nd());
for(int lane=0; lane < in_nsimd; lane++){
// Local coordinate = outer coordinate + lane's inner coordinate scaled by
// the reduced dimension in each direction.
for(int mu=0;mu<ndim;mu++)
lcoor[mu] = in_ocoor[mu] + in_grid->_rdimensions[mu]*in_icoor[lane][mu];
int lex;
Lexicographic::IndexFromCoorReversed(lcoor, lex, in_grid->_ldimensions);
out_ptrs[lane] = &out[lex];
}
//Unpack into those ptrs
// extract1 is defined elsewhere; presumably it writes lane k of in_vobj to
// *out_ptrs[k] (meaning of the trailing 0 argument not visible here).
const vobj & in_vobj = in._odata[in_oidx];
extract1(in_vobj, out_ptrs, 0);
}
}
//Copy SIMD-vectorized lattice to array of scalar objects in lexicographic order
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value
@ -648,10 +693,59 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
}
}
// Pack a flat array of scalar objects into a SIMD-vectorized lattice.
//
//   in  : one scalar object per local site (size is asserted to equal
//         out._grid->lSites()), indexed by the value computed by
//         Lexicographic::IndexFromCoorReversed over _ldimensions
//         (presumably lexicographic order with the dimension order
//         reversed — confirm against that helper)
//   out : destination lattice; every outer site is overwritten
//
// Enabled only when vobj is SIMD-vectorized and sobj is not.
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value
&& !isSIMDvectorized<sobj>::value, void>::type
vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
{
typedef typename vobj::vector_type vtype;
GridBase* grid = out._grid;
assert(in.size()==grid->lSites());
int ndim = grid->Nd();
int nsimd = vtype::Nsimd();
// Precompute the inner coordinate of every SIMD lane once, outside the loop.
std::vector<std::vector<int> > icoor(nsimd);
for(int lane=0; lane < nsimd; lane++){
icoor[lane].resize(ndim);
grid->iCoorFromIindex(icoor[lane],lane);
}
parallel_for(uint64_t oidx = 0; oidx < grid->oSites(); oidx++){ //loop over outer index
//Assemble vector of pointers to input elements
std::vector<sobj*> ptrs(nsimd);
std::vector<int> ocoor(ndim);
grid->oCoorFromOindex(ocoor, oidx);
std::vector<int> lcoor(grid->Nd());
for(int lane=0; lane < nsimd; lane++){
// Local coordinate = outer coordinate + lane's inner coordinate scaled by
// the reduced dimension in each direction.
for(int mu=0;mu<ndim;mu++){
lcoor[mu] = ocoor[mu] + grid->_rdimensions[mu]*icoor[lane][mu];
}
int lex;
Lexicographic::IndexFromCoorReversed(lcoor, lex, grid->_ldimensions);
ptrs[lane] = &in[lex];
}
//pack from those ptrs
// merge1 is defined elsewhere; presumably it reads *ptrs[k] into lane k of
// vecobj (meaning of the trailing 0 argument not visible here).
vobj vecobj;
merge1(vecobj, ptrs, 0);
out._odata[oidx] = vecobj;
}
}
//Convert a Lattice from one precision to another
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
assert(out._grid->Nd() == in._grid->Nd());
assert(out._grid->FullDimensions() == in._grid->FullDimensions());
out.checkerboard = in.checkerboard;
GridBase *in_grid=in._grid;
GridBase *out_grid = out._grid;

View File

@ -91,7 +91,7 @@ class BinaryIO {
typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid;
int lsites = grid->lSites();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
unvectorizeToLexOrdArray(scalardata,lat);
@ -110,11 +110,11 @@ class BinaryIO {
lsites = 1;
}
#pragma omp parallel
PARALLEL_REGION
{
uint32_t nersc_csum_thr = 0;
#pragma omp for
PARALLEL_FOR_LOOP_INTERN
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
@ -124,7 +124,7 @@ class BinaryIO {
}
}
#pragma omp critical
PARALLEL_CRITICAL
{
nersc_csum += nersc_csum_thr;
}
@ -146,21 +146,23 @@ class BinaryIO {
std::vector<int> local_start =grid->LocalStarts();
std::vector<int> global_vol =grid->FullDimensions();
#pragma omp parallel
PARALLEL_REGION
{
std::vector<int> coor(nd);
uint32_t scidac_csuma_thr=0;
uint32_t scidac_csumb_thr=0;
uint32_t site_crc=0;
#pragma omp for
PARALLEL_FOR_LOOP_INTERN
for(uint64_t local_site=0;local_site<lsites;local_site++){
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
/*
* Scidac csum is rather more heavyweight
* FIXME -- 128^3 x 256 x 16 will overflow.
*/
int global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@ -181,7 +183,7 @@ class BinaryIO {
scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
}
#pragma omp critical
PARALLEL_CRITICAL
{
scidac_csuma^= scidac_csuma_thr;
scidac_csumb^= scidac_csumb_thr;
@ -261,7 +263,7 @@ class BinaryIO {
GridBase *grid,
std::vector<fobj> &iodata,
std::string file,
Integer offset,
uint64_t& offset,
const std::string &format, int control,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@ -370,7 +372,7 @@ class BinaryIO {
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in);
fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND)
{
fin.seekg(-sizeof(fobj), fin.end);
@ -429,14 +431,20 @@ class BinaryIO {
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
}
std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
MPI_Offset os;
MPI_File_get_position(fh, &os);
MPI_File_get_byte_offset(fh, os, &disp);
offset = disp;
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
@ -446,16 +454,20 @@ class BinaryIO {
} else {
std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
std::ofstream fout;
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
try {
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
if (offset) { // Must already exist and contain data
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
} else { // Allow create
fout.open(file,std::ios::binary|std::ios::out);
}
} catch (const std::fstream::failure& exc) {
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
// std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
@ -489,6 +501,7 @@ class BinaryIO {
exit(1);
#endif
}
offset = fout.tellp();
fout.close();
}
timer.Stop();
@ -523,7 +536,7 @@ class BinaryIO {
static inline void readLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
Integer offset,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@ -533,7 +546,7 @@ class BinaryIO {
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
int lsites = grid->lSites();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@ -544,7 +557,7 @@ class BinaryIO {
GridStopWatch timer;
timer.Start();
parallel_for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier();
@ -560,7 +573,7 @@ class BinaryIO {
static inline void writeLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
Integer offset,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@ -569,7 +582,7 @@ class BinaryIO {
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
int lsites = grid->lSites();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@ -580,7 +593,7 @@ class BinaryIO {
GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu);
parallel_for(int x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
grid->Barrier();
timer.Stop();
@ -597,7 +610,7 @@ class BinaryIO {
static inline void readRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
Integer offset,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
@ -610,8 +623,8 @@ class BinaryIO {
std::string format = "IEEE32BIG";
GridBase *grid = parallel._grid;
int gsites = grid->gSites();
int lsites = grid->lSites();
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp = 0;
@ -626,7 +639,7 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb);
timer.Start();
parallel_for(int lidx=0;lidx<lsites;lidx++){
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel.SetState(tmp,lidx);
@ -659,7 +672,7 @@ class BinaryIO {
static inline void writeRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
Integer offset,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
@ -670,8 +683,8 @@ class BinaryIO {
typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel._grid;
int gsites = grid->gSites();
int lsites = grid->lSites();
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp;
@ -684,7 +697,7 @@ class BinaryIO {
timer.Start();
std::vector<RNGstate> iodata(lsites);
parallel_for(int lidx=0;lidx<lsites;lidx++){
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount);
parallel.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
@ -693,7 +706,6 @@ class BinaryIO {
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
iodata.resize(1);
{
std::vector<RngStateType> tmp(RngStateCount);

View File

@ -182,6 +182,11 @@ class GridLimeReader : public BinaryIO {
{
filename= _filename;
File = fopen(filename.c_str(), "r");
if (File == nullptr)
{
std::cerr << "cannot open file '" << filename << "'" << std::endl;
abort();
}
LimeR = limeCreateReader(File);
}
/////////////////////////////////////////////
@ -248,7 +253,6 @@ class GridLimeReader : public BinaryIO {
template<class serialisable_object>
void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
{
std::string xmlstring;
// should this be a do while; can we miss a first record??
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
@ -262,7 +266,8 @@ class GridLimeReader : public BinaryIO {
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
// std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
XmlReader RD(&xmlc[0],"");
std::string xmlstring(&xmlc[0]);
XmlReader RD(xmlstring, true, "");
read(RD,object_name,object);
return;
}
@ -272,8 +277,10 @@ class GridLimeReader : public BinaryIO {
}
};
class GridLimeWriter : public BinaryIO {
class GridLimeWriter : public BinaryIO
{
public:
///////////////////////////////////////////////////
// FIXME: format for RNG? Now just binary out instead
// FIXME: collective calls or not ?
@ -282,17 +289,24 @@ class GridLimeWriter : public BinaryIO {
FILE *File;
LimeWriter *LimeW;
std::string filename;
bool boss_node;
GridLimeWriter( bool isboss = true) {
boss_node = isboss;
}
void open(const std::string &_filename) {
filename= _filename;
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
if ( boss_node ) {
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
}
}
/////////////////////////////////////////////
// Close the file
/////////////////////////////////////////////
void close(void) {
fclose(File);
if ( boss_node ) {
fclose(File);
}
// limeDestroyWriter(LimeW);
}
///////////////////////////////////////////////////////
@ -300,10 +314,12 @@ class GridLimeWriter : public BinaryIO {
///////////////////////////////////////////////////////
int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
{
LimeRecordHeader *h;
h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
assert(limeWriteRecordHeader(h, LimeW) >= 0);
limeDestroyHeader(h);
if ( boss_node ) {
LimeRecordHeader *h;
h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
assert(limeWriteRecordHeader(h, LimeW) >= 0);
limeDestroyHeader(h);
}
return LIME_SUCCESS;
}
////////////////////////////////////////////
@ -312,65 +328,99 @@ class GridLimeWriter : public BinaryIO {
template<class serialisable_object>
void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name)
{
std::string xmlstring;
{
XmlWriter WR("","");
write(WR,object_name,object);
xmlstring = WR.XmlString();
if ( boss_node ) {
std::string xmlstring;
{
XmlWriter WR("","");
write(WR,object_name,object);
xmlstring = WR.XmlString();
}
// std::cout << "WriteLimeObject" << record_name <<std::endl;
uint64_t nbytes = xmlstring.size();
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
}
// std::cout << "WriteLimeObject" << record_name <<std::endl;
uint64_t nbytes = xmlstring.size();
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
// std::cout << " File offset is now"<<ftello(File) << std::endl;
}
////////////////////////////////////////////
////////////////////////////////////////////////////
// Write a generic lattice field and csum
////////////////////////////////////////////
// This routine is Collectively called by all nodes
// in communicator used by the field._grid
////////////////////////////////////////////////////
template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{
////////////////////////////////////////////
// Create record header
////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
int err;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in
// the same file through different file handles (integer units).
//
// These are both buffered, so why I think this code is right is as follows.
//
// i) write record header to FILE *File, telegraphing the size.
// ii) ftello reads the offset from FILE *File .
// i) write record header to FILE *File, telegraphing the size; flush
// ii) ftello reads the offset from FILE *File .
// iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
// Closes iostream and flushes.
// iv) fseek on FILE * to end of this disjoint section.
// v) Continue writing scidac record.
////////////////////////////////////////////////////////////////////
uint64_t offset = ftello(File);
// std::cout << " Writing to offset "<<offset << std::endl;
GridBase *grid = field._grid;
assert(boss_node == field._grid->IsBoss() );
////////////////////////////////////////////
// Create record header
////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
int err;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
if ( boss_node ) {
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
fflush(File);
}
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////
// Check all nodes agree on file position
////////////////////////////////////////////////
uint64_t offset1;
if ( boss_node ) {
offset1 = ftello(File);
}
grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
///////////////////////////////////////////
// The above is collective. Write by other means into the binary record
///////////////////////////////////////////
std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
// fseek(File,0,SEEK_END); offset = ftello(File);std::cout << " offset now "<<offset << std::endl;
err=limeWriterCloseRecord(LimeW); assert(err>=0);
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
///////////////////////////////////////////
// Wind forward and close the record
///////////////////////////////////////////
if ( boss_node ) {
fseek(File,0,SEEK_END);
uint64_t offset2 = ftello(File); // std::cout << " now at offset "<<offset2 << std::endl;
assert( (offset2-offset1) == PayloadSize);
}
/////////////////////////////////////////////////////////////
// Check MPI-2 I/O did what we expect to file
/////////////////////////////////////////////////////////////
if ( boss_node ) {
err=limeWriterCloseRecord(LimeW); assert(err>=0);
}
////////////////////////////////////////
// Write checksum element, propagating forward from the BinaryIO
// Always pair a checksum with a binary object, and close message
@ -380,21 +430,26 @@ class GridLimeWriter : public BinaryIO {
std::stringstream streamb; streamb << std::hex << scidac_csumb;
checksum.suma= streama.str();
checksum.sumb= streamb.str();
// std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
if ( boss_node ) {
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
}
}
};
class ScidacWriter : public GridLimeWriter {
public:
template<class SerialisableUserFile>
void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss) { };
template<class SerialisableUserFile>
void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
if ( this->boss_node ) {
writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
@ -415,9 +470,12 @@ class ScidacWriter : public GridLimeWriter {
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
if ( this->boss_node ) {
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
}
// Collective call
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
}
};
@ -484,6 +542,8 @@ class ScidacReader : public GridLimeReader {
class IldgWriter : public ScidacWriter {
public:
IldgWriter(bool isboss) : ScidacWriter(isboss) {};
///////////////////////////////////
// A little helper
@ -568,7 +628,6 @@ class IldgWriter : public ScidacWriter {
writeLimeIldgLFN(header.ildg_lfn); // rec
writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
// limeDestroyWriter(LimeW);
fclose(File);
}
};
@ -644,9 +703,11 @@ class IldgReader : public GridLimeReader {
//////////////////////////////////
// ILDG format record
std::string xmlstring(&xmlc[0]);
if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"ildgFormat",ildgFormat_);
if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
@ -661,13 +722,13 @@ class IldgReader : public GridLimeReader {
}
if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
FieldMetaData_.ildg_lfn = std::string(&xmlc[0]);
FieldMetaData_.ildg_lfn = xmlstring;
found_ildgLFN = 1;
}
if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"FieldMetaData",FieldMetaData_);
format = FieldMetaData_.floating_point;
@ -681,18 +742,17 @@ class IldgReader : public GridLimeReader {
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) {
std::string xmls(&xmlc[0]);
// is it a USQCD info field
if ( xmls.find(std::string("usqcdInfo")) != std::string::npos ) {
if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) {
// std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"usqcdInfo",usqcdInfo_);
found_usqcdInfo = 1;
}
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) {
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"scidacChecksum",scidacChecksum_);
found_scidacChecksum = 1;
}

View File

@ -136,8 +136,9 @@ struct scidacRecord : Serializable {
int, typesize,
int, datacount);
scidacRecord() { version =1.0; }
// Default constructor: version starts at 1.0 and every other member named in
// the initialiser list starts at 0, so none of the listed fields is left
// uninitialised.
scidacRecord()
: version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0)
{}
};
////////////////////////

View File

@ -81,18 +81,16 @@ namespace Grid {
std::string, creation_date,
std::string, archive_date,
std::string, floating_point);
FieldMetaData(void) {
nd=4;
dimension.resize(4);
boundary.resize(4);
scidac_checksuma=0;
scidac_checksumb=0;
checksum=0;
}
// WARNING: non-initialised values might lead to twisted parallel IO
// issues, std::string are fine because they initialise to size 0
// as per C++ standard.
FieldMetaData(void)
: nd(4), dimension(4,0), boundary(4, ""), data_start(0),
link_trace(0.), plaquette(0.), checksum(0),
scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
{}
};
namespace QCD {
using namespace Grid;

View File

@ -57,7 +57,7 @@ namespace Grid {
// for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{
int offset=0;
uint64_t offset=0;
std::map<std::string,std::string> header;
std::string line;
@ -139,7 +139,7 @@ namespace Grid {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu._grid;
int offset = readHeader(file,Umu._grid,header);
uint64_t offset = readHeader(file,Umu._grid,header);
FieldMetaData clone(header);
@ -236,21 +236,25 @@ namespace Grid {
GaugeStatistics(Umu,header);
MachineCharacteristics(header);
int offset;
truncate(file);
uint64_t offset;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
offset = writeHeader(header,file);
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
writeHeader(header,file);
if ( grid->IsBoss() ) {
writeHeader(header,file);
}
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
<<std::hex<<header.checksum
@ -278,7 +282,7 @@ namespace Grid {
header.plaquette=0.0;
MachineCharacteristics(header);
int offset;
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
@ -293,12 +297,18 @@ namespace Grid {
header.data_type = std::string("SITMO");
#endif
truncate(file);
offset = writeHeader(header,file);
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
offset = writeHeader(header,file);
if ( grid->IsBoss() ) {
offset = writeHeader(header,file);
}
std::cout<<GridLogMessage
<<"Written NERSC RNG STATE "<<file<< " checksum "
@ -313,7 +323,7 @@ namespace Grid {
GridBase *grid = parallel._grid;
int offset = readHeader(file,grid,header);
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);

View File

@ -49,7 +49,8 @@ inline double usecond(void) {
typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint;
typedef std::chrono::milliseconds GridTime;
typedef std::chrono::milliseconds GridMillisecs;
typedef std::chrono::microseconds GridTime;
typedef std::chrono::microseconds GridUsecs;
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli
stream << time.count()<<" ms";
return stream;
}
// Stream inserter for microsecond durations: writes the raw tick count
// followed by the literal suffix " usec" and hands the stream back for chaining.
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
{
  const auto ticks = time.count();
  return stream << ticks << " usec";
}
class GridStopWatch {
private:

View File

@ -1,44 +0,0 @@
pugixml [![Build Status](https://travis-ci.org/zeux/pugixml.svg?branch=master)](https://travis-ci.org/zeux/pugixml) [![Build status](https://ci.appveyor.com/api/projects/status/9hdks1doqvq8pwe7/branch/master?svg=true)](https://ci.appveyor.com/project/zeux/pugixml)
=======
pugixml is a C++ XML processing library, which consists of a DOM-like interface with rich traversal/modification
capabilities, an extremely fast XML parser which constructs the DOM tree from an XML file/buffer, and an XPath 1.0
implementation for complex data-driven tree queries. Full Unicode support is also available, with Unicode interface
variants and conversions between different Unicode encodings (which happen automatically during parsing/saving).
pugixml is used by a lot of projects, both open-source and proprietary, for performance and easy-to-use interface.
## Documentation
Documentation for the current release of pugixml is available on-line as two separate documents:
* [Quick-start guide](http://pugixml.org/docs/quickstart.html), that aims to provide enough information to start using the library;
* [Complete reference manual](http://pugixml.org/docs/manual.html), that describes all features of the library in detail.
You're advised to start with the quick-start guide; however, many important library features are either not described in it at all or only mentioned briefly; if you require more information you should read the complete manual.
## License
This library is available to anybody free of charge, under the terms of MIT License:
Copyright (c) 2006-2015 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

View File

@ -1,7 +1,7 @@
/**
* pugixml parser - version 1.6
* pugixml parser - version 1.9
* --------------------------------------------------------
* Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at http://pugixml.org/
*
* This library is distributed under the MIT License. See notice at the end
@ -17,6 +17,9 @@
// Uncomment this to enable wchar_t mode
// #define PUGIXML_WCHAR_MODE
// Uncomment this to enable compact mode
// #define PUGIXML_COMPACT
// Uncomment this to disable XPath
// #define PUGIXML_NO_XPATH
@ -46,7 +49,7 @@
#endif
/**
* Copyright (c) 2006-2015 Arseny Kapoulkine
* Copyright (c) 2006-2018 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -59,7 +62,7 @@
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,7 @@
/**
* pugixml parser - version 1.6
* pugixml parser - version 1.9
* --------------------------------------------------------
* Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
* Report bugs and download new versions at http://pugixml.org/
*
* This library is distributed under the MIT License. See notice at the end
@ -13,7 +13,7 @@
#ifndef PUGIXML_VERSION
// Define version macro; evaluates to major * 100 + minor so that it's safe to use in less-than comparisons
# define PUGIXML_VERSION 160
# define PUGIXML_VERSION 190
#endif
// Include user configuration file (this can define various configuration macros)
@ -72,6 +72,44 @@
# endif
#endif
// If the platform is known to have move semantics support, compile move ctor/operator implementation
#ifndef PUGIXML_HAS_MOVE
# if __cplusplus >= 201103
# define PUGIXML_HAS_MOVE
# elif defined(_MSC_VER) && _MSC_VER >= 1600
# define PUGIXML_HAS_MOVE
# endif
#endif
// If C++ is 2011 or higher, add 'noexcept' specifiers
#ifndef PUGIXML_NOEXCEPT
# if __cplusplus >= 201103
# define PUGIXML_NOEXCEPT noexcept
# elif defined(_MSC_VER) && _MSC_VER >= 1900
# define PUGIXML_NOEXCEPT noexcept
# else
# define PUGIXML_NOEXCEPT
# endif
#endif
// Some functions can not be noexcept in compact mode
#ifdef PUGIXML_COMPACT
# define PUGIXML_NOEXCEPT_IF_NOT_COMPACT
#else
# define PUGIXML_NOEXCEPT_IF_NOT_COMPACT PUGIXML_NOEXCEPT
#endif
// If C++ is 2011 or higher, add 'override' qualifiers
#ifndef PUGIXML_OVERRIDE
# if __cplusplus >= 201103
# define PUGIXML_OVERRIDE override
# elif defined(_MSC_VER) && _MSC_VER >= 1700
# define PUGIXML_OVERRIDE override
# else
# define PUGIXML_OVERRIDE
# endif
#endif
// Character interface macros
#ifdef PUGIXML_WCHAR_MODE
# define PUGIXML_TEXT(t) L ## t
@ -133,13 +171,13 @@ namespace pugi
// This flag determines if EOL characters are normalized (converted to #xA) during parsing. This flag is on by default.
const unsigned int parse_eol = 0x0020;
// This flag determines if attribute values are normalized using CDATA normalization rules during parsing. This flag is on by default.
const unsigned int parse_wconv_attribute = 0x0040;
// This flag determines if attribute values are normalized using NMTOKENS normalization rules during parsing. This flag is off by default.
const unsigned int parse_wnorm_attribute = 0x0080;
// This flag determines if document declaration (node_declaration) is added to the DOM tree. This flag is off by default.
const unsigned int parse_declaration = 0x0100;
@ -158,6 +196,11 @@ namespace pugi
// is a valid document. This flag is off by default.
const unsigned int parse_fragment = 0x1000;
// This flag determines if plain character data is to be stored in the parent element's value. This significantly changes the structure of
// the document; this flag is only recommended for parsing documents with many PCDATA nodes in memory-constrained environments.
// This flag is off by default.
const unsigned int parse_embed_pcdata = 0x2000;
// The default parsing mode.
// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
@ -184,16 +227,16 @@ namespace pugi
};
// Formatting flags
// Indent the nodes that are written to output stream with as many indentation strings as deep the node is in DOM tree. This flag is on by default.
const unsigned int format_indent = 0x01;
// Write encoding-specific BOM to the output stream. This flag is off by default.
const unsigned int format_write_bom = 0x02;
// Use raw output mode (no indentation and no line breaks are written). This flag is off by default.
const unsigned int format_raw = 0x04;
// Omit default XML declaration even if there is no declaration in the document. This flag is off by default.
const unsigned int format_no_declaration = 0x08;
@ -206,6 +249,9 @@ namespace pugi
// Write every attribute on a new line with appropriate indentation. This flag is off by default.
const unsigned int format_indent_attributes = 0x40;
// Don't output empty element tags, instead writing an explicit start and end tag even if there are no children. This flag is off by default.
const unsigned int format_no_empty_element_tags = 0x80;
// The default set of formatting flags.
// Nodes are indented depending on their depth in DOM tree, a default declaration is output if document has none.
const unsigned int format_default = format_indent;
@ -225,7 +271,7 @@ namespace pugi
class xml_node;
class xml_text;
#ifndef PUGIXML_NO_XPATH
class xpath_node;
class xpath_node_set;
@ -268,7 +314,7 @@ namespace pugi
// Construct writer from a FILE* object; void* is used to avoid header dependencies on stdio
xml_writer_file(void* file);
virtual void write(const void* data, size_t size);
virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;
private:
void* file;
@ -283,7 +329,7 @@ namespace pugi
xml_writer_stream(std::basic_ostream<char, std::char_traits<char> >& stream);
xml_writer_stream(std::basic_ostream<wchar_t, std::char_traits<wchar_t> >& stream);
virtual void write(const void* data, size_t size);
virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE;
private:
std::basic_ostream<char, std::char_traits<char> >* narrow_stream;
@ -299,13 +345,13 @@ namespace pugi
private:
xml_attribute_struct* _attr;
typedef void (*unspecified_bool_type)(xml_attribute***);
public:
// Default constructor. Constructs an empty attribute.
xml_attribute();
// Constructs attribute from internal pointer
explicit xml_attribute(xml_attribute_struct* attr);
@ -354,6 +400,8 @@ namespace pugi
// Set attribute value with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
bool set_value(int rhs);
bool set_value(unsigned int rhs);
bool set_value(long rhs);
bool set_value(unsigned long rhs);
bool set_value(double rhs);
bool set_value(float rhs);
bool set_value(bool rhs);
@ -367,6 +415,8 @@ namespace pugi
xml_attribute& operator=(const char_t* rhs);
xml_attribute& operator=(int rhs);
xml_attribute& operator=(unsigned int rhs);
xml_attribute& operator=(long rhs);
xml_attribute& operator=(unsigned long rhs);
xml_attribute& operator=(double rhs);
xml_attribute& operator=(float rhs);
xml_attribute& operator=(bool rhs);
@ -417,7 +467,7 @@ namespace pugi
// Borland C++ workaround
bool operator!() const;
// Comparison operators (compares wrapped node pointers)
bool operator==(const xml_node& r) const;
bool operator!=(const xml_node& r) const;
@ -438,7 +488,7 @@ namespace pugi
// Get node value, or "" if node is empty or it has no value
// Note: For <node>text</node> node.value() does not return "text"! Use child_value() or text() methods to access text inside nodes.
const char_t* value() const;
// Get attribute list
xml_attribute first_attribute() const;
xml_attribute last_attribute() const;
@ -450,7 +500,7 @@ namespace pugi
// Get next/previous sibling in the children list of the parent node
xml_node next_sibling() const;
xml_node previous_sibling() const;
// Get parent node
xml_node parent() const;
@ -478,7 +528,7 @@ namespace pugi
// Set node name/value (returns false if node is empty, there is not enough memory, or node can not have name/value)
bool set_name(const char_t* rhs);
bool set_value(const char_t* rhs);
// Add attribute with specified name. Returns added attribute, or empty attribute on errors.
xml_attribute append_attribute(const char_t* name);
xml_attribute prepend_attribute(const char_t* name);
@ -532,11 +582,11 @@ namespace pugi
// Walks the attribute list from first_attribute() via next_attribute() and
// returns the first attribute for which pred(attribute) is true.
// Returns a default-constructed xml_attribute when _root is null or nothing matches.
template <typename Predicate> xml_attribute find_attribute(Predicate pred) const
{
	if (_root)
	{
		xml_attribute cur = first_attribute();
		while (cur)
		{
			if (pred(cur)) return cur;
			cur = cur.next_attribute();
		}
	}
	return xml_attribute();
}
@ -544,11 +594,11 @@ namespace pugi
// Walks the child list from first_child() via next_sibling() and returns the
// first node for which pred(node) is true.
// Returns a default-constructed xml_node when _root is null or nothing matches.
template <typename Predicate> xml_node find_child(Predicate pred) const
{
	if (_root)
	{
		xml_node cur = first_child();
		while (cur)
		{
			if (pred(cur)) return cur;
			cur = cur.next_sibling();
		}
	}
	return xml_node();
}
@ -558,7 +608,7 @@ namespace pugi
if (!_root) return xml_node();
xml_node cur = first_child();
while (cur._root && cur._root != _root)
{
if (pred(cur)) return cur;
@ -590,7 +640,7 @@ namespace pugi
// Recursively traverse subtree with xml_tree_walker
bool traverse(xml_tree_walker& walker);
#ifndef PUGIXML_NO_XPATH
// Select single node by evaluating XPath query. Returns first node from the resulting node set.
xpath_node select_node(const char_t* query, xpath_variable_set* variables = 0) const;
@ -601,11 +651,11 @@ namespace pugi
xpath_node_set select_nodes(const xpath_query& query) const;
// (deprecated: use select_node instead) Select single node by evaluating XPath query.
xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
xpath_node select_single_node(const xpath_query& query) const;
PUGIXML_DEPRECATED xpath_node select_single_node(const char_t* query, xpath_variable_set* variables = 0) const;
PUGIXML_DEPRECATED xpath_node select_single_node(const xpath_query& query) const;
#endif
// Print subtree using a writer object
void print(xml_writer& writer, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const;
@ -701,6 +751,8 @@ namespace pugi
// Set text with type conversion (numbers are converted to strings, boolean is converted to "true"/"false")
bool set(int rhs);
bool set(unsigned int rhs);
bool set(long rhs);
bool set(unsigned long rhs);
bool set(double rhs);
bool set(float rhs);
bool set(bool rhs);
@ -714,6 +766,8 @@ namespace pugi
xml_text& operator=(const char_t* rhs);
xml_text& operator=(int rhs);
xml_text& operator=(unsigned int rhs);
xml_text& operator=(long rhs);
xml_text& operator=(unsigned long rhs);
xml_text& operator=(double rhs);
xml_text& operator=(float rhs);
xml_text& operator=(bool rhs);
@ -867,11 +921,11 @@ namespace pugi
private:
int _depth;
protected:
// Get current traversal depth
int depth() const;
public:
xml_tree_walker();
virtual ~xml_tree_walker();
@ -942,13 +996,14 @@ namespace pugi
char_t* _buffer;
char _memory[192];
// Non-copyable semantics
xml_document(const xml_document&);
const xml_document& operator=(const xml_document&);
xml_document& operator=(const xml_document&);
void create();
void destroy();
void _create();
void _destroy();
void _move(xml_document& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
public:
// Default constructor, makes empty document
@ -957,6 +1012,12 @@ namespace pugi
// Destructor, invalidates all node/attribute handles to this document
~xml_document();
#ifdef PUGIXML_HAS_MOVE
// Move semantics support
xml_document(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
xml_document& operator=(xml_document&& rhs) PUGIXML_NOEXCEPT_IF_NOT_COMPACT;
#endif
// Removes all nodes, leaving the empty document
void reset();
@ -970,7 +1031,7 @@ namespace pugi
#endif
// (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied.
xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
PUGIXML_DEPRECATED xml_parse_result load(const char_t* contents, unsigned int options = parse_default);
// Load document from zero-terminated string. No encoding conversions are applied.
xml_parse_result load_string(const char_t* contents, unsigned int options = parse_default);
@ -1051,7 +1112,7 @@ namespace pugi
// Non-copyable semantics
xpath_variable(const xpath_variable&);
xpath_variable& operator=(const xpath_variable&);
public:
// Get variable name
const char_t* name() const;
@ -1095,10 +1156,10 @@ namespace pugi
xpath_variable_set(const xpath_variable_set& rhs);
xpath_variable_set& operator=(const xpath_variable_set& rhs);
#if __cplusplus >= 201103
#ifdef PUGIXML_HAS_MOVE
// Move semantics support
xpath_variable_set(xpath_variable_set&& rhs);
xpath_variable_set& operator=(xpath_variable_set&& rhs);
xpath_variable_set(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
xpath_variable_set& operator=(xpath_variable_set&& rhs) PUGIXML_NOEXCEPT;
#endif
// Add a new variable or get the existing one, if the types match
@ -1139,29 +1200,29 @@ namespace pugi
// Destructor
~xpath_query();
#if __cplusplus >= 201103
#ifdef PUGIXML_HAS_MOVE
// Move semantics support
xpath_query(xpath_query&& rhs);
xpath_query& operator=(xpath_query&& rhs);
xpath_query(xpath_query&& rhs) PUGIXML_NOEXCEPT;
xpath_query& operator=(xpath_query&& rhs) PUGIXML_NOEXCEPT;
#endif
// Get query expression return type
xpath_value_type return_type() const;
// Evaluate expression as boolean value in the specified context; performs type conversion if necessary.
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
bool evaluate_boolean(const xpath_node& n) const;
// Evaluate expression as double value in the specified context; performs type conversion if necessary.
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
double evaluate_number(const xpath_node& n) const;
#ifndef PUGIXML_NO_STL
// Evaluate expression as string value in the specified context; performs type conversion if necessary.
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
string_t evaluate_string(const xpath_node& n) const;
#endif
// Evaluate expression as string value in the specified context; performs type conversion if necessary.
// At most capacity characters are written to the destination buffer, full result size is returned (includes terminating zero).
// If PUGIXML_NO_EXCEPTIONS is not defined, throws std::bad_alloc on out of memory errors.
@ -1188,7 +1249,7 @@ namespace pugi
// Borland C++ workaround
bool operator!() const;
};
#ifndef PUGIXML_NO_EXCEPTIONS
// XPath exception class
class PUGIXML_CLASS xpath_exception: public std::exception
@ -1201,26 +1262,26 @@ namespace pugi
explicit xpath_exception(const xpath_parse_result& result);
// Get error message
virtual const char* what() const throw();
virtual const char* what() const throw() PUGIXML_OVERRIDE;
// Get parse result
const xpath_parse_result& result() const;
};
#endif
// XPath node class (either xml_node or xml_attribute)
class PUGIXML_CLASS xpath_node
{
private:
xml_node _node;
xml_attribute _attribute;
typedef void (*unspecified_bool_type)(xpath_node***);
public:
// Default constructor; constructs empty XPath node
xpath_node();
// Construct XPath node from XML node/attribute
xpath_node(const xml_node& node);
xpath_node(const xml_attribute& attribute, const xml_node& parent);
@ -1228,13 +1289,13 @@ namespace pugi
// Get node/attribute, if any
xml_node node() const;
xml_attribute attribute() const;
// Get parent of contained node/attribute
xml_node parent() const;
// Safe bool conversion operator
operator unspecified_bool_type() const;
// Borland C++ workaround
bool operator!() const;
@ -1260,13 +1321,13 @@ namespace pugi
type_sorted, // Sorted by document order (ascending)
type_sorted_reverse // Sorted by document order (descending)
};
// Constant iterator type
typedef const xpath_node* const_iterator;
// We define non-constant iterator to be the same as constant iterator so that various generic algorithms (i.e. boost foreach) work
typedef const xpath_node* iterator;
// Default constructor. Constructs empty set.
xpath_node_set();
@ -1275,49 +1336,49 @@ namespace pugi
// Destructor
~xpath_node_set();
// Copy constructor/assignment operator
xpath_node_set(const xpath_node_set& ns);
xpath_node_set& operator=(const xpath_node_set& ns);
#if __cplusplus >= 201103
#ifdef PUGIXML_HAS_MOVE
// Move semantics support
xpath_node_set(xpath_node_set&& rhs);
xpath_node_set& operator=(xpath_node_set&& rhs);
xpath_node_set(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
xpath_node_set& operator=(xpath_node_set&& rhs) PUGIXML_NOEXCEPT;
#endif
// Get collection type
type_t type() const;
// Get collection size
size_t size() const;
// Indexing operator
const xpath_node& operator[](size_t index) const;
// Collection iterators
const_iterator begin() const;
const_iterator end() const;
// Sort the collection in ascending/descending order by document order
void sort(bool reverse = false);
// Get first node in the collection by document order
xpath_node first() const;
// Check if collection is empty
bool empty() const;
private:
type_t _type;
xpath_node _storage;
xpath_node* _begin;
xpath_node* _end;
void _assign(const_iterator begin, const_iterator end, type_t type);
void _move(xpath_node_set& rhs);
void _move(xpath_node_set& rhs) PUGIXML_NOEXCEPT;
};
#endif
@ -1325,7 +1386,7 @@ namespace pugi
// Convert wide string to UTF8
std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const wchar_t* str);
std::basic_string<char, std::char_traits<char>, std::allocator<char> > PUGIXML_FUNCTION as_utf8(const std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >& str);
// Convert UTF8 to wide string
std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const char* str);
std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> > PUGIXML_FUNCTION as_wide(const std::basic_string<char, std::char_traits<char>, std::allocator<char> >& str);
@ -1333,13 +1394,13 @@ namespace pugi
// Memory allocation function interface; returns pointer to allocated memory or NULL on failure
typedef void* (*allocation_function)(size_t size);
// Memory deallocation function interface
typedef void (*deallocation_function)(void* ptr);
// Override default memory management functions. All subsequent allocations/deallocations will be performed via supplied functions.
void PUGIXML_FUNCTION set_memory_management_functions(allocation_function allocate, deallocation_function deallocate);
// Get current memory management functions
allocation_function PUGIXML_FUNCTION get_memory_allocation_function();
deallocation_function PUGIXML_FUNCTION get_memory_deallocation_function();
@ -1375,7 +1436,7 @@ namespace std
#endif
/**
* Copyright (c) 2006-2015 Arseny Kapoulkine
* Copyright (c) 2006-2018 Arseny Kapoulkine
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
@ -1388,7 +1449,7 @@ namespace std
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND

View File

@ -1,6 +1,6 @@
pugixml 1.6 - an XML processing library
pugixml 1.9 - an XML processing library
Copyright (C) 2006-2015, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
Copyright (C) 2006-2018, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
Report bugs and download new versions at http://pugixml.org/
This is the distribution of pugixml, which is a C++ XML processing library,
@ -28,7 +28,7 @@ The distribution contains the following folders:
This library is distributed under the MIT License:
Copyright (c) 2006-2015 Arseny Kapoulkine
Copyright (c) 2006-2018 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation

View File

@ -52,6 +52,35 @@ namespace QCD {
{
}
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
// Reduce a five-dimensional solution to the four-dimensional field living on
// the gauge grid.  Slice 0 of a scratch field is built from slice 0 of the
// solution (via axpby_ssp_pminus) plus slice Ls-1 of the solution (via
// axpby_ssp_pplus), and that slice is then extracted along dimension 0.
//   solution5d : input, must live on FermionGrid()
//   exported4d : output, must live on GaugeGrid()
template<class Impl>
void CayleyFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
{
  const int sLast = this->Ls - 1;            // index of the last fifth-dimension slice

  FermionField wall(this->FermionGrid());    // scratch 5d field; only slice 0 is consumed below
  wall = solution5d;

  conformable(solution5d._grid, this->FermionGrid());
  conformable(exported4d._grid, this->GaugeGrid());

  // wall[0]  = 0*solution5d[0] + 1*pminus(solution5d[0])
  axpby_ssp_pminus(wall, 0., solution5d, 1., solution5d, 0, 0);
  // wall[0] += 1*pplus(solution5d[Ls-1])
  axpby_ssp_pplus (wall, 1., wall, 1., solution5d, 0, sLast);

  ExtractSlice(exported4d, wall, 0, 0);
}
// Promote a four-dimensional source to a five-dimensional source.
// The 4d field is inserted on slices 0 and Ls-1 of a zeroed scratch field,
// slice 0 is replaced by its pplus projection and slice Ls-1 by its pminus
// projection (both in place), and Dminus is applied to produce the result.
//   input4d    : input, must live on GaugeGrid()
//   imported5d : output, must live on FermionGrid()
template<class Impl>
void CayleyFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
{
  const int sLast = this->Ls - 1;             // index of the last fifth-dimension slice

  FermionField walls(this->FermionGrid());    // scratch 5d field holding the two boundary slices

  conformable(imported5d._grid, this->FermionGrid());
  conformable(input4d._grid,    this->GaugeGrid());

  walls = zero;
  InsertSlice(input4d, walls, 0,     0);
  InsertSlice(input4d, walls, sLast, 0);

  // In-place projections: coefficient 0 discards the old slice contents.
  axpby_ssp_pplus (walls, 0., walls, 1., walls, 0,     0);
  axpby_ssp_pminus(walls, 0., walls, 1., walls, sLast, sLast);

  Dminus(walls, imported5d);
}
template<class Impl>
void CayleyFermion5D<Impl>::Dminus(const FermionField &psi, FermionField &chi)
{
@ -73,7 +102,7 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
this->DW(psi,tmp_f,DaggerYes);
for(int s=0;s<Ls;s++){
axpby_ssp(chi,Coeff_t(1.0),psi,-cs[s],tmp_f,s,s);// chi = (1-c[s] D_W) psi
axpby_ssp(chi,Coeff_t(1.0),psi,conjugate(-cs[s]),tmp_f,s,s);// chi = (1-c[s] D_W) psi
}
}

View File

@ -83,8 +83,13 @@ namespace Grid {
virtual void M5D (const FermionField &psi, FermionField &chi);
virtual void M5Ddag(const FermionField &psi, FermionField &chi);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
virtual void Dminus(const FermionField &psi, FermionField &chi);
virtual void DminusDag(const FermionField &psi, FermionField &chi);
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);
/////////////////////////////////////////////////////
// Instantiate different versions depending on Impl

View File

@ -469,7 +469,7 @@ void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionFie
}
a0 = a0+incr;
a1 = a1+incr;
a2 = a2+sizeof(Simd::scalar_type);
a2 = a2+sizeof(typename Simd::scalar_type);
}}
{
int lexa = s1+LLs*site;
@ -701,7 +701,7 @@ void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionFi
}
a0 = a0+incr;
a1 = a1+incr;
a2 = a2+sizeof(Simd::scalar_type);
a2 = a2+sizeof(typename Simd::scalar_type);
}}
{
int lexa = s1+LLs*site;

View File

@ -295,6 +295,27 @@ namespace Grid {
assert((Ls&0x1)==1); // Odd Ls required
}
// Extract the four-dimensional field from slice s = Ls-1 of a
// five-dimensional solution.
//   solution5d : input, must live on FermionGrid()
//   exported4d : output, must live on GaugeGrid()
template<class Impl>
void ContinuedFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
{
  int Ls = this->Ls;
  conformable(solution5d._grid,this->FermionGrid());
  conformable(exported4d._grid,this->GaugeGrid());
  // The last argument of ExtractSlice is the index of the dimension being
  // sliced, not a coordinate: the fifth dimension is dimension 0 of the 5d
  // grid (the CayleyFermion5D import/export routines pass 0 there while
  // varying the slice argument).  Passing Ls-1 named a non-existent
  // dimension whenever Ls-1 exceeded the grid rank.
  const int orthogDim = 0;
  ExtractSlice(exported4d, solution5d, Ls-1, orthogDim);
}
// Promote a four-dimensional source to a five-dimensional source.
// The 4d field is placed on slice s = Ls-1 of a zeroed scratch field,
// multiplied by Gamma5, and passed through Dminus.
//   input4d    : input, must live on GaugeGrid()
//   imported5d : output, must live on FermionGrid()
template<class Impl>
void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
{
  int Ls = this->Ls;
  conformable(imported5d._grid,this->FermionGrid());
  conformable(input4d._grid   ,this->GaugeGrid());
  FermionField tmp(this->FermionGrid());
  tmp=zero;
  // The last argument of InsertSlice is the index of the dimension being
  // sliced, not a coordinate: the fifth dimension is dimension 0 of the 5d
  // grid (the CayleyFermion5D import routine passes 0 there for both of its
  // slices).  Passing Ls-1 named a non-existent dimension for typical Ls.
  const int orthogDim = 0;
  InsertSlice(input4d, tmp, Ls-1, orthogDim);
  tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
  this->Dminus(tmp,imported5d);
}
FermOpTemplateInstantiate(ContinuedFractionFermion5D);
}

View File

@ -65,6 +65,14 @@ namespace Grid {
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
// virtual void Dminus(const FermionField &psi, FermionField &chi); // Inherit trivial case
// virtual void DminusDag(const FermionField &psi, FermionField &chi); // Inherit trivial case
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d);
// Constructors
ContinuedFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -475,7 +475,7 @@ namespace QCD {
}
a0 = a0 + incr;
a1 = a1 + incr;
a2 = a2 + sizeof(Simd::scalar_type);
a2 = a2 + sizeof(typename Simd::scalar_type);
}
}

View File

@ -63,9 +63,12 @@ namespace Grid {
virtual RealD M (const FermionField &in, FermionField &out)=0;
virtual RealD Mdag (const FermionField &in, FermionField &out)=0;
// half checkerboard operations
// Query the even even properties to make algorithmic decisions
// Default: returns 1; operators whose even-even term depends on the gauge
// field override this to return zero.
virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
// Default: returns 0.
virtual int isTrivialEE(void) { return 0; };
// Default: returns 0.0.
virtual RealD Mass(void) {return 0.0;};
// half checkerboard operations
virtual void Meooe (const FermionField &in, FermionField &out)=0;
virtual void MeooeDag (const FermionField &in, FermionField &out)=0;
virtual void Mooee (const FermionField &in, FermionField &out)=0;
@ -128,6 +131,19 @@ namespace Grid {
std::vector<Real> mom,
unsigned int tmin,
unsigned int tmax)=0;
///////////////////////////////////////////////
// Physical field import/export
///////////////////////////////////////////////
// Default Dminus: copies psi into chi unchanged.
virtual void Dminus(const FermionField &psi, FermionField &chi) { chi=psi; }
// Default DminusDag: copies psi into chi unchanged.
virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; }
// Default source import: the imported field is a plain copy of the input.
virtual void ImportPhysicalFermionSource(const FermionField &input,
                                         FermionField &imported) { imported = input; }
// Default solution export: the exported field is a plain copy of the solution.
virtual void ExportPhysicalFermionSolution(const FermionField &solution,
                                           FermionField &exported) { exported = solution; }
};
}

View File

@ -164,6 +164,7 @@ namespace QCD {
public:
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=false;
static const int Nhcs = Options::Nhcs;
@ -298,27 +299,28 @@ namespace QCD {
////////////////////////////////////////////////////////////////////////////////////
// Single flavour four spinors with colour index, 5d redblack
////////////////////////////////////////////////////////////////////////////////////
template<class S,int Nrepresentation=Nc, class Options=CoeffReal>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
template<class S,class Representation = FundamentalRepresentation, class Options=CoeffReal>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Representation::Dimension> > {
public:
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
static const int Dimension = Nrepresentation;
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=true;
static const int Nhcs = Options::Nhcs;
typedef typename Options::_Coeff_t Coeff_t;
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >;
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Nrepresentation>, Ns> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >;
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhcs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>;
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Ns> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhcs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nd>;
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Dimension> > >;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplPropagator<Simd> SitePropagator;
@ -354,8 +356,8 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
SiteGaugeLink UU;
for (int i = 0; i < Nrepresentation; i++) {
for (int j = 0; j < Nrepresentation; j++) {
for (int i = 0; i < Dimension; i++) {
for (int j = 0; j < Dimension; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
}
}
@ -367,8 +369,8 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
const SitePropagator &chi,
int mu) {
SiteGaugeLink UU;
for (int i = 0; i < Nrepresentation; i++) {
for (int j = 0; j < Nrepresentation; j++) {
for (int i = 0; i < Dimension; i++) {
for (int j = 0; j < Dimension; j++) {
vsplat(UU()()(i, j), U(mu)()(i, j));
}
}
@ -472,25 +474,26 @@ class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepres
////////////////////////////////////////////////////////////////////////////////////////
// Flavour doubled spinors; is Gparity the only? what about C*?
////////////////////////////////////////////////////////////////////////////////////////
template <class S, int Nrepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
public:
static const int Dimension = Nrepresentation;
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const int Nhcs = Options::Nhcs;
static const bool LsVectorised=false;
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Dimension> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
typedef typename Options::_Coeff_t Coeff_t;
typedef typename Options::template PrecisionMapper<Simd>::LowerPrecVector SimdL;
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>;
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Nrepresentation>, Ns>, Ngp>;
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>;
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Nrepresentation>, Nhcs>, Ngp>;
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>;
template <typename vtype> using iImplSpinor = iVector<iVector<iVector<vtype, Dimension>, Ns>, Ngp>;
template <typename vtype> using iImplPropagator = iVector<iMatrix<iMatrix<vtype, Dimension>, Ns>, Ngp>;
template <typename vtype> using iImplHalfSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhs>, Ngp>;
template <typename vtype> using iImplHalfCommSpinor = iVector<iVector<iVector<vtype, Dimension>, Nhcs>, Ngp>;
template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>, Ngp>;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplPropagator<Simd> SitePropagator;
@ -711,6 +714,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
typedef RealD _Coeff_t ;
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=false;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@ -760,7 +764,12 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
inline void loadLinkElement(Simd &reg, ref &memory) {
reg = memory;
}
// Write the link field U into Lorentz component mu of the doubled gauge
// field U_ds.
inline void InsertGaugeField(DoubledGaugeField &U_ds, const GaugeLinkField &U, int mu)
{
  PokeIndex<LorentzIndex>(U_ds, U, mu);
}
inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &UUUds, // for Naik term
DoubledGaugeField &Uds,
@ -799,8 +808,10 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
U = U *phases;
Udag = Udag *phases;
PokeIndex<LorentzIndex>(Uds, U, mu);
PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
InsertGaugeField(Uds,U,mu);
InsertGaugeField(Uds,Udag,mu+4);
// PokeIndex<LorentzIndex>(Uds, U, mu);
// PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
// 3 hop based on thin links. Crazy huh ?
U = PeekIndex<LorentzIndex>(Uthin, mu);
@ -812,8 +823,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
UUU = UUU *phases;
UUUdag = UUUdag *phases;
PokeIndex<LorentzIndex>(UUUds, UUU, mu);
PokeIndex<LorentzIndex>(UUUds, UUUdag, mu+4);
InsertGaugeField(UUUds,UUU,mu);
InsertGaugeField(UUUds,UUUdag,mu+4);
}
}
@ -839,6 +850,7 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
public:
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=true;
typedef RealD Coeff_t ;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
@ -905,6 +917,23 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
mac(&phi(), &UU(), &chi());
}
// Copy the link field U into component mu of the doubled gauge field U_ds,
// one local site at a time: peek the scalar doubled-field site, overwrite
// component mu with the scalar link, and poke the site back.
//
// The write-back (pokeLocalSite) was missing, so the modified local copy was
// discarded and U_ds was never updated.  The inline loops this helper
// replaced ended with exactly this pokeLocalSite call.
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
{
  GridBase *GaugeGrid = U_ds._grid;
  // Serial loop, as in the code this helper replaced.
  // NOTE(review): several local sites may share storage within one vector
  // word of U_ds; only switch to parallel_for after confirming pokeLocalSite
  // is safe for concurrent writers to the same outer site.
  for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
    SiteScalarGaugeLink   ScalarU;
    SiteDoubledGaugeField ScalarUds;

    std::vector<int> lcoor;
    GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);

    peekLocalSite(ScalarUds, U_ds, lcoor);
    peekLocalSite(ScalarU, U, lcoor);
    ScalarUds(mu) = ScalarU();

    // Commit the updated site back into the doubled field.
    pokeLocalSite(ScalarUds, U_ds, lcoor);
  }
}
inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &UUUds, // for Naik term
DoubledGaugeField &Uds,
@ -946,23 +975,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
U = U *phases;
Udag = Udag *phases;
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
SiteScalarGaugeLink ScalarU;
SiteDoubledGaugeField ScalarUds;
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUds, Uds, lcoor);
peekLocalSite(ScalarU, U, lcoor);
ScalarUds(mu) = ScalarU();
peekLocalSite(ScalarU, Udag, lcoor);
ScalarUds(mu + 4) = ScalarU();
pokeLocalSite(ScalarUds, Uds, lcoor);
}
InsertGaugeField(Uds,U,mu);
InsertGaugeField(Uds,Udag,mu+4);
// 3 hop based on thin links. Crazy huh ?
U = PeekIndex<LorentzIndex>(Uthin, mu);
@ -974,24 +988,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
UUU = UUU *phases;
UUUdag = UUUdag *phases;
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
SiteScalarGaugeLink ScalarU;
SiteDoubledGaugeField ScalarUds;
std::vector<int> lcoor;
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUds, UUUds, lcoor);
peekLocalSite(ScalarU, UUU, lcoor);
ScalarUds(mu) = ScalarU();
peekLocalSite(ScalarU, UUUdag, lcoor);
ScalarUds(mu + 4) = ScalarU();
pokeLocalSite(ScalarUds, UUUds, lcoor);
}
InsertGaugeField(UUUds,UUU,mu);
InsertGaugeField(UUUds,UUUdag,mu+4);
}
}
@ -1033,29 +1031,29 @@ typedef WilsonImpl<vComplex, TwoIndexAntiSymmetricRepresentation, CoeffReal > W
typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffReal> DomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffReal> DomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffReal> DomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation, CoeffRealHalfComms> DomainWallVec5dImplDF; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplex> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplex> ZDomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplex> ZDomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
typedef DomainWallVec5dImpl<vComplex ,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplRL; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplFH; // Float
typedef DomainWallVec5dImpl<vComplexD,FundamentalRepresentation,CoeffComplexHalfComms> ZDomainWallVec5dImplDF; // Double
typedef GparityWilsonImpl<vComplex , Nc,CoeffReal> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc,CoeffReal> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, Nc,CoeffReal> GparityWilsonImplD; // Double
typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD; // Double
typedef GparityWilsonImpl<vComplex , Nc,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc,CoeffRealHalfComms> GparityWilsonImplFH; // Float
typedef GparityWilsonImpl<vComplexD, Nc,CoeffRealHalfComms> GparityWilsonImplDF; // Double
typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double
typedef StaggeredImpl<vComplex, FundamentalRepresentation > StaggeredImplR; // Real.. whichever prec
typedef StaggeredImpl<vComplexF, FundamentalRepresentation > StaggeredImplF; // Float

View File

@ -44,6 +44,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3,
template <class Impl>
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
RealD _mass,
RealD _c1, RealD _c2,RealD _u0,
const ImplParams &p)
: Kernels(p),
_grid(&Fgrid),
@ -62,6 +63,16 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
UUUmuOdd(&Hgrid) ,
_tmp(&Hgrid)
{
int vol4;
int LLs=1;
c1=_c1;
c2=_c2;
u0=_u0;
vol4= _grid->oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4= _cbgrid->oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
template <class Impl>
@ -69,22 +80,10 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _c2,RealD _u0,
const ImplParams &p)
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
{
c1=_c1;
c2=_c2;
u0=_u0;
ImportGauge(_Uthin,_Ufat);
}
template <class Impl>
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
const ImplParams &p)
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
{
ImportGaugeSimple(_Utriple,_Ufat);
}
////////////////////////////////////////////////////////////
// Momentum space propagator should be
@ -98,11 +97,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,Gaug
// of above link to implement fourier based solver.
////////////////////////////////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin)
{
ImportGauge(_Uthin,_Uthin);
};
template <class Impl>
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
{
/////////////////////////////////////////////////////////////////
@ -125,6 +119,20 @@ void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utripl
PokeIndex<LorentzIndex>(Umu, -U, mu+4);
}
CopyGaugeCheckerboards();
}
// Install already-doubled gauge fields directly: _U becomes the one-link
// field (Umu), _UUU the three-link field (UUUmu), and the checkerboarded
// copies are then refreshed.
template <class Impl>
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
{
  this->UUUmu = _UUU;
  this->Umu   = _U;
  this->CopyGaugeCheckerboards();
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
{
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
pickCheckerboard(Even, UUUmuEven,UUUmu);
@ -160,10 +168,7 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
}
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
pickCheckerboard(Even, UUUmuEven, UUUmu);
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
CopyGaugeCheckerboards();
}
/////////////////////////////
@ -322,6 +327,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
template <class Impl>
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
DhopCalls+=2;
conformable(in._grid, _grid); // verifies full grid
conformable(in._grid, out._grid);
@ -332,6 +338,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
DhopCalls+=1;
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
@ -343,6 +350,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
DhopCalls+=1;
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
@ -374,25 +382,193 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out, int dag) {
FermionField &out, int dag)
{
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
// Hopping term with halo communication overlapped with the interior compute.
//
// Sequence:
//   1. Prepare the stencil, gather the halo and merge the shared-memory part
//      (timed in DhopFaceTime).
//   2. Inside one OpenMP parallel region the first `ncomms` threads run
//      st.CommunicateThreaded() while the remaining threads apply the kernel
//      with interior=1, exterior=0 to a contiguous share of the sites
//      (timed in DhopComputeTime).
//   3. Merge the received halo (DhopFaceTime) and apply the kernel with
//      interior=0, exterior=1 to the sites in st.surface_list
//      (DhopComputeTime2).
//
// Only available when GRID_OMP is defined; otherwise the call asserts.
//
// Fixes relative to the previous version:
//   * the timer around CommsMerge was decremented twice instead of being
//     stopped with `+=`, corrupting DhopFaceTime;
//   * DhopTotalTime was started but never stopped;
//   * the unused local LLs was removed.
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
                                                                 DoubledGaugeField &U,
                                                                 DoubledGaugeField &UUU,
                                                                 const FermionField &in,
                                                                 FermionField &out, int dag)
{
#ifdef GRID_OMP
  Compressor compressor;
  int len = U._grid->oSites();

  DhopTotalTime -= usecond();

  DhopFaceTime -= usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  st.CommsMergeSHM(compressor);
  DhopFaceTime += usecond();

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Ugly explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  DhopComputeTime -= usecond();
#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
    int ncomms = CartesianCommunicator::nCommThreads;
    if (ncomms == -1) ncomms = 1;
    // At least one thread must be left over for the compute share.
    assert(nthreads > ncomms);

    if (tid >= ncomms) {
      // Compute thread: split `len` sites as evenly as possible over the
      // (nthreads - ncomms) compute threads; the first `rem` threads take
      // one extra site.
      nthreads -= ncomms;
      int ttid = tid - ncomms;
      int n = len;
      int chunk = n / nthreads;
      int rem = n % nthreads;
      int myblock, myn;
      if (ttid < rem) {
        myblock = ttid * chunk + ttid;
        myn = chunk+1;
      } else {
        myblock = ttid*chunk + rem;
        myn = chunk;
      }

      // do the compute
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
          // Interior = 1; Exterior = 0; must implement for staggered
          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          // Interior = 1; Exterior = 0;
          int sU = ss;
          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
        }
      }
    } else {
      // Communication thread.
      st.CommunicateThreaded();
    }
  }
  DhopComputeTime += usecond();

  // First to enter, last to leave timing
  DhopFaceTime -= usecond();
  st.CommsMerge(compressor);
  DhopFaceTime += usecond();   // stop the timer (was a second `-=`)

  // Exterior pass over the surface sites: Interior = 0; Exterior = 1.
  DhopComputeTime2 -= usecond();
  if (dag == DaggerYes) {
    int sz=st.surface_list.size();
    parallel_for (int ss = 0; ss < sz; ss++) {
      int sU = st.surface_list[ss];
      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
    }
  } else {
    int sz=st.surface_list.size();
    parallel_for (int ss = 0; ss < sz; ss++) {
      int sU = st.surface_list[ss];
      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
    }
  }
  DhopComputeTime2 += usecond();

  // Close the total-time interval opened at the top of the function.
  DhopTotalTime += usecond();
#else
  assert(0);
#endif
}
// Hopping term with communication completed before any computation:
// the halo exchange runs first, then the kernel is applied to every site of
// in._grid.  Accumulates DhopCommTime, DhopComputeTime and DhopTotalTime.
//   dag : must be DaggerNo or DaggerYes; selects DhopSite or DhopSiteDag.
// NOTE(review): each branch below shows two loop headers (the
// PARALLEL_FOR_LOOP/for pair and parallel_for) for a single loop body; this
// looks like both sides of the diff rendered together, and only one header
// can remain in compilable code.
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
DhopTotalTime -= usecond();
// Communication phase: full halo exchange before any site is computed.
DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
// Compute phase: apply the kernel to every outer site.
DhopComputeTime -= usecond();
if (dag == DaggerYes) {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
}
} else {
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
}
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
// Print the accumulated Dhop statistics through GridLogMessage:
//   - number of calls and the per-call total / comm / compute times,
//   - throughput figures derived from 1154 flops per site, halved for
//     red-black counting, over the default lattice volume,
//   - the reports of the three stencils.
// Side effect: DhopComputeTime is replaced by its average over all ranks
// (GlobalSum followed by division by the processor count).
template<class Impl>
void ImprovedStaggeredFermion<Impl>::Report(void)
{
  // Lattice volume: product of the first Nd default lattice extents.
  std::vector<int> dims = GridDefaultLatt();
  RealD sites = 1;
  for (int mu = 0; mu < Nd; mu++) {
    sites = sites * dims[mu];
  }
  RealD ranks = _grid->_Nprocessors;
  RealD nodes = _grid->NodeCount();

  // One helper per output line shape.
  auto timePerCall = [this](const char *label, double accumulated) {
    std::cout << GridLogMessage << label << accumulated / DhopCalls << " us" << std::endl;
  };
  auto rate = [](const char *label, RealD value) {
    std::cout << GridLogMessage << label << value << std::endl;
  };

  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : "
            << DhopCalls << std::endl;
  timePerCall("ImprovedStaggeredFermion TotalTime /Calls : ", DhopTotalTime);
  timePerCall("ImprovedStaggeredFermion CommTime /Calls : ", DhopCommTime);
  timePerCall("ImprovedStaggeredFermion ComputeTime/Calls : ", DhopComputeTime);

  // Average the compute time over ranks (this overwrites the member).
  _grid->GlobalSum(DhopComputeTime);
  DhopComputeTime /= ranks;

  // Flop count shared by both throughput figures.
  RealD flops = 1154*sites*DhopCalls;

  RealD mflops = flops/DhopComputeTime/2; // 2 for red black counting
  rate("Average mflops/s per call : ", mflops);
  rate("Average mflops/s per call per rank : ", mflops/ranks);
  rate("Average mflops/s per call per node : ", mflops/nodes);

  RealD Fullmflops = flops/(DhopTotalTime)/2; // 2 for red black counting
  rate("Average mflops/s per call (full) : ", Fullmflops);
  rate("Average mflops/s per call per rank (full): ", Fullmflops/ranks);
  rate("Average mflops/s per call per node (full): ", Fullmflops/nodes);

  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" <<std::endl;
  Stencil.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl;
  StencilEven.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl;
  StencilOdd.Report();
}
// Reset every performance counter of this operator and of its three stencils.
// DhopComputeTime2 is included: it is accumulated by the overlapped
// comms/compute path, so leaving it out would carry stale time across resets.
template<class Impl>
void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
{
  DhopCalls        = 0;
  DhopTotalTime    = 0;
  DhopCommTime     = 0;
  DhopComputeTime  = 0;
  DhopComputeTime2 = 0;
  DhopFaceTime     = 0;

  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////

View File

@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
@ -105,25 +117,34 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
// Constructor
//////////////////////////////////////////////////////////////////////////
// Grid own interface Constructor
//////////////////////////////////////////////////////////////////////////
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
const ImplParams &p = ImplParams());
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _c2,RealD _u0,
const ImplParams &p = ImplParams());
//////////////////////////////////////////////////////////////////////////
// MILC constructor no gauge fields
//////////////////////////////////////////////////////////////////////////
ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat);
void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
void ImportGauge(const GaugeField &_Uthin);
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
void ImportGaugeSimple(const GaugeField &_UUU ,const GaugeField &_U);
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
DoubledGaugeField &GetU(void) { return Umu ; } ;
DoubledGaugeField &GetUUU(void) { return UUUmu; };
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members required to support the functionality
@ -132,7 +153,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
// protected:
public:
// any other parameters of action ???
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
RealD mass;
RealD u0;
RealD c1;

View File

@ -41,8 +41,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3,
// 5d lattice for DWF.
template<class Impl>
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
GridCartesian &FiveDimGrid,
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
@ -121,16 +120,74 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
assert(FiveDimGrid._simd_layout[0] ==1);
}
int LLs = FiveDimGrid._rdimensions[0];
int vol4= FourDimGrid.oSites();
Stencil.BuildSurfaceList(LLs,vol4);
// Allocate the required comms buffer
vol4=FourDimRedBlackGrid.oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
// Rebuild the even/odd checkerboard copies of both doubled gauge fields
// (UUUmu and Umu) from the full fields.
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
{
  // UUUmu -> UUUmuEven / UUUmuOdd
  pickCheckerboard(Even, UUUmuEven, UUUmu);
  pickCheckerboard(Odd,  UUUmuOdd,  UUUmu);

  // Umu -> UmuEven / UmuOdd
  pickCheckerboard(Even, UmuEven, Umu);
  pickCheckerboard(Odd,  UmuOdd,  Umu);
}
// Constructor taking thin and fat gauge links: delegates all grid and
// parameter set-up to the gauge-free constructor, then imports the links.
template<class Impl>
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
                                                             GaugeField &_Ufat,
                                                             GridCartesian &FiveDimGrid,
                                                             GridRedBlackCartesian &FiveDimRedBlackGrid,
                                                             GridCartesian &FourDimGrid,
                                                             GridRedBlackCartesian &FourDimRedBlackGrid,
                                                             RealD _mass,
                                                             RealD _c1,
                                                             RealD _c2,
                                                             RealD _u0,
                                                             const ImplParams &p)
  : ImprovedStaggeredFermion5D(FiveDimGrid,
                               FiveDimRedBlackGrid,
                               FourDimGrid,
                               FourDimRedBlackGrid,
                               _mass, _c1, _c2, _u0, p)
{
  this->ImportGauge(_Uthin, _Ufat);
}
///////////////////////////////////////////////////
// For MILC use; pass three link U's and 1 link U
///////////////////////////////////////////////////
// Import pre-built links without applying any coefficients.
//   _Utriple  links inserted into UUUmu (backward copy shifted by 3 sites)
//   _Ufat     links inserted into Umu   (backward copy shifted by 1 site)
// For each direction mu the forward link goes to slot mu, and the negated
// adjoint of the shifted link goes to slot mu+4. The even/odd copies are
// refreshed at the end.
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
{
  /////////////////////////////////////////////////////////////////
  // Trivial import; phases and fattening and such like preapplied
  /////////////////////////////////////////////////////////////////
  for (int mu = 0; mu < Nd; mu++) {

    // UUUmu: forward link, then backward link from three sites back
    auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
    Impl::InsertGaugeField(UUUmu,U,mu);

    U = adj( Cshift(U, mu, -3));
    Impl::InsertGaugeField(UUUmu,-U,mu+4);

    // Umu: forward link, then backward link from one site back
    U = PeekIndex<LorentzIndex>(_Ufat, mu);
    Impl::InsertGaugeField(Umu,U,mu);

    U = adj( Cshift(U, mu, -1));
    Impl::InsertGaugeField(Umu,-U,mu+4);
  }
  CopyGaugeCheckerboards();
}
// Import already-doubled gauge fields verbatim: _UUU is copied into UUUmu and
// _U into Umu, then the even/odd checkerboard copies are regenerated.
// Nothing is rescaled or re-phased here.
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,
                                                         const DoubledGaugeField &_U)
{
  this->UUUmu = _UUU;
  this->Umu   = _U;
  this->CopyGaugeCheckerboards();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
{
@ -159,10 +216,7 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,cons
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
}
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
pickCheckerboard(Even, UUUmuEven, UUUmu);
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
CopyGaugeCheckerboards();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
@ -223,6 +277,162 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
assert(0);
}
/*CHANGE */
// Dispatch Dhop to the overlapped or the serial comms implementation.
// The overlapped path is only considered when built with GRID_OMP and when
// StaggeredKernelsStatic::Comms is CommsAndCompute; every other case uses
// the serial path.
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                                    DoubledGaugeField & U,DoubledGaugeField & UUU,
                                                    const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
  const bool overlapComms =
    (StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute);
  if (overlapComms) {
    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
    return;
  }
#endif
  DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
// Dhop with communication overlapped with interior computation (GRID_OMP only;
// asserts when built without it).
//
// Phases:
//   1. Gather the halo and merge shared-memory faces          (DhopFaceTime)
//   2. Inside one OpenMP parallel region, the first `ncomms` threads drive
//      st.CommunicateThreaded() while the remaining threads apply the
//      interior-only kernel (interior=1, exterior=0) to a contiguous chunk of
//      the sites of U's grid. The slowest communication and compute threads
//      are recorded through max-reductions (DhopCommTime, DhopComputeTime).
//   3. Collate threads and merge the received faces           (DhopFaceTime)
//   4. Apply the exterior-only kernel (interior=0, exterior=1) to the sites
//      in st.surface_list                                     (DhopComputeTime2)
//
// DhopTotalTime brackets the whole routine, matching the accounting done by
// DhopInternalSerialComms, so the total time is populated whichever comms
// strategy is selected.
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
                                                                   DoubledGaugeField & U,DoubledGaugeField & UUU,
                                                                   const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));

  DhopTotalTime -= usecond();

  Compressor compressor;

  int LLs = in._grid->_rdimensions[0];

  DhopFaceTime-=usecond();
  st.Prepare();
  st.HaloGather(in,compressor);
  //  st.HaloExchangeOptGather(in,compressor); // Wilson compressor
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();

  double ctime=0;
  double ptime=0;

  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Ugly explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
  {
    int tid = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
    int ncomms = CartesianCommunicator::nCommThreads;
    if (ncomms == -1) ncomms = 1;
    // At least one thread must be left over for compute.
    assert(nthreads > ncomms);
    if (tid >= ncomms) {
      double start = usecond();
      // Compute threads are renumbered 0..nthreads-1 after removing comm threads.
      nthreads -= ncomms;
      int ttid  = tid - ncomms;
      int n     = U._grid->oSites(); // 4d vol
      int chunk = n / nthreads;
      int rem   = n % nthreads;
      int myblock, myn;
      // The first `rem` threads take one extra site each.
      if (ttid < rem) {
        myblock = ttid * chunk + ttid;
        myn = chunk+1;
      } else {
        myblock = ttid*chunk + rem;
        myn = chunk;
      }

      // do the compute
      if (dag == DaggerYes) {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          int sU = ss;
          // Interior = 1; Exterior = 0; must implement for staggered
          Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
        }
      } else {
        for (int ss = myblock; ss < myblock+myn; ++ss) {
          // Interior = 1; Exterior = 0;
          int sU = ss;
          Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
        }
      }
      ptime = usecond() - start;
    } else {
      double start = usecond();
      st.CommunicateThreaded();
      ctime = usecond() - start;
    }
  }
  DhopCommTime += ctime;
  DhopComputeTime+=ptime;

  // First to enter, last to leave timing
  st.CollateThreads();

  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    int sz=st.surface_list.size();
    parallel_for (int ss = 0; ss < sz; ss++) {
      int sU = st.surface_list[ss];
      Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
    }
  } else {
    int sz=st.surface_list.size();
    parallel_for (int ss = 0; ss < sz; ss++) {
      int sU = st.surface_list[ss];
      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
    }
  }
  DhopComputeTime2+=usecond();

  DhopTotalTime += usecond();
#else
  assert(0);
#endif
}
// Dhop with a blocking halo exchange followed by the site kernel on every
// outer site of U's grid. LLs is taken from dimension 0 of in's grid and
// handed to the kernel together with each site index of U's grid.
// Accumulates DhopTotalTime, DhopCommTime and DhopComputeTime.
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
                                                               DoubledGaugeField & U,DoubledGaugeField & UUU,
                                                               const FermionField &in, FermionField &out,int dag)
{
  Compressor compressor;
  int LLs = in._grid->_rdimensions[0];

  DhopTotalTime -= usecond();

  // Communication
  DhopCommTime -= usecond();
  st.HaloExchange(in,compressor);
  DhopCommTime += usecond();

  // Computation: loop over the sites of U's grid
  DhopComputeTime -= usecond();
  const bool dagger = (dag == DaggerYes);
  if (!dagger) {
    parallel_for (int site = 0; site < U._grid->oSites(); site++) {
      Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,site,in,out);
    }
  } else {
    parallel_for (int site = 0; site < U._grid->oSites(); site++) {
      Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, site,in, out);
    }
  }
  DhopComputeTime += usecond();

  DhopTotalTime += usecond();
}
/*CHANGE END*/
/* ORG
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
@ -254,6 +464,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
DhopComputeTime += usecond();
DhopTotalTime += usecond();
}
*/
template<class Impl>
@ -336,6 +547,9 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();

View File

@ -64,6 +64,8 @@ namespace QCD {
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
@ -119,7 +121,27 @@ namespace QCD {
FermionField &out,
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
////////////////////////////////////////////////////////////////////////////////////////////////
// Grid internal interface -- Thin link and fat link, with coefficients
////////////////////////////////////////////////////////////////////////////////////////////////
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
GaugeField &_Ufat,
GridCartesian &FiveDimGrid,
@ -127,17 +149,37 @@ namespace QCD {
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
double _mass,
RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
RealD _c1, RealD _c2,RealD _u0,
const ImplParams &p= ImplParams());
// DoubleStore
void ImportGauge(const GaugeField &_U);
void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
////////////////////////////////////////////////////////////////////////////////////////////////
// MILC constructor ; triple links, no rescale factors; must be externally pre multiplied
////////////////////////////////////////////////////////////////////////////////////////////////
ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
double _mass,
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
const ImplParams &p= ImplParams());
// DoubleStore gauge field in operator
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
// Give a reference; can be used to do an assignment or copy back out after import
// if Carleton wants to cache them and not use the ImportSimple
DoubledGaugeField &GetU(void) { return Umu ; } ;
DoubledGaugeField &GetUUU(void) { return UUUmu; };
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members required to support the functionality
///////////////////////////////////////////////////////////////
public:
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
GridBase *_FourDimGrid;
GridBase *_FourDimRedBlackGrid;

View File

@ -853,7 +853,7 @@ namespace QCD {
a0 = a0 + incr;
a1 = a1 + incr;
a2 = a2 + sizeof(Simd::scalar_type);
a2 = a2 + sizeof(typename Simd::scalar_type);
}
}

View File

@ -396,6 +396,27 @@ namespace Grid {
amax=zolo_hi;
}
// Extract the physical 4d solution from a 5d solution: s-slice Ls-1 of
// solution5d is copied into exported4d.
//   solution5d  must live on FermionGrid()
//   exported4d  must live on GaugeGrid()
// ExtractSlice takes (low-dim field, high-dim field, slice index, orthogonal
// dimension). The orthogonal dimension is the s-direction, which is
// dimension 0 of the 5d grid, so it is passed as 0 rather than Ls-1.
template<class Impl>
void PartialFractionFermion5D<Impl>::ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d)
{
  int Ls = this->Ls;
  conformable(solution5d._grid,this->FermionGrid());
  conformable(exported4d._grid,this->GaugeGrid());
  const int sDirection = 0; // fifth dimension is dimension 0 of the 5d grid
  ExtractSlice(exported4d, solution5d, Ls-1, sDirection);
}
// Build a 5d source from a physical 4d source: input4d is inserted into
// s-slice Ls-1 of an otherwise zero 5d field, multiplied by Gamma5, and
// passed through Dminus into imported5d.
//   input4d     must live on GaugeGrid()
//   imported5d  must live on FermionGrid()
// InsertSlice takes (low-dim field, high-dim field, slice index, orthogonal
// dimension). The orthogonal dimension is the s-direction, which is
// dimension 0 of the 5d grid, so it is passed as 0 rather than Ls-1.
template<class Impl>
void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
{
  int Ls = this->Ls;
  conformable(imported5d._grid,this->FermionGrid());
  conformable(input4d._grid   ,this->GaugeGrid());
  FermionField tmp(this->FermionGrid());
  tmp=zero;
  const int sDirection = 0; // fifth dimension is dimension 0 of the 5d grid
  InsertSlice(input4d, tmp, Ls-1, sDirection);
  tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
  this->Dminus(tmp,imported5d);
}
// Constructors
template<class Impl>
PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,

View File

@ -70,6 +70,12 @@ namespace Grid {
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
///////////////////////////////////////////////////////////////
// Physical surface field utilities
///////////////////////////////////////////////////////////////
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ImportPhysicalFermionSource (const FermionField &input4d,FermionField &imported5d);
// Constructors
PartialFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -32,223 +32,241 @@ namespace Grid {
namespace QCD {
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
// One stencil leg, interior and exterior together.
// Looks up the stencil entry for direction Dir+skew at fermion site sF and
// points chi_p at the neighbour spinor: the permuted local value (via the
// scratch `chi`), the local value in place, or the entry in the comms buffer
// `buf` when the neighbour is not local. Then applies
// multLink(Uchi, U._odata[sU], *chi_p, Dir).
// Expects these names in the expanding scope:
//   st, in, buf, sF, sU, ptype, SE, chi, chi_p, Uchi.
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in._odata[SE->_offset], ptype); \
} else { \
chi_p = &in._odata[SE->_offset]; \
} \
} else { \
chi_p = &buf[SE->_offset]; \
} \
multLink(Uchi, U._odata[sU], *chi_p, Dir);
// One stencil leg, interior contributions only.
// Same neighbour lookup as GENERIC_STENCIL_LEG, but the comms buffer is only
// read when st.same_node[Dir] is set, and multLink is only applied when the
// neighbour is local or on the same node. For any other leg chi_p is left
// untouched and nothing is added to Uchi.
// Expects these names in the expanding scope:
//   st, in, buf, sF, sU, ptype, SE, chi, chi_p, Uchi.
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in._odata[SE->_offset], ptype); \
} else { \
chi_p = &in._odata[SE->_offset]; \
} \
} else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
}
// One stencil leg, exterior contributions only.
// Acts only when the neighbour is neither local nor flagged in
// st.same_node[Dir]: increments nmu, reads the neighbour from the comms
// buffer `buf` and applies multLink(Uchi, U._odata[sU], *chi_p, Dir).
// Expects these names in the expanding scope:
//   st, buf, sF, sU, ptype, SE, chi_p, Uchi, nmu.
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
chi_p = &buf[SE->_offset]; \
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
}
// Construct the kernels object; the implementation parameters are simply
// forwarded to the Base constructor.
template <class Impl>
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p)
  : Base(p)
{
}
////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
// Int, Ext, Int+Ext cases for comms overlap
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteSpinor *buf, int sF,
int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out, int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int skew = 0;
if (threeLink) skew=8;
///////////////////////////
// Xp
///////////////////////////
int skew;
SE = st.GetEntry(ptype, Xp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
skew = 0;
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
if ( dag ) {
Uchi = - Uchi;
}
vstream(out._odata[sF], Uchi);
}
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp);
///////////////////////////
// Yp
///////////////////////////
SE = st.GetEntry(ptype, Yp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp);
///////////////////////////
// Zp
///////////////////////////
SE = st.GetEntry(ptype, Zp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp);
///////////////////////////
// Tp
///////////////////////////
SE = st.GetEntry(ptype, Tp+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp);
///////////////////////////
// Xm
///////////////////////////
SE = st.GetEntry(ptype, Xm+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm);
///////////////////////////
// Ym
///////////////////////////
SE = st.GetEntry(ptype, Ym+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym);
///////////////////////////
// Zm
///////////////////////////
SE = st.GetEntry(ptype, Zm+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm);
///////////////////////////
// Tm
///////////////////////////
SE = st.GetEntry(ptype, Tm+skew, sF);
if (SE->_is_local) {
if (SE->_permute) {
chi_p = &chi;
permute(chi, in._odata[SE->_offset], ptype);
} else {
chi_p = &in._odata[SE->_offset];
}
} else {
chi_p = &buf[SE->_offset];
}
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm);
vstream(out, Uchi);
};
///////////////////////////////////////////////////
// Only contributions from interior of our node
///////////////////////////////////////////////////
// Interior-only generic kernel for gauge site sU.
// For each s in [0,LLs) the fermion site is sF = LLs*sU + s. Uchi is zeroed,
// then the eight legs using U (skew 0) and the eight legs using UUU (skew 8)
// are accumulated with GENERIC_STENCIL_LEG_INT, which skips any leg whose
// neighbour is neither local nor on the same node. The sum, negated when
// dag is non-zero, is streamed to out._odata[sF], overwriting it.
// The local names below are referenced by the macro and must keep these
// spellings.
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// Legs using U: stencil entries Dir+0
skew = 0;
Uchi=zero;
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
// Legs using UUU: stencil entries Dir+8
skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
// Dagger only flips the overall sign
if ( dag ) {
Uchi = - Uchi;
}
vstream(out._odata[sF], Uchi);
}
};
///////////////////////////////////////////////////
// Only contributions from exterior of our node
///////////////////////////////////////////////////
// Exterior-only generic kernel for gauge site sU.
// For each s in [0,LLs) the fermion site is sF = LLs*sU + s. Uchi is zeroed
// and only the legs whose neighbour is neither local nor on the same node are
// accumulated (GENERIC_STENCIL_LEG_EXT); each such leg increments nmu. When
// nmu is non-zero the result is added to out._odata[sF] (subtracted when dag
// is non-zero), so out must already hold the interior contribution.
// NOTE(review): nmu is initialised once before the s loop and never reset,
// so after the first s with an exterior leg every later s also takes the
// accumulate branch (adding a zero Uchi if it has no exterior legs) — confirm
// this is intended.
// The local names below are referenced by the macro and must keep these
// spellings.
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
// Count of exterior legs encountered; incremented inside the macro
int nmu=0;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// Legs using U: stencil entries Dir+0
skew = 0;
Uchi=zero;
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
// Legs using UUU: stencil entries Dir+8
skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
// Accumulate into out only when an exterior leg has been seen
if ( nmu ) {
if ( dag ) {
out._odata[sF] = out._odata[sF] - Uchi;
} else {
out._odata[sF] = out._odata[sF] + Uchi;
}
}
}
};
////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out) {
SiteSpinor naik;
SiteSpinor naive;
int oneLink =0;
int threeLink=1;
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,
int interior,int exterior)
{
int dag=1;
switch(Opt) {
#ifdef AVX512
//FIXME; move the sign into the Asm routine
case OptInlineAsm:
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
for(int s=0;s<LLs;s++) {
int sF=s+LLs*sU;
out._odata[sF]=-out._odata[sF];
}
break;
#endif
case OptHandUnroll:
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
break;
case OptGeneric:
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =-naive-naik;
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
// Non-daggered entry point: forwards to the DhopSite overload that takes an
// explicit dag argument, with dag fixed to 0. The interior / exterior flags
// are passed through unchanged.
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo,
                                      DoubledGaugeField &U, DoubledGaugeField &UUU,
                                      SiteSpinor *buf, int LLs, int sU,
                                      const FermionField &in, FermionField &out,
                                      int interior, int exterior)
{
  const int noDagger = 0;
  DhopSite(st, lo, U, UUU, buf, LLs, sU, in, out, noDagger, interior, exterior);
}
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
int sU, const FermionField &in, FermionField &out,
int dag,int interior,int exterior)
{
int oneLink =0;
int threeLink=1;
SiteSpinor naik;
SiteSpinor naive;
int dag=0;
switch(Opt) {
#ifdef AVX512
case OptInlineAsm:
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else {
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
assert(0);
}
break;
#endif
case OptHandUnroll:
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
case OptGeneric:
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
// assert(sF<in._odata.size());
// assert(sU< U._odata.size());
// assert(sF>=0); assert(sU>=0);
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
out._odata[sF] =naive+naik;
if ( interior && exterior ) {
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
default:

View File

@ -38,8 +38,9 @@ namespace QCD {
// Run-time switches shared by every StaggeredKernels instantiation.
// Each static is declared exactly once; the definitions live in the
// implementation file.
class StaggeredKernelsStatic {
public:
  // Values for Opt: which site-kernel implementation to run.
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
  // Values for Comms: overlap communication with compute, or communicate
  // first and compute afterwards.
  enum { CommsAndCompute, CommsThenCompute };
  // S-direction is INNERMOST and takes no part in the parity.
  static int Opt;
  static int Comms;
};
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
@ -53,24 +54,62 @@ public:
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
int LLs, int sU, const FermionField &in, FermionField &out, int dag);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
int LLs, int sU, const FermionField &in, FermionField &out);
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,
int LLs, int sU, const FermionField &in, FermionField &out);
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
public:

View File

@ -560,16 +560,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
// nREDUCE(out): sign-flipped counterpart of REDUCE, selected by the
// DhopSiteAsm specialisations in this file when `dag` is non-zero.
//  1. Combines the four partial accumulator sets: UChi_1x is added into
//     UChi_0x, UChi_2x into UChi_3x, and then UChi_3x into UChi_0x (x = 0,1,2).
//  2. Zeroes Chi_00 (so Chi_00 is clobbered) and applies VSUB between each
//     UChi_0x and that zero register, writing back into UChi_0x.
//     Presumably this yields 0 - UChi_0x; the operand order of VSUB is defined
//     in the SIMD headers, not here -- TODO confirm.
//  3. Stores the three results to slots 0,1,2 relative to the address in `out`.
// Only the final asm statement declares a "memory" clobber.
#define nREDUCE(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) \
VADD(UChi_30,UChi_20,UChi_30) \
VADD(UChi_31,UChi_21,UChi_31) \
VADD(UChi_32,UChi_22,UChi_32) \
VADD(UChi_00,UChi_30,UChi_00) \
VADD(UChi_01,UChi_31,UChi_01) \
VADD(UChi_02,UChi_32,UChi_02) ); \
asm (VZERO(Chi_00) \
VSUB(UChi_00,Chi_00,UChi_00) \
VSUB(UChi_01,Chi_00,UChi_01) \
VSUB(UChi_02,Chi_00,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
// REDUCEa(out): two-accumulator reduction.
// Adds UChi_1x into UChi_0x (x = 0,1,2) and stores the three sums to
// slots 0,1,2 relative to the address held in `out`. No sign change is applied.
// The DhopSiteAsm specialisations in this file use it when `dag` is zero.
#define REDUCEa(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
// FIXME is sign right in the VSUB ?
#define nREDUCEa(out) \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) ); \
asm (VZERO(Chi_00) \
VSUB(UChi_00,Chi_00,UChi_00) \
VSUB(UChi_01,Chi_00,UChi_01) \
VSUB(UChi_02,Chi_00,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0);\
@ -581,10 +618,9 @@ namespace QCD {
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
assert(0);
};
@ -645,10 +681,9 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
// This is the single precision 5th direction vectorised kernel
#include <simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
@ -685,7 +720,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
REDUCE(addr0);
if ( dag ) {
nREDUCE(addr0);
} else {
REDUCE(addr0);
}
}
#else
assert(0);
@ -695,10 +734,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
#include <simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
@ -734,7 +772,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
REDUCE(addr0);
if ( dag ) {
nREDUCE(addr0);
} else {
REDUCE(addr0);
}
}
#else
assert(0);
@ -776,10 +818,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
#include <simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
@ -832,7 +873,11 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
REDUCEa(addr0);
if ( dag ) {
nREDUCEa(addr0);
} else {
REDUCEa(addr0);
}
}
#else
assert(0);
@ -841,10 +886,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
#include <simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out)
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
@ -897,7 +941,11 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
REDUCEa(addr0);
if ( dag ) {
nREDUCEa(addr0);
} else {
REDUCEa(addr0);
}
}
#else
assert(0);
@ -909,7 +957,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
DoubledGaugeField &U, \
DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, \
int sU, const FermionField &in, FermionField &out);
int sU, const FermionField &in, FermionField &out,int dag);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);

View File

@ -28,7 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid.h>
#define REGISTER
#define LOAD_CHI(b) \
const SiteSpinor & ref (b[offset]); \
@ -59,7 +58,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
UChi ## _1 += U_12*Chi_2;\
UChi ## _2 += U_22*Chi_2;
#define MULT_ADD(A,UChi) \
#define MULT_ADD(U,A,UChi) \
auto & ref(U._odata[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
@ -82,241 +81,319 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0);\
permute##dir(Chi_1,Chi_1);\
permute##dir(Chi_2,Chi_2);
permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2);
// HAND_STENCIL_LEG_BASE(Dir,Perm,skew): shared "fetch the neighbour" step of
// one stencil leg.
//   Dir  - direction index; the stencil entry looked up is Dir+skew
//   Perm - suffix selecting the permute##Perm routine used by PERMUTE_DIR
//   skew - offset added to Dir when indexing the stencil
// Looks up the stencil entry for site sF, then loads Chi_0..2 either from
// in._odata (entry is local, permuting when SE->_permute is set) or from buf.
// Expands in the caller's scope and expects SE, ptype, offset, local, perm,
// sF, st, in and buf to be declared there.
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in._odata); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else { \
LOAD_CHI(buf); \
}
// HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even): first leg feeding an accumulator.
// Fetches the neighbour via HAND_STENCIL_LEG_BASE, then applies MULT(Dir,even).
// `even` is the accumulator name prefix (callers pass `even` or `odd`).
// Unlike HAND_STENCIL_LEG this takes no gauge-field argument; MULT is defined
// outside this view and presumably initialises the accumulator from the
// field named U in the caller's scope -- TODO confirm.
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
{ \
MULT(Dir,even); \
}
// HAND_STENCIL_LEG(U,Dir,Perm,skew,even): subsequent leg of the stencil sum.
// Fetches the neighbour via HAND_STENCIL_LEG_BASE, then applies
// MULT_ADD(U,Dir,even), which reads the link U._odata[sU](Dir) and accumulates
// into the even_*/odd_* registers named by the `even` prefix.
#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
{ \
MULT_ADD(U,Dir,even); \
}
// HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even): "interior" variant of a leg.
// Loads Chi from in._odata when the stencil entry is local (permuting if
// requested), or from buf when st.same_node[Dir] is set; the multiply-add is
// performed only in those two cases. Legs that are neither local nor on the
// same node contribute nothing here.
// NOTE(review): same_node is indexed by Dir while the stencil entry uses
// Dir+skew -- confirm this is intended for the skewed (UUU) legs.
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in._odata); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else if ( st.same_node[Dir] ) { \
LOAD_CHI(buf); \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
MULT_ADD(U,Dir,even); \
}
// HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even): "exterior" variant of a leg,
// the complement of HAND_STENCIL_LEG_INT.
// Acts only when the stencil entry is not local and st.same_node[Dir] is not
// set: increments the caller's nmu counter, loads Chi from buf and performs
// the multiply-add. No permute is applied on this path; `local` and `perm`
// are assigned but not otherwise read inside this macro.
// Requires nmu to be declared in the caller's scope.
#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
{ LOAD_CHI(buf); } \
{ MULT_ADD(U,Dir,even); } \
}
namespace Grid {
namespace QCD {
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
                                          SiteSpinor *buf, int LLs,
                                          int sU, const FermionField &in, FermionField &out, int dag)
{
  // Values handed to DhopSiteDepthHand as its threeLink selector.
  int oneLink   = 0;
  int threeLink = 1;
  int skew(0);

  // Overall sign of the result: -1 when dag is set, +1 otherwise.
  Real scale = dag ? Real(-1.0) : Real(1.0);

  SiteSpinor naive;   // term computed from U
  SiteSpinor naik;    // term computed from UUU

  // Walk the LLs consecutive sF indices that belong to 4d site sU.
  const int sFfirst = LLs*sU;
  for(int sF=sFfirst; sF<sFfirst+LLs; ++sF){
    DhopSiteDepthHand(st,lo,U,  buf,sF,sU,in,naive,oneLink);
    DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik, threeLink);
    out._odata[sF] = scale*(naive+naik);
  }
}
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteSpinor *buf, int sF,
int sU, const FermionField &in, SiteSpinor &out,int threeLink)
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd even_0; // 12 regs on knc
REGISTER Simd even_1;
REGISTER Simd even_2;
REGISTER Simd odd_0; // 12 regs on knc
REGISTER Simd odd_1;
REGISTER Simd odd_2;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
REGISTER Simd Chi_0; // two spinor; 6 regs
REGISTER Simd Chi_1;
REGISTER Simd Chi_2;
REGISTER Simd U_00; // two rows of U matrix
REGISTER Simd U_10;
REGISTER Simd U_20;
REGISTER Simd U_01;
REGISTER Simd U_11;
REGISTER Simd U_21; // 2 reg left.
REGISTER Simd U_02;
REGISTER Simd U_12;
REGISTER Simd U_22;
int skew = 0;
if (threeLink) skew=8;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
// Xp
SE=st.GetEntry(ptype,Xp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT(Xp,even);
}
// Yp
SE=st.GetEntry(ptype,Yp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT(Yp,odd);
}
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// Zp
SE=st.GetEntry(ptype,Zp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
skew = 0;
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
HAND_STENCIL_LEG (U,Zp,1,skew,even);
HAND_STENCIL_LEG (U,Tp,0,skew,odd);
HAND_STENCIL_LEG (U,Xm,3,skew,even);
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
HAND_STENCIL_LEG (U,Zm,1,skew,even);
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
skew = 8;
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
} else {
LOAD_CHI(buf);
vstream(out._odata[sF],result);
}
{
MULT_ADD(Zp,even);
}
// Tp
SE=st.GetEntry(ptype,Tp+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Tp,odd);
}
// Xm
SE=st.GetEntry(ptype,Xm+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Xm,even);
}
// Ym
SE=st.GetEntry(ptype,Ym+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Ym,odd);
}
// Zm
SE=st.GetEntry(ptype,Zm+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Zm,even);
}
// Tm
SE=st.GetEntry(ptype,Tm+skew,sF);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHI(in._odata);
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI(buf);
}
{
MULT_ADD(Tm,odd);
}
vstream(out()()(0),even_0+odd_0);
vstream(out()()(1),even_1+odd_1);
vstream(out()()(2),even_2+odd_2);
}
// Interior-only hand-unrolled staggered hopping term.
// For each of the LLs sF indices belonging to 4d site sU, accumulates the
// stencil legs that HAND_STENCIL_LEG_INT accepts (entry is local, or
// st.same_node[Dir] is set) and overwrites out._odata[sF] with the result.
//   st, lo   - stencil and Lebesgue ordering (lo is not referenced in this body)
//   U, UUU   - gauge fields used for the skew=0 and skew=8 legs respectively
//   buf      - buffer read for non-local neighbours
//   LLs, sU  - number of s-slices and the 4d site index; sF = s + LLs*sU
//   in, out  - input and output fermion fields
//   dag      - when non-zero the accumulated sum is negated
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
// Two independent accumulator sets (even_*/odd_*); legs alternate between them.
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
// Scratch variables written by the HAND_STENCIL_LEG_* macros; the names must
// match what those macros expect.
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// Accumulators start from zero because every leg here uses MULT_ADD and any
// leg may be skipped.
even_0 = zero; even_1 = zero; even_2 = zero;
odd_0 = zero; odd_1 = zero; odd_2 = zero;
// Legs looked up at stencil index Dir+0, multiplied by U.
skew = 0;
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
// Legs looked up at stencil index Dir+8, multiplied by UUU.
skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
// The store below is unconditional, so out._odata[sF] is always overwritten.
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
vstream(out._odata[sF],result);
}
}
// Exterior-only hand-unrolled staggered hopping term.
// For each of the LLs sF indices belonging to 4d site sU, accumulates only the
// stencil legs that HAND_STENCIL_LEG_EXT accepts (entry is not local and
// st.same_node[Dir] is not set) and ADDS the result to the value already in
// out._odata[sF]. Sites with no such leg are left untouched.
// Presumably meant to run after DhopSiteHandInt has written the interior
// part -- the ordering is enforced by the caller, not here.
//   st, lo   - stencil and Lebesgue ordering (lo is not referenced in this body)
//   U, UUU   - gauge fields used for the skew=0 and skew=8 legs respectively
//   buf      - buffer read for the off-node neighbours
//   LLs, sU  - number of s-slices and the 4d site index; sF = s + LLs*sU
//   in, out  - input field and the output field that is updated in place
//   dag      - when non-zero the accumulated sum is negated before being added
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
// Two independent accumulator sets (even_*/odd_*); legs alternate between them.
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
// Scratch variables written by the HAND_STENCIL_LEG_* macros; the names must
// match what those macros expect.
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
even_0 = zero; even_1 = zero; even_2 = zero;
odd_0 = zero; odd_1 = zero; odd_2 = zero;
// Incremented by HAND_STENCIL_LEG_EXT for every leg that actually contributes.
int nmu=0;
// Legs looked up at stencil index Dir+0, multiplied by U.
skew = 0;
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
// Legs looked up at stencil index Dir+8, multiplied by UUU.
skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
// Add sum of all exterior connected stencil legs
// (skipped entirely when no leg contributed, so the site is not rewritten).
if ( nmu ) {
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
out._odata[sF] = out._odata[sF] + result;
}
}
}
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, \
int sU, const FermionField &in, FermionField &out, int dag);
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \
SiteSpinor *buf, int sF, \
int sU, const FermionField &in, SiteSpinor &out,int threeLink) ;
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);
}}
}
}

View File

@ -69,39 +69,47 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
/*****************************************************/
/* Compress includes precision change if mpi data is not same */
/*****************************************************/
inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
projector::Proj(buf[o],in,mu,dag);
inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) {
SiteHalfSpinor tmp;
projector::Proj(tmp,in,mu,dag);
vstream(buf[o],tmp);
}
/*****************************************************/
/* Exchange includes precision change if mpi data is not same */
/*****************************************************/
inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor *vp0,
SiteHalfSpinor *vp1,
inline void Exchange(SiteHalfSpinor * __restrict__ mp,
const SiteHalfSpinor * __restrict__ vp0,
const SiteHalfSpinor * __restrict__ vp1,
Integer type,Integer o){
exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
SiteHalfSpinor tmp1;
SiteHalfSpinor tmp2;
exchange(tmp1,tmp2,vp0[o],vp1[o],type);
vstream(mp[2*o ],tmp1);
vstream(mp[2*o+1],tmp2);
}
/*****************************************************/
/* Have a decompression step if mpi data is not same */
/*****************************************************/
inline void Decompress(SiteHalfSpinor *out,
SiteHalfSpinor *in, Integer o) {
inline void Decompress(SiteHalfSpinor * __restrict__ out,
SiteHalfSpinor * __restrict__ in, Integer o) {
assert(0);
}
/*****************************************************/
/* Compress Exchange */
/*****************************************************/
inline void CompressExchange(SiteHalfSpinor *out0,
SiteHalfSpinor *out1,
const SiteSpinor *in,
inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
SiteHalfSpinor * __restrict__ out1,
const SiteSpinor * __restrict__ in,
Integer j,Integer k, Integer m,Integer type){
SiteHalfSpinor temp1, temp2,temp3,temp4;
projector::Proj(temp1,in[k],mu,dag);
projector::Proj(temp2,in[m],mu,dag);
exchange(out0[j],out1[j],temp1,temp2,type);
exchange(temp3,temp4,temp1,temp2,type);
vstream(out0[j],temp3);
vstream(out1[j],temp4);
}
/*****************************************************/
@ -266,41 +274,16 @@ public:
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
}
std::vector<int> same_node;
std::vector<int> surface_list;
WilsonStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
same_node(npoints)
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
{
ZeroCountersi();
surface_list.resize(0);
};
void BuildSurfaceList(int Ls,int vol4){
  // Record, for every stencil point, whether its neighbour is on the same
  // node (for SHM). The hop distance is known to be 1 for WilsonStencil.
  for(int pt=0; pt<this->_npoints; ++pt){
    same_node[pt] = this->SameNode(pt);
  }

  // A 4d site goes on the surface list when at least one stencil point is
  // neither node-local nor on the same node. Every point is still queried.
  for(int s4=0; s4<vol4; ++s4){
    bool onSurface = false;
    for(int pt=0; pt<this->_npoints; ++pt){
      if( !this->GetNodeLocal(s4*Ls,pt) ){
        if( !same_node[pt] ){
          onSurface = true;
        }
      }
    }
    if( onSurface ){
      surface_list.push_back(s4);
    }
  }
}
template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
@ -361,23 +344,23 @@ public:
int dag = compress.dag;
int face_idx=0;
if ( dag ) {
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
} else {
assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);

View File

@ -348,15 +348,98 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
}
};
}
/*Change starts*/
// Dispatcher for the Wilson hopping term.
// Validates dag, then selects the implementation:
//   - with GRID_OMP defined and WilsonKernelsStatic::Comms set to
//     CommsAndCompute, calls DhopInternalOverlappedComms;
//   - otherwise (including every build without GRID_OMP) calls
//     DhopInternalSerial.
// All arguments are forwarded unchanged.
template <class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
#endif
// Bound to the `else` above when GRID_OMP is defined; unconditional otherwise.
DhopInternalSerial(st,lo,U,in,out,dag);
}
// Hopping term with halo communication performed by dedicated threads inside
// an OpenMP parallel region while the remaining threads compute.
// Only implemented when GRID_OMP is defined; otherwise it asserts.
//   st, lo  - stencil and Lebesgue ordering
//   U       - doubled gauge field; its grid's oSites() sets the site count
//   in, out - input and output fermion fields
//   dag     - DaggerNo or DaggerYes; selects DhopSite vs DhopSiteDag
// NOTE(review): several things in this body look unfinished -- see the inline
// notes on the compressor construction, the shadowed compressor and the second
// full-lattice pass.
template <class Impl>
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
#ifdef GRID_OMP
// NOTE(review): default-constructed here, whereas the compressor declared
// further down in this function is built with (dag) -- confirm which is intended.
Compressor compressor;
int len = U._grid->oSites();
// LLs is not referenced again in this function.
const int LLs = 1;
st.Prepare();
st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
#pragma omp parallel
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
// A setting of -1 is treated as one communication thread.
if (ncomms == -1) ncomms = 1;
// At least one thread must remain for compute.
assert(nthreads > ncomms);
if (tid >= ncomms) {
// Compute thread: split the len sites into contiguous blocks over the
// (nthreads - ncomms) compute threads; the first `rem` threads take one
// extra site each.
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
// NOTE(review): DhopSite/DhopSiteDag are called without interior/exterior
// arguments while communication may still be in flight on the other
// threads -- confirm that reading st.CommBuf() here is safe.
if (dag == DaggerYes) {
for (int sss = myblock; sss < myblock+myn; ++sss) {
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} else {
for (int sss = myblock; sss < myblock+myn; ++sss) {
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} //else
} else {
// Communication thread.
st.CommunicateThreaded();
}
// NOTE(review): this declaration shadows the outer `compressor` and is not
// referenced by the loops that follow.
Compressor compressor(dag);
// NOTE(review): the loops below are still inside the omp parallel region and
// cover every site again, so each thread reaches them after its branch above.
// No explicit barrier separates them from the communication branch. This
// looks like a redundant second pass -- confirm intent (parallel_for is
// defined elsewhere).
if (dag == DaggerYes) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} else {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
}
} //pragma
#else
assert(0);
#endif
};
template <class Impl>
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in, compressor);
@ -370,6 +453,7 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
}
}
};
/*Change ends */
/*******************************************************************************
* Conserved current utilities for Wilson fermions, for contracting propagators
@ -419,7 +503,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
conformable(_grid, q_in._grid);
conformable(_grid, q_out._grid);
Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
Complex i(0.0,1.0);
ComplexD i(0.0,1.0);
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
@ -431,7 +515,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
LatticeCoordinate(coor, mu);
ph = ph + mom[mu]*coor*((1./(_grid->_fdimensions[mu])));
}
ph = exp((Real)(2*M_PI)*i*ph);
ph = exp((RealD)(2*M_PI)*i*ph);
q_out = zero;
LatticeInteger coords(_grid);

View File

@ -130,6 +130,12 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
// Constructor
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
@ -145,6 +151,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
// protected:
public:
virtual RealD Mass(void) { return mass; }
virtual int isTrivialEE(void) { return 1; };
RealD mass;
RealD diag_mass;

View File

@ -445,8 +445,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
}
}
ptime = usecond() - start;
}
{
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
@ -793,7 +792,7 @@ void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
Lattice<iSinglet<Simd>> ph(FermionGrid()), coor(FermionGrid());
PropagatorField tmpFwd(FermionGrid()), tmpBwd(FermionGrid()),
tmp(FermionGrid());
Complex i(0.0, 1.0);
ComplexD i(0.0, 1.0);
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLs = q_in._grid->_rdimensions[0];
unsigned int LLt = GridDefaultLatt()[Tp];
@ -806,7 +805,7 @@ void WilsonFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
LatticeCoordinate(coor, nu + 1);
ph = ph + mom[nu]*coor*((1./(_FourDimGrid->_fdimensions[nu])));
}
ph = exp((Real)(2*M_PI)*i*ph);
ph = exp((RealD)(2*M_PI)*i*ph);
q_out = zero;
LatticeInteger coords(_FourDimGrid);

View File

@ -53,9 +53,9 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
typedef FermionOperator<Impl> Base;
public:
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
{
@ -70,27 +70,27 @@ public:
break;
#endif
case OptHandUnroll:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
break;
case OptGeneric:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
else assert(0);
sF++;
}
sU++;
}
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
else assert(0);
sF++;
}
sU++;
}
break;
default:
assert(0);
@ -99,7 +99,7 @@ public:
}
template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
// no kernel choice
@ -116,7 +116,7 @@ public:
}
template <bool EnableBool = true>
typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
{
@ -161,7 +161,7 @@ public:
}
template <bool EnableBool = true>
typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type
typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
@ -232,6 +232,7 @@ private:
void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);

View File

@ -30,181 +30,60 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define REGISTER
// Copy the twelve components ref(F)(s)(c), s = 0..3, c = 0..2, into the
// locals Chimu_sc (presumably s is the spin index and c the colour index --
// confirm against the SiteSpinor layout).
// F is pasted verbatim as the first index of `ref`; callers pass it empty to
// get ref(), or 0/1 in the G-parity specialisations.
// Expects `ref` and the Chimu_* variables to be in scope at the expansion
// site. The last statement has no trailing ';' -- the caller supplies it.
#define LOAD_CHIMU_BODY(F) \
Chimu_00=ref(F)(0)(0); \
Chimu_01=ref(F)(0)(1); \
Chimu_02=ref(F)(0)(2); \
Chimu_10=ref(F)(1)(0); \
Chimu_11=ref(F)(1)(1); \
Chimu_12=ref(F)(1)(2); \
Chimu_20=ref(F)(2)(0); \
Chimu_21=ref(F)(2)(1); \
Chimu_22=ref(F)(2)(2); \
Chimu_30=ref(F)(3)(0); \
Chimu_31=ref(F)(3)(1); \
Chimu_32=ref(F)(3)(2)
// Argument-less form: bind `ref` to in._odata[offset] and copy its twelve
// ref()(s)(c) components (s = 0..3, c = 0..2) into the locals Chimu_sc.
// The whole expansion is wrapped in braces so `ref` does not leak into the
// surrounding scope. Expects `in`, `offset` and the Chimu_* variables to be
// declared where the macro is expanded.
#define LOAD_CHIMU \
{const SiteSpinor & ref (in._odata[offset]); \
Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\
Chimu_02=ref()(0)(2);\
Chimu_10=ref()(1)(0);\
Chimu_11=ref()(1)(1);\
Chimu_12=ref()(1)(2);\
Chimu_20=ref()(2)(0);\
Chimu_21=ref()(2)(1);\
Chimu_22=ref()(2)(2);\
Chimu_30=ref()(3)(0);\
Chimu_31=ref()(3)(1);\
Chimu_32=ref()(3)(2);}
// Three-argument form: bind `ref` to in._odata[offset] inside a braced scope
// and load the Chimu_* locals via LOAD_CHIMU_BODY(F).
// DIR and PERM are accepted but not used here; they keep the call shape
// identical to the G-parity loader so either can be passed as the
// LOAD_CHIMU_IMPL argument of the stencil-leg macros.
#define LOAD_CHIMU(DIR,F,PERM) \
{ const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
// Copy the six components ref(F)(s)(c), s = 0..1, c = 0..2, into the locals
// Chi_sc. F is pasted verbatim as the first index of `ref` (empty or 0/1).
// Expects `ref` and the Chi_* variables to be in scope; the final statement
// has no trailing ';' so the caller must add one.
#define LOAD_CHI_BODY(F) \
Chi_00 = ref(F)(0)(0);\
Chi_01 = ref(F)(0)(1);\
Chi_02 = ref(F)(0)(2);\
Chi_10 = ref(F)(1)(0);\
Chi_11 = ref(F)(1)(1);\
Chi_12 = ref(F)(1)(2)
// Three-argument form: bind `ref` to the half spinor buf[offset] inside a
// braced scope and load the Chi_* locals via LOAD_CHI_BODY(F).
// DIR and PERM are unused here; they exist so this macro is call-compatible
// with the G-parity loader when passed as LOAD_CHI_IMPL.
#define LOAD_CHI(DIR,F,PERM) \
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
//G-parity implementations using in-place intrinsic ops
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
//0h,1l -> 1l,0h
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
//Pulled fermion through forwards face, GPBC on upper component
//Need 0= 0l 1h 1= 1l 0h
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
//Pulled fermion through backwards face, GPBC on lower component
//Need 0= 1l 0h 1= 0l 1h
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
// Build one twisted component INTO from the two first-index entries of `ref`:
//   tmp1 <- permute<PERM>( ref(1)(S)(C) )
//   (tmp2, tmp3) <- exchange<PERM>( ref(0)(S)(C), tmp1 )
//   INTO <- tmp2
// PERM is token-pasted onto `permute` / `exchange`, so it must be a literal
// suffix (the stencil legs pass 0..3). tmp3 receives the second output of the
// exchange and is discarded; F is accepted but unused.
// tmp1..tmp3 are caller-supplied scratch variables and are overwritten.
// NOTE(review): multi-statement macro ending in ';' -- only safe inside braces.
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(1)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
INTO = tmp2;
//0l 0h -> 0h 0l
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
// Mirror image of DO_TWIST_0L_1H with the roles of ref(0) and ref(1) swapped:
//   tmp1 <- permute<PERM>( ref(0)(S)(C) )
//   (tmp2, tmp3) <- exchange<PERM>( ref(1)(S)(C), tmp1 )
//   INTO <- tmp2
// PERM is token-pasted onto `permute` / `exchange`; tmp3 is written and then
// ignored; F is accepted but unused. tmp1..tmp3 are clobbered scratch.
// NOTE(review): multi-statement macro ending in ';' -- only safe inside braces.
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(0)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
INTO = tmp2;
// Decide how the G-parity loaders should fetch index F for stencil leg DIR.
// Writes the caller-declared locals g, direction, distance, sl, inplace_twist:
//   g             starts as F;
//   direction     = st._directions[DIR];
//   distance      = st._distances[DIR];
//   sl            = st._grid->_simd_layout[direction];
//   inplace_twist starts as 0.
// If the stencil entry SE is flagged _around_the_world and
// this->Params.twists[DIR % 4] is set, then either
//   - sl == 1: g is switched to the other index, (F+1) % 2, or
//   - otherwise: inplace_twist is set to 1 so the loader uses DO_TWIST_*.
// Uses `this`, so it can only be expanded inside a member function; also
// needs `st` and `SE` in scope.
#define LOAD_CHI_SETUP(DIR,F) \
g = F; \
direction = st._directions[DIR]; \
distance = st._distances[DIR]; \
sl = st._grid->_simd_layout[direction]; \
inplace_twist = 0; \
if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \
if(sl == 1){ \
g = (F+1) % 2; \
}else{ \
inplace_twist = 1; \
} \
}
// G-parity load of the twelve Chimu_* locals from in._odata[offset].
// LOAD_CHI_SETUP(DIR,F) first computes g, distance and inplace_twist.
//   - inplace_twist == 0: plain load of index g via LOAD_CHIMU_BODY(g).
//   - inplace_twist != 0: every component is assembled with DO_TWIST_0L_1H or
//     DO_TWIST_1L_0H. The 0L_1H variant is chosen when
//       F==0 and (distance== 1 xor-like with perm), or
//       F==1 and (distance==-1 xor-like with perm),
//     exactly as spelled out in the condition below; otherwise 1L_0H is used.
// Note the two different "perm"s: PERM (macro argument) is the literal suffix
// pasted onto permute/exchange, while lower-case `perm` is the runtime flag
// read from the stencil entry by the calling macro.
// The U_* link variables are borrowed as scratch for the twists, alternating
// between (U_00,U_01,U_10) and (U_11,U_20,U_21); their previous contents are
// overwritten.
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteSpinor &ref(in._odata[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHIMU_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
// G-parity load of the six Chi_* locals from the half-spinor buffer
// buf[offset]. LOAD_CHI_SETUP(DIR,F) computes g, distance and inplace_twist.
//   - inplace_twist == 0: plain load of index g via LOAD_CHI_BODY(g).
//   - inplace_twist != 0: each component is assembled with DO_TWIST_0L_1H or
//     DO_TWIST_1L_0H, selected by the F / distance / perm condition below
//     (same condition as the full-spinor loader).
// PERM (macro argument) is the literal suffix pasted onto permute/exchange;
// lower-case `perm` is the runtime flag set by the calling stencil-leg macro.
// Scratch registers: the U_* link variables and, for Chi_02 / Chi_10, the
// UChi_* result variables are used as temporaries and are overwritten.
// NOTE(review): presumably safe because MULT_2SPIN reloads U_* and assigns
// UChi_* afterwards -- confirm at each call site.
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteHalfSpinor &ref(buf[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHI_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
// Public names for the G-parity loaders: both simply forward to the
// in-place-twist implementations with the same (DIR,F,PERM) arguments.
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
// Argument-less form: bind `ref` to the half spinor buf[offset] and copy its
// six ref()(s)(c) components (s = 0..1, c = 0..2) into the locals Chi_sc.
// Wrapped in braces so `ref` stays local to the expansion. Expects `buf`,
// `offset` and the Chi_* variables to be declared at the expansion site.
#define LOAD_CHI\
{const SiteHalfSpinor &ref(buf[offset]); \
Chi_00 = ref()(0)(0);\
Chi_01 = ref()(0)(1);\
Chi_02 = ref()(0)(2);\
Chi_10 = ref()(1)(0);\
Chi_11 = ref()(1)(1);\
Chi_12 = ref()(1)(2);}
// To splat or not to splat depends on the implementation
// Hand-unrolled 3x3 matrix times two 3-vectors:
//   UChi_sc = sum_{j=0..2} ref()(c,j) * Chi_sj      for s = 0,1 and c = 0..2.
// Link elements are fetched with Impl::loadLinkElement. Only six U_* holders
// are used: columns 0 and 1 are loaded first (U_c0, U_c1), then U_00/U_10/U_20
// are reloaded with column 2 (ref()(c,2)) for the final accumulation.
// Expects `ref`, `Impl`, and the U_*, Chi_*, UChi_* variables in scope.
// The final statement has no trailing ';' -- the caller supplies it.
#define MULT_2SPIN_BODY \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00; \
UChi_10 = U_00*Chi_10; \
UChi_01 = U_10*Chi_00; \
UChi_11 = U_10*Chi_10; \
UChi_02 = U_20*Chi_00; \
UChi_12 = U_20*Chi_10; \
UChi_00+= U_01*Chi_01; \
UChi_10+= U_01*Chi_11; \
UChi_01+= U_11*Chi_01; \
UChi_11+= U_11*Chi_11; \
UChi_02+= U_21*Chi_01; \
UChi_12+= U_21*Chi_11; \
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02; \
UChi_10+= U_00*Chi_12; \
UChi_01+= U_10*Chi_02; \
UChi_11+= U_10*Chi_12; \
UChi_02+= U_20*Chi_02; \
UChi_12+= U_20*Chi_12
// Bind `ref` to link A of the gauge site U._odata[sU] and run
// MULT_2SPIN_BODY on it inside a braced scope.
// F is accepted but unused; it keeps the call shape identical to
// MULT_2SPIN_GPARITY so either can be passed as MULT_2SPIN_IMPL.
#define MULT_2SPIN(A,F) \
{auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
// G-parity variant: the gauge site carries an extra leading index, so `ref`
// is bound to U._odata[sU](F)(A) before running MULT_2SPIN_BODY.
#define MULT_2SPIN_GPARITY(A,F) \
{auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
// Single-argument form with the multiply written inline.
// Binds `ref` to link A of U._odata[sU] and computes
//   UChi_sc = sum_{j=0..2} ref()(c,j) * Chi_sj      for s = 0,1 and c = 0..2.
// Columns 0 and 1 of the link are loaded into U_c0 / U_c1 first; U_00, U_10,
// U_20 are then reloaded with column 2 for the last accumulation step.
// Expects `U`, `sU`, `Impl` and the U_*, Chi_*, UChi_* variables in scope;
// the braces keep `ref` local to the expansion.
#define MULT_2SPIN(A)\
{auto & ref(U._odata[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00;\
UChi_10 = U_00*Chi_10;\
UChi_01 = U_10*Chi_00;\
UChi_11 = U_10*Chi_10;\
UChi_02 = U_20*Chi_00;\
UChi_12 = U_20*Chi_10;\
UChi_00+= U_01*Chi_01;\
UChi_10+= U_01*Chi_11;\
UChi_01+= U_11*Chi_01;\
UChi_11+= U_11*Chi_11;\
UChi_02+= U_21*Chi_01;\
UChi_12+= U_21*Chi_11;\
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02;\
UChi_10+= U_00*Chi_12;\
UChi_01+= U_10*Chi_02;\
UChi_11+= U_10*Chi_12;\
UChi_02+= U_20*Chi_02;\
UChi_12+= U_20*Chi_12;}
#define PERMUTE_DIR(dir) \
@ -428,87 +307,84 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
result_31-= UChi_11; \
result_32-= UChi_12;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
LOAD_CHIMU; \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI_IMPL(DIR,F,PERM); \
LOAD_CHI; \
} \
MULT_2SPIN_IMPL(DIR,F); \
MULT_2SPIN(DIR); \
RECON;
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
LOAD_CHIMU; \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
LOAD_CHI; \
} \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN_IMPL(DIR,F); \
MULT_2SPIN(DIR); \
RECON; \
}
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
MULT_2SPIN_IMPL(DIR,F); \
LOAD_CHI; \
MULT_2SPIN(DIR); \
RECON; \
nmu++; \
}
#define HAND_RESULT(ss,F) \
#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out._odata[ss]); \
vstream(ref(F)(0)(0),result_00); \
vstream(ref(F)(0)(1),result_01); \
vstream(ref(F)(0)(2),result_02); \
vstream(ref(F)(1)(0),result_10); \
vstream(ref(F)(1)(1),result_11); \
vstream(ref(F)(1)(2),result_12); \
vstream(ref(F)(2)(0),result_20); \
vstream(ref(F)(2)(1),result_21); \
vstream(ref(F)(2)(2),result_22); \
vstream(ref(F)(3)(0),result_30); \
vstream(ref(F)(3)(1),result_31); \
vstream(ref(F)(3)(2),result_32); \
vstream(ref()(0)(0),result_00); \
vstream(ref()(0)(1),result_01); \
vstream(ref()(0)(2),result_02); \
vstream(ref()(1)(0),result_10); \
vstream(ref()(1)(1),result_11); \
vstream(ref()(1)(2),result_12); \
vstream(ref()(2)(0),result_20); \
vstream(ref()(2)(1),result_21); \
vstream(ref()(2)(2),result_22); \
vstream(ref()(3)(0),result_30); \
vstream(ref()(3)(1),result_31); \
vstream(ref()(3)(2),result_32); \
}
#define HAND_RESULT_EXT(ss,F) \
#define HAND_RESULT_EXT(ss) \
if (nmu){ \
SiteSpinor & ref (out._odata[ss]); \
ref(F)(0)(0)+=result_00; \
ref(F)(0)(1)+=result_01; \
ref(F)(0)(2)+=result_02; \
ref(F)(1)(0)+=result_10; \
ref(F)(1)(1)+=result_11; \
ref(F)(1)(2)+=result_12; \
ref(F)(2)(0)+=result_20; \
ref(F)(2)(1)+=result_21; \
ref(F)(2)(2)+=result_22; \
ref(F)(3)(0)+=result_30; \
ref(F)(3)(1)+=result_31; \
ref(F)(3)(2)+=result_32; \
ref()(0)(0)+=result_00; \
ref()(0)(1)+=result_01; \
ref()(0)(2)+=result_02; \
ref()(1)(0)+=result_10; \
ref()(1)(1)+=result_11; \
ref()(1)(2)+=result_12; \
ref()(2)(0)+=result_20; \
ref()(2)(1)+=result_21; \
ref()(2)(2)+=result_22; \
ref()(3)(0)+=result_30; \
ref()(3)(1)+=result_31; \
ref()(3)(2)+=result_32; \
}
@ -587,18 +463,15 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
int offset,local,perm, ptype;
StencilEntry *SE;
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl>
@ -612,19 +485,16 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
StencilEntry *SE;
int offset,local,perm, ptype;
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl> void
@ -639,20 +509,16 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
int offset,local,perm, ptype;
StencilEntry *SE;
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl>
@ -666,20 +532,16 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
StencilEntry *SE;
int offset,local,perm, ptype;
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl> void
@ -695,20 +557,16 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
int offset,local,perm, ptype;
StencilEntry *SE;
int nmu=0;
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT_EXT(ss);
}
template<class Impl>
@ -723,193 +581,18 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
StencilEntry *SE;
int offset,local,perm, ptype;
int nmu=0;
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT_EXT(ss);
}
////////////////////////////////////////////////
// Specialise Gparity to simple implementation
////////////////////////////////////////////////
#define HAND_SPECIALISE_EMPTY(IMPL) \
template<> void \
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
template<> void \
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st, \
LebesgueOrder &lo, \
DoubledGaugeField &U, \
SiteHalfSpinor *buf, \
int sF,int sU, \
const FermionField &in, \
FermionField &out){ assert(0); } \
#define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
int nmu=0; \
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
int nmu=0; \
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
}
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
////////////// Wilson ; uses this implementation /////////////////////
#define INSTANTIATE_THEM(A) \
@ -930,8 +613,6 @@ INSTANTIATE_THEM(WilsonImplF);
INSTANTIATE_THEM(WilsonImplD);
INSTANTIATE_THEM(ZWilsonImplF);
INSTANTIATE_THEM(ZWilsonImplD);
INSTANTIATE_THEM(GparityWilsonImplF);
INSTANTIATE_THEM(GparityWilsonImplD);
INSTANTIATE_THEM(DomainWallVec5dImplF);
INSTANTIATE_THEM(DomainWallVec5dImplD);
INSTANTIATE_THEM(ZDomainWallVec5dImplF);
@ -940,12 +621,11 @@ INSTANTIATE_THEM(WilsonImplFH);
INSTANTIATE_THEM(WilsonImplDF);
INSTANTIATE_THEM(ZWilsonImplFH);
INSTANTIATE_THEM(ZWilsonImplDF);
INSTANTIATE_THEM(GparityWilsonImplFH);
INSTANTIATE_THEM(GparityWilsonImplDF);
INSTANTIATE_THEM(DomainWallVec5dImplFH);
INSTANTIATE_THEM(DomainWallVec5dImplDF);
INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
}}

View File

@ -0,0 +1,878 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#define REGISTER
// Copy all twelve spin (0..3) x colour (0..2) components of `ref` into the
// temporaries Chimu_<spin><colour>.
// F is the leading index applied to `ref`: the plain Wilson kernels below pass an
// empty argument (giving ref()), the G-parity loaders pass a flavour index.
// Requires `ref` to be in scope at the expansion site.
#define LOAD_CHIMU_BODY(F) \
Chimu_00=ref(F)(0)(0); \
Chimu_01=ref(F)(0)(1); \
Chimu_02=ref(F)(0)(2); \
Chimu_10=ref(F)(1)(0); \
Chimu_11=ref(F)(1)(1); \
Chimu_12=ref(F)(1)(2); \
Chimu_20=ref(F)(2)(0); \
Chimu_21=ref(F)(2)(1); \
Chimu_22=ref(F)(2)(2); \
Chimu_30=ref(F)(3)(0); \
Chimu_31=ref(F)(3)(1); \
Chimu_32=ref(F)(3)(2)
// Bind `ref` to the full spinor in._odata[offset] and load it into Chimu_*.
// DIR and PERM are not used in the body; the parameter list matches the
// G-parity loader so either can be passed as LOAD_CHIMU_IMPL.
// Requires `in` and `offset` in scope.
#define LOAD_CHIMU(DIR,F,PERM) \
{ const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
// Copy the six spin (0..1) x colour (0..2) components of the half spinor `ref`
// into Chi_<spin><colour>. F is the leading index applied to `ref` (may be empty).
// Requires `ref` to be in scope at the expansion site.
#define LOAD_CHI_BODY(F) \
Chi_00 = ref(F)(0)(0);\
Chi_01 = ref(F)(0)(1);\
Chi_02 = ref(F)(0)(2);\
Chi_10 = ref(F)(1)(0);\
Chi_11 = ref(F)(1)(1);\
Chi_12 = ref(F)(1)(2)
// Bind `ref` to the half spinor buf[offset] and load it into Chi_*.
// DIR and PERM are not used in the body; the parameter list matches the
// G-parity loader so either can be passed as LOAD_CHI_IMPL.
// Requires `buf` and `offset` in scope.
#define LOAD_CHI(DIR,F,PERM) \
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
//G-parity implementations using in-place intrinsic ops
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
//0h,1l -> 1l,0h
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
//Pulled fermion through forwards face, GPBC on upper component
//Need 0= 0l 1h 1= 1l 0h
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
//Pulled fermion through backwards face, GPBC on lower component
//Need 0= 1l 0h 1= 0l 1h
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
// In-register flavour twist for spin S, colour C of `ref`:
//   tmp1        = permute##PERM( ref(1)(S)(C) )
//   (tmp2,tmp3) = exchange##PERM( ref(0)(S)(C), tmp1 )
//   INTO        = tmp2
// tmp1..tmp3 are caller-supplied scratch registers and are clobbered.
// The F parameter is not used in the body.
// NOTE: expands to three bare statements (no enclosing braces); only use where a
// statement sequence is valid.
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(1)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
INTO = tmp2;
//0l 0h -> 0h 0l
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
// In-register flavour twist for spin S, colour C of `ref`, with the roles of the
// two flavours swapped relative to DO_TWIST_0L_1H:
//   tmp1        = permute##PERM( ref(0)(S)(C) )
//   (tmp2,tmp3) = exchange##PERM( ref(1)(S)(C), tmp1 )
//   INTO        = tmp2
// tmp1..tmp3 are caller-supplied scratch registers and are clobbered.
// The F parameter is not used in the body.
// NOTE: expands to three bare statements (no enclosing braces).
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(0)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
INTO = tmp2;
// Per-leg G-parity setup. Assigns the caller's int locals:
//   g              flavour index to load, initially F
//   direction      st._directions[DIR]
//   distance       st._distances[DIR]
//   sl             st._grid->_simd_layout[direction]
//   inplace_twist  0 by default
// If the stencil entry SE is flagged _around_the_world and
// this->Params.twists[DIR % 4] is set, then:
//   sl == 1  -> the other flavour is selected directly, g = (F+1) % 2
//   sl != 1  -> inplace_twist = 1, and the caller performs the twist with the
//               DO_TWIST_* macros.
// Requires st, SE and the five int locals above in scope; must be expanded
// inside a member function (uses this->Params).
#define LOAD_CHI_SETUP(DIR,F) \
g = F; \
direction = st._directions[DIR]; \
distance = st._distances[DIR]; \
sl = st._grid->_simd_layout[direction]; \
inplace_twist = 0; \
if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \
if(sl == 1){ \
g = (F+1) % 2; \
}else{ \
inplace_twist = 1; \
} \
}
// G-parity aware load of the full spinor in._odata[offset] into Chimu_*.
// Runs LOAD_CHI_SETUP first. Without an in-place twist it is a plain
// LOAD_CHIMU_BODY on flavour g. With an in-place twist, each of the twelve
// components is built by DO_TWIST_0L_1H or DO_TWIST_1L_0H; the choice depends on
// F, the sign of `distance`, and the runtime `perm` flag.
// Note the distinction: PERM is the compile-time token pasted into
// permute##PERM / exchange##PERM, whereas `perm` in the condition is the caller's
// runtime variable.
// The link registers U_00..U_21 are used as scratch for the twist and are
// clobbered; MULT_2SPIN_BODY reloads them afterwards.
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteSpinor &ref(in._odata[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHIMU_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
// G-parity aware load of the half spinor buf[offset] into Chi_*.
// Same structure as LOAD_CHIMU_GPARITY_INPLACE_TWIST but for the six half-spinor
// components. The twist uses both the link registers U_* and the UChi_*
// registers as scratch; both sets are overwritten by the MULT_2SPIN step that
// follows in the stencil-leg macros.
// PERM is the compile-time permute index; `perm` is the caller's runtime flag.
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteHalfSpinor &ref(buf[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHI_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
// Names used by the G-parity specialisations; both forward to the in-place
// twist implementations above.
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
// To splat or not to splat depends on the implementation
// Multiply the half spinor Chi by the 3x3 link matrix `ref()`:
//   UChi_sc = sum_{c'} ref()(c,c') * Chi_sc'     for s = 0,1 and c = 0..2
// Only six link registers are used: columns 0 and 1 are loaded into
// U_00,U_10,U_20 and U_01,U_11,U_21, their contributions accumulated, then
// U_00,U_10,U_20 are reloaded with column 2 for the final accumulation.
// Link elements are fetched through Impl::loadLinkElement.
// Requires `ref` (callable with no arguments) in scope.
#define MULT_2SPIN_BODY \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00; \
UChi_10 = U_00*Chi_10; \
UChi_01 = U_10*Chi_00; \
UChi_11 = U_10*Chi_10; \
UChi_02 = U_20*Chi_00; \
UChi_12 = U_20*Chi_10; \
UChi_00+= U_01*Chi_01; \
UChi_10+= U_01*Chi_11; \
UChi_01+= U_11*Chi_01; \
UChi_11+= U_11*Chi_11; \
UChi_02+= U_21*Chi_01; \
UChi_12+= U_21*Chi_11; \
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02; \
UChi_10+= U_00*Chi_12; \
UChi_01+= U_10*Chi_02; \
UChi_11+= U_10*Chi_12; \
UChi_02+= U_20*Chi_02; \
UChi_12+= U_20*Chi_12
// Link multiply for the non-G-parity case: binds `ref` to U._odata[sU](A) and
// runs MULT_2SPIN_BODY. F is not used in the body.
// Requires U and sU in scope.
#define MULT_2SPIN(A,F) \
{auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
// Link multiply for the G-parity case: the link is additionally indexed by
// flavour, ref = U._odata[sU](F)(A), then MULT_2SPIN_BODY is run.
// Requires U and sU in scope.
#define MULT_2SPIN_GPARITY(A,F) \
{auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
// Apply permute<dir> in place to each of the six Chi_* registers.
// `dir` is pasted onto the function name, so it must be a literal token
// (the stencil legs below pass 0..3).
#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00);\
permute##dir(Chi_01,Chi_01);\
permute##dir(Chi_02,Chi_02);\
permute##dir(Chi_10,Chi_10);\
permute##dir(Chi_11,Chi_11);\
permute##dir(Chi_12,Chi_12);
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
// X+ spin projection, for each colour c:
//   Chi_0c = Chimu_0c + timesI(Chimu_3c)
//   Chi_1c = Chimu_1c + timesI(Chimu_2c)
#define XP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_30);\
Chi_01 = Chimu_01+timesI(Chimu_31);\
Chi_02 = Chimu_02+timesI(Chimu_32);\
Chi_10 = Chimu_10+timesI(Chimu_20);\
Chi_11 = Chimu_11+timesI(Chimu_21);\
Chi_12 = Chimu_12+timesI(Chimu_22);
// Y+ spin projection, for each colour c:
//   Chi_0c = Chimu_0c - Chimu_3c
//   Chi_1c = Chimu_1c + Chimu_2c
#define YP_PROJ \
Chi_00 = Chimu_00-Chimu_30;\
Chi_01 = Chimu_01-Chimu_31;\
Chi_02 = Chimu_02-Chimu_32;\
Chi_10 = Chimu_10+Chimu_20;\
Chi_11 = Chimu_11+Chimu_21;\
Chi_12 = Chimu_12+Chimu_22;
// Z+ spin projection, for each colour c:
//   Chi_0c = Chimu_0c + timesI(Chimu_2c)
//   Chi_1c = Chimu_1c - timesI(Chimu_3c)
#define ZP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_20); \
Chi_01 = Chimu_01+timesI(Chimu_21); \
Chi_02 = Chimu_02+timesI(Chimu_22); \
Chi_10 = Chimu_10-timesI(Chimu_30); \
Chi_11 = Chimu_11-timesI(Chimu_31); \
Chi_12 = Chimu_12-timesI(Chimu_32);
// T+ spin projection, for each colour c:
//   Chi_0c = Chimu_0c + Chimu_2c
//   Chi_1c = Chimu_1c + Chimu_3c
#define TP_PROJ \
Chi_00 = Chimu_00+Chimu_20; \
Chi_01 = Chimu_01+Chimu_21; \
Chi_02 = Chimu_02+Chimu_22; \
Chi_10 = Chimu_10+Chimu_30; \
Chi_11 = Chimu_11+Chimu_31; \
Chi_12 = Chimu_12+Chimu_32;
// hspin(0)=fspin(0)-timesI(fspin(3));
// hspin(1)=fspin(1)-timesI(fspin(2));
// X- spin projection, for each colour c:
//   Chi_0c = Chimu_0c - timesI(Chimu_3c)
//   Chi_1c = Chimu_1c - timesI(Chimu_2c)
#define XM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_30);\
Chi_01 = Chimu_01-timesI(Chimu_31);\
Chi_02 = Chimu_02-timesI(Chimu_32);\
Chi_10 = Chimu_10-timesI(Chimu_20);\
Chi_11 = Chimu_11-timesI(Chimu_21);\
Chi_12 = Chimu_12-timesI(Chimu_22);
// Y- spin projection, for each colour c:
//   Chi_0c = Chimu_0c + Chimu_3c
//   Chi_1c = Chimu_1c - Chimu_2c
#define YM_PROJ \
Chi_00 = Chimu_00+Chimu_30;\
Chi_01 = Chimu_01+Chimu_31;\
Chi_02 = Chimu_02+Chimu_32;\
Chi_10 = Chimu_10-Chimu_20;\
Chi_11 = Chimu_11-Chimu_21;\
Chi_12 = Chimu_12-Chimu_22;
// Z- spin projection, for each colour c:
//   Chi_0c = Chimu_0c - timesI(Chimu_2c)
//   Chi_1c = Chimu_1c + timesI(Chimu_3c)
#define ZM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_20); \
Chi_01 = Chimu_01-timesI(Chimu_21); \
Chi_02 = Chimu_02-timesI(Chimu_22); \
Chi_10 = Chimu_10+timesI(Chimu_30); \
Chi_11 = Chimu_11+timesI(Chimu_31); \
Chi_12 = Chimu_12+timesI(Chimu_32);
// T- spin projection, for each colour c:
//   Chi_0c = Chimu_0c - Chimu_2c
//   Chi_1c = Chimu_1c - Chimu_3c
#define TM_PROJ \
Chi_00 = Chimu_00-Chimu_20; \
Chi_01 = Chimu_01-Chimu_21; \
Chi_02 = Chimu_02-Chimu_22; \
Chi_10 = Chimu_10-Chimu_30; \
Chi_11 = Chimu_11-Chimu_31; \
Chi_12 = Chimu_12-Chimu_32;
// fspin(0)=hspin(0);
// fspin(1)=hspin(1);
// fspin(2)=timesMinusI(hspin(1));
// fspin(3)=timesMinusI(hspin(0));
// X+ reconstruction, OVERWRITING result (no accumulation). For each colour c:
//   result_0c = UChi_0c              result_1c = UChi_1c
//   result_2c = timesMinusI(UChi_1c) result_3c = timesMinusI(UChi_0c)
#define XP_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesMinusI(UChi_10);\
result_21 = timesMinusI(UChi_11);\
result_22 = timesMinusI(UChi_12);\
result_30 = timesMinusI(UChi_00);\
result_31 = timesMinusI(UChi_01);\
result_32 = timesMinusI(UChi_02);
// X+ reconstruction, accumulating into result. For each colour c:
//   result_0c += UChi_0c          result_1c += UChi_1c
//   result_2c -= timesI(UChi_1c)  result_3c -= timesI(UChi_0c)
#define XP_RECON_ACCUM\
result_00+=UChi_00;\
result_01+=UChi_01;\
result_02+=UChi_02;\
result_10+=UChi_10;\
result_11+=UChi_11;\
result_12+=UChi_12;\
result_20-=timesI(UChi_10);\
result_21-=timesI(UChi_11);\
result_22-=timesI(UChi_12);\
result_30-=timesI(UChi_00);\
result_31-=timesI(UChi_01);\
result_32-=timesI(UChi_02);
// X- reconstruction, OVERWRITING result (no accumulation). For each colour c:
//   result_0c = UChi_0c         result_1c = UChi_1c
//   result_2c = timesI(UChi_1c) result_3c = timesI(UChi_0c)
#define XM_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesI(UChi_10);\
result_21 = timesI(UChi_11);\
result_22 = timesI(UChi_12);\
result_30 = timesI(UChi_00);\
result_31 = timesI(UChi_01);\
result_32 = timesI(UChi_02);
// X- reconstruction, accumulating into result. For each colour c:
//   result_0c += UChi_0c          result_1c += UChi_1c
//   result_2c += timesI(UChi_1c)  result_3c += timesI(UChi_0c)
#define XM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_10);\
result_21+= timesI(UChi_11);\
result_22+= timesI(UChi_12);\
result_30+= timesI(UChi_00);\
result_31+= timesI(UChi_01);\
result_32+= timesI(UChi_02);
// Y+ reconstruction, accumulating into result. For each colour c:
//   result_0c += UChi_0c   result_1c += UChi_1c
//   result_2c += UChi_1c   result_3c -= UChi_0c
#define YP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_10;\
result_21+= UChi_11;\
result_22+= UChi_12;\
result_30-= UChi_00;\
result_31-= UChi_01;\
result_32-= UChi_02;
// Y- reconstruction, accumulating into result. For each colour c:
//   result_0c += UChi_0c   result_1c += UChi_1c
//   result_2c -= UChi_1c   result_3c += UChi_0c
#define YM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_10;\
result_21-= UChi_11;\
result_22-= UChi_12;\
result_30+= UChi_00;\
result_31+= UChi_01;\
result_32+= UChi_02;
// Z+ reconstruction, accumulating into result. For each colour c:
//   result_0c += UChi_0c          result_1c += UChi_1c
//   result_2c -= timesI(UChi_0c)  result_3c += timesI(UChi_1c)
#define ZP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= timesI(UChi_00); \
result_21-= timesI(UChi_01); \
result_22-= timesI(UChi_02); \
result_30+= timesI(UChi_10); \
result_31+= timesI(UChi_11); \
result_32+= timesI(UChi_12);
// Z- reconstruction, accumulating into result. For each colour c:
//   result_0c += UChi_0c          result_1c += UChi_1c
//   result_2c += timesI(UChi_0c)  result_3c -= timesI(UChi_1c)
#define ZM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_00); \
result_21+= timesI(UChi_01); \
result_22+= timesI(UChi_02); \
result_30-= timesI(UChi_10); \
result_31-= timesI(UChi_11); \
result_32-= timesI(UChi_12);
// T+ reconstruction, accumulating into result. For each colour c:
//   result_0c += UChi_0c   result_1c += UChi_1c
//   result_2c += UChi_0c   result_3c += UChi_1c
#define TP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_00; \
result_21+= UChi_01; \
result_22+= UChi_02; \
result_30+= UChi_10; \
result_31+= UChi_11; \
result_32+= UChi_12;
// T- reconstruction, accumulating into result. For each colour c:
//   result_0c += UChi_0c   result_1c += UChi_1c
//   result_2c -= UChi_0c   result_3c -= UChi_1c
#define TM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_00; \
result_21-= UChi_01; \
result_22-= UChi_02; \
result_30-= UChi_10; \
result_31-= UChi_11; \
result_32-= UChi_12;
// One stencil leg of the hopping term, processed unconditionally.
//   PROJ             spin-projection macro (e.g. XM_PROJ)
//   PERM             literal permute index pasted into permute##PERM
//   DIR              stencil direction passed to st.GetEntry and the link multiply
//   RECON            reconstruction macro (overwrite or accumulate)
//   F                flavour index (empty for non-G-parity)
//   LOAD_CHI_IMPL / LOAD_CHIMU_IMPL / MULT_2SPIN_IMPL   implementation macros
// Steps: fetch the stencil entry; if the neighbour is local, load the full
// spinor from `in`, project it, and permute if SE->_permute is set; otherwise
// load the half spinor from `buf`. Then multiply by the link and reconstruct.
// Requires SE, offset, local, perm, ptype, st, ss in scope. Expands to bare
// statements (no enclosing braces).
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI_IMPL(DIR,F,PERM); \
} \
MULT_2SPIN_IMPL(DIR,F); \
RECON;
// "Interior" variant of HAND_STENCIL_LEG: the leg contributes only when the
// neighbour is local or st.same_node[DIR] is set. Local neighbours are loaded
// from `in`, projected and optionally permuted; same-node neighbours are loaded
// as half spinors from `buf`. Legs that are neither are skipped entirely (no
// multiply, no reconstruction), so callers must zero `result` first and pass an
// accumulating RECON.
// Parameters and scope requirements are as for HAND_STENCIL_LEG.
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
} \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN_IMPL(DIR,F); \
RECON; \
}
// "Exterior" variant of HAND_STENCIL_LEG: the complement of
// HAND_STENCIL_LEG_INT. The leg contributes only when the neighbour is neither
// local nor on the same node; the half spinor is loaded from `buf`, multiplied
// by the link and reconstructed, and the caller's counter `nmu` is incremented.
// PROJ and LOAD_CHIMU_IMPL are accepted for signature parity but not used;
// `local` and `perm` are assigned but not read in this macro itself.
// Requires nmu in scope in addition to the HAND_STENCIL_LEG requirements.
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
MULT_2SPIN_IMPL(DIR,F); \
RECON; \
nmu++; \
}
// Write the twelve result_* registers to out._odata[ss] (flavour index F, may
// be empty) using vstream, overwriting whatever was stored there.
// Requires `out` in scope.
#define HAND_RESULT(ss,F) \
{ \
SiteSpinor & ref (out._odata[ss]); \
vstream(ref(F)(0)(0),result_00); \
vstream(ref(F)(0)(1),result_01); \
vstream(ref(F)(0)(2),result_02); \
vstream(ref(F)(1)(0),result_10); \
vstream(ref(F)(1)(1),result_11); \
vstream(ref(F)(1)(2),result_12); \
vstream(ref(F)(2)(0),result_20); \
vstream(ref(F)(2)(1),result_21); \
vstream(ref(F)(2)(2),result_22); \
vstream(ref(F)(3)(0),result_30); \
vstream(ref(F)(3)(1),result_31); \
vstream(ref(F)(3)(2),result_32); \
}
// Add the twelve result_* registers INTO out._odata[ss] (flavour index F), but
// only if at least one exterior leg contributed (nmu != 0). Unlike HAND_RESULT
// this accumulates with += rather than overwriting.
// NOTE: expands to a bare `if (nmu){...}`; do not follow an expansion with `else`.
// Requires `out` and `nmu` in scope.
#define HAND_RESULT_EXT(ss,F) \
if (nmu){ \
SiteSpinor & ref (out._odata[ss]); \
ref(F)(0)(0)+=result_00; \
ref(F)(0)(1)+=result_01; \
ref(F)(0)(2)+=result_02; \
ref(F)(1)(0)+=result_10; \
ref(F)(1)(1)+=result_11; \
ref(F)(1)(2)+=result_12; \
ref(F)(2)(0)+=result_20; \
ref(F)(2)(1)+=result_21; \
ref(F)(2)(2)+=result_22; \
ref(F)(3)(0)+=result_30; \
ref(F)(3)(1)+=result_31; \
ref(F)(3)(2)+=result_32; \
}
// Declare the Simd working registers used by every hand kernel:
//   result_<s><c>  12 accumulators for the output spinor
//   Chi_<s><c>      6 half-spinor components
//   UChi_<s><c>     6 link-multiplied half-spinor components
//   U_<r><c>        6 link-matrix elements (two columns at a time)
// The macro argument `a` is not used. The registers are declared without
// initialisers.
#define HAND_DECLARATIONS(a) \
Simd result_00; \
Simd result_01; \
Simd result_02; \
Simd result_10; \
Simd result_11; \
Simd result_12; \
Simd result_20; \
Simd result_21; \
Simd result_22; \
Simd result_30; \
Simd result_31; \
Simd result_32; \
Simd Chi_00; \
Simd Chi_01; \
Simd Chi_02; \
Simd Chi_10; \
Simd Chi_11; \
Simd Chi_12; \
Simd UChi_00; \
Simd UChi_01; \
Simd UChi_02; \
Simd UChi_10; \
Simd UChi_11; \
Simd UChi_12; \
Simd U_00; \
Simd U_10; \
Simd U_20; \
Simd U_01; \
Simd U_11; \
Simd U_21;
// Assign `zero` to all twelve result_* accumulators. Used by the Int/Ext
// kernels, where every leg accumulates and some legs may be skipped.
#define ZERO_RESULT \
result_00=zero; \
result_01=zero; \
result_02=zero; \
result_10=zero; \
result_11=zero; \
result_12=zero; \
result_20=zero; \
result_21=zero; \
result_22=zero; \
result_30=zero; \
result_31=zero; \
result_32=zero;
// Register aliasing: the full-spinor names Chimu_* are not separate variables.
// Spin rows 0,1 map onto Chi_* and spin rows 2,3 map onto UChi_*. Consequences:
//  - LOAD_CHIMU_* writes into Chi_* and UChi_*;
//  - the *_PROJ macros update Chi_* in place, reading the upper rows from UChi_*;
//  - UChi_* is then free to be overwritten by MULT_2SPIN_BODY.
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
namespace Grid {
namespace QCD {
// Hand-unrolled hopping term for one site.
// Processes all eight stencil legs for site `ss` (gauge site `sU`), reading
// neighbours from `in` (local) or `buf` (non-local half spinors), and stores
// the result into out._odata[ss], overwriting it.
// `lo` is not referenced in the body.
template<class Impl> void
WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// Permute indices passed to the legs below: T==0, Z==1, Y==2, X==3; expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
// The first leg uses the overwriting XM_RECON to initialise result_*; the
// remaining seven accumulate. This macro stays defined after the function and
// is reused by HAND_SPECIALISE_GPARITY.
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
// Hand-unrolled hopping term for one site, daggered variant.
// Identical in structure to HandDhopSite but with the plus/minus projectors
// and reconstructions exchanged on every leg. Overwrites out._odata[ss].
// `lo` is not referenced in the body.
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
// The first leg uses the overwriting XP_RECON to initialise result_*; the
// remaining seven accumulate. Reused by HAND_SPECIALISE_GPARITY.
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
// Interior part of the hand-unrolled hopping term for one site.
// Only legs whose neighbour is local or on the same node contribute (see
// HAND_STENCIL_LEG_INT). result_* is zeroed first and every leg accumulates;
// the partial sum is then stored to out._odata[ss], overwriting it.
// `lo` is not referenced in the body.
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// Permute indices passed to the legs below: T==0, Z==1, Y==2, X==3; expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
// Reused by HAND_SPECIALISE_GPARITY.
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
// Interior part of the daggered hand-unrolled hopping term for one site.
// Same as HandDhopSiteInt with the plus/minus projectors and reconstructions
// exchanged on every leg. Overwrites out._odata[ss] with the partial sum.
// `lo` is not referenced in the body.
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
// Reused by HAND_SPECIALISE_GPARITY.
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
// Exterior part of the hand-unrolled hopping term for one site.
// Only legs whose neighbour is neither local nor on the same node contribute
// (see HAND_STENCIL_LEG_EXT). result_* is zeroed, contributing legs accumulate
// and bump `nmu`, and if nmu != 0 the sum is ADDED to out._odata[ss]
// (HAND_RESULT_EXT), so `out` is expected to already hold the interior part.
// `lo` and `in` are not referenced by the exterior legs.
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// Permute indices passed to the legs below: T==0, Z==1, Y==2, X==3; expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
// Number of exterior legs that contributed at this site.
int nmu=0;
// Reused by HAND_SPECIALISE_GPARITY.
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
// Exterior part of the daggered hand-unrolled hopping term for one site.
// Same as HandDhopSiteExt with the plus/minus projectors and reconstructions
// exchanged on every leg. Adds into out._odata[ss] only if some exterior leg
// contributed (nmu != 0).
// `lo` and `in` are not referenced by the exterior legs.
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
// Number of exterior legs that contributed at this site.
int nmu=0;
// Reused by HAND_SPECIALISE_GPARITY.
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
// Generate explicit specialisations of the six hand kernels (HandDhopSite,
// HandDhopSiteDag, and their Int / Ext variants) for the G-parity
// implementation type IMPL.
// Each specialisation runs the corresponding HAND_DOP_SITE* sequence twice,
// once per flavour (F = 0 and F = 1), using the G-parity loaders and the
// flavour-indexed link multiply. The extra int locals g, direction, distance,
// sl and inplace_twist are the scratch variables required by LOAD_CHI_SETUP.
// In the Ext variants `nmu` is reset to 0 between the two flavours so that the
// conditional write-back in HAND_RESULT_EXT is decided per flavour.
// Relies on the HAND_DOP_SITE* macros defined inside the generic kernels above.
#define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
int nmu=0; \
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
int nmu=0; \
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
}
// Emit the G-parity specialisations for the four G-parity implementation types.
// NOTE(review): the macro already ends in `}`, so the trailing `;` on each line
// is an empty declaration at namespace scope - harmless, but may warn under
// -Wpedantic.
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
////////////// Wilson ; uses this implementation /////////////////////
// Explicitly instantiate the six hand-kernel member functions of
// WilsonKernels<A>: HandDhopSite, HandDhopSiteDag, HandDhopSiteInt,
// HandDhopSiteDagInt, HandDhopSiteExt and HandDhopSiteDagExt.
#define INSTANTIATE_THEM(A) \
template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out);
// Explicit instantiation requests for the G-parity implementations.
// NOTE(review): these same members are explicitly specialised above via
// HAND_SPECIALISE_GPARITY; an explicit instantiation that follows an explicit
// specialisation has no effect, so these lines look redundant - confirm before
// removing.
INSTANTIATE_THEM(GparityWilsonImplF);
INSTANTIATE_THEM(GparityWilsonImplD);
INSTANTIATE_THEM(GparityWilsonImplFH);
INSTANTIATE_THEM(GparityWilsonImplDF);
}}

View File

@ -71,18 +71,14 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
RealD factor = 0.5 * beta / RealD(Nc);
//GaugeLinkField Umu(U._grid);
GaugeLinkField Umu(U._grid);
GaugeLinkField dSdU_mu(U._grid);
for (int mu = 0; mu < Nd; mu++) {
//Umu = PeekIndex<LorentzIndex>(U, mu);
Umu = PeekIndex<LorentzIndex>(U, mu);
// Staple in direction mu
//WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
//dSdU_mu = Ta(Umu * dSdU_mu) * factor;
WilsonLoops<Gimpl>::StapleMult(dSdU_mu, U, mu);
dSdU_mu = Ta(dSdU_mu) * factor;
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}

View File

@ -48,6 +48,22 @@ with this program; if not, write to the Free Software Foundation, Inc.,
} \
}
// Generate a member function template Load<NAME>Checkpointer(Params_, M_) for a
// checkpointer that takes an extra metadata object.
// On first use it constructs a NAME##CPModule<ImplementationPolicy, Metadata>
// from the parameters and metadata, stores it in CP, and sets
// have_CheckPointer. If a checkpointer is already loaded it logs an error and
// terminates the process with exit(1).
// Must be expanded inside a class that provides CP, have_CheckPointer and
// ImplementationPolicy.
#define RegisterLoadCheckPointerMetadataFunction(NAME) \
template < class Metadata > \
void Load##NAME##Checkpointer(const CheckpointerParameters& Params_, const Metadata& M_) { \
if (!have_CheckPointer) { \
std::cout << GridLogDebug << "Loading Metadata Checkpointer " << #NAME \
<< std::endl; \
CP = std::unique_ptr<CheckpointerBaseModule>( \
new NAME##CPModule<ImplementationPolicy, Metadata >(Params_, M_)); \
have_CheckPointer = true; \
} else { \
std::cout << GridLogError << "Checkpointer already loaded " \
<< std::endl; \
exit(1); \
} \
}
namespace Grid {
namespace QCD {
@ -77,7 +93,7 @@ class HMCResourceManager {
bool have_CheckPointer;
// NOTE: operator << is not overloaded for std::vector<string>
// so thsi function is necessary
// so this function is necessary
void output_vector_string(const std::vector<std::string> &vs){
for (auto &i: vs)
std::cout << i << " ";
@ -254,6 +270,7 @@ class HMCResourceManager {
RegisterLoadCheckPointerFunction(Nersc);
#ifdef HAVE_LIME
RegisterLoadCheckPointerFunction(ILDG);
RegisterLoadCheckPointerMetadataFunction(Scidac);
#endif
////////////////////////////////////////////////////////

View File

@ -76,6 +76,14 @@ class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
}
}
// Verify that `filename` can be opened for reading.
// Returns normally when the stream opens in a good state; otherwise logs an
// error through GridLogError and terminates the process with abort().
void check_filename(const std::string &filename){
  std::ifstream probe(filename.c_str());
  if (probe.good()) {
    return;
  }
  std::cout << GridLogError << "Filename " << filename << " not found. Aborting. " << std::endl;
  abort();
}
virtual void initialize(const CheckpointerParameters &Params) = 0;
virtual void CheckpointRestore(int traj, typename Impl::Field &U,

View File

@ -93,6 +93,9 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> {
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
this->check_filename(rng);
this->check_filename(config);
BinarySimpleMunger<sobj_double, sobj> munge;

View File

@ -136,6 +136,20 @@ class ILDGCPModule: public CheckPointerModule< ImplementationPolicy> {
};
// Checkpointer module that builds a ScidacHmcCheckpointer.
// In addition to the usual checkpointer parameters (held by the CPBase base
// class as Par_), it carries a Metadata object that is forwarded to the
// checkpointer when the resource is acquired.
template<class ImplementationPolicy, class Metadata>
class ScidacCPModule: public CheckPointerModule< ImplementationPolicy> {
  typedef CheckPointerModule< ImplementationPolicy> CPBase;
  Metadata M;

  // acquire resource: create the checkpointer from the stored parameters and metadata
  virtual void initialize(){
    this->CheckPointPtr.reset(new ScidacHmcCheckpointer<ImplementationPolicy, Metadata>(this->Par_, M));
  }
 public:
  // Base classes are always initialised before non-static members, so the
  // mem-initializer list names CPBase first to match the real order (the
  // previous M-then-CPBase ordering was misleading and triggers -Wreorder).
  ScidacCPModule(typename CPBase::APar Par, Metadata M_): CPBase(Par), M(M_) {}

  // Construct parameters and metadata from a serialisation reader.
  // NOTE(review): this initialises Parametrized<APar> directly rather than
  // CPBase; that is only valid if Parametrized<APar> is a direct or virtual
  // base of this class - confirm before instantiating this constructor.
  template <class ReaderClass>
  ScidacCPModule(Reader<ReaderClass>& Reader) : Parametrized<typename CPBase::APar>(Reader), M(Reader){}
};
#endif

View File

@ -34,6 +34,7 @@ directory
#include <Grid/qcd/hmc/checkpointers/NerscCheckpointer.h>
#include <Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h>
#include <Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h>
#include <Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h>
//#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h>

View File

@ -74,10 +74,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
GridBase *grid = U._grid;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
IldgWriter _IldgWriter;
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(config);
_IldgWriter.writeConfiguration(U, traj, config, config);
_IldgWriter.close();
@ -95,6 +95,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
this->check_filename(rng);
this->check_filename(config);
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);

View File

@ -69,6 +69,9 @@ class NerscHmcCheckpointer : public BaseHmcCheckpointer<Gimpl> {
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
this->check_filename(rng);
this->check_filename(config);
FieldMetaData header;
NerscIO::readRNGState(sRNG, pRNG, header, rng);

View File

@ -0,0 +1,125 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/ScidacCheckpointer.h
Copyright (C) 2018
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef SCIDAC_CHECKPOINTER
#define SCIDAC_CHECKPOINTER
#ifdef HAVE_LIME
#include <iostream>
#include <sstream>
#include <string>
namespace Grid {
namespace QCD {
// For generic fields
// HMC checkpointer for generic fields: the field goes through
// ScidacWriter/ScidacReader, the RNG state through BinaryIO.
template <class Implementation, class Metadata>
class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
 private:
  CheckpointerParameters Params;
  Metadata MData;

  typedef typename Implementation::Field Field;

 public:
  //INHERIT_GIMPL_TYPES(Implementation);

  // Default-constructed metadata.
  ScidacHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }

  // Metadata supplied by the caller; it is written with every configuration.
  ScidacHmcCheckpointer(const CheckpointerParameters &Params_, const Metadata& M_):MData(M_) { initialize(Params_); }

  // Store the parameters and terminate the program if the requested
  // floating point format is not one of the four recognised names.
  void initialize(const CheckpointerParameters &Params_) {
    Params = Params_;

    const bool format_ok = (Params.format == std::string("IEEE32BIG")) ||
                           (Params.format == std::string("IEEE32")) ||
                           (Params.format == std::string("IEEE64BIG")) ||
                           (Params.format == std::string("IEEE64"));
    if (format_ok) return;

    std::cout << GridLogError << "Unrecognized file format " << Params.format
              << std::endl;
    std::cout << GridLogError
              << "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64"
              << std::endl;
    exit(1);
  }

  // Every saveInterval trajectories: write the RNG state, then the field
  // together with the stored metadata.
  void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
                          GridParallelRNG &pRNG) {
    if ((traj % Params.saveInterval) != 0) return;

    std::string config_file, rng_file;
    this->build_filenames(traj, Params, config_file, rng_file);

    GridBase *field_grid = U._grid;

    // Checksums are filled in by the RNG write and reported in the log below.
    uint32_t nersc_csum, scidac_csuma, scidac_csumb;
    BinaryIO::writeRNG(sRNG, pRNG, rng_file, 0, nersc_csum, scidac_csuma, scidac_csumb);

    ScidacWriter writer(field_grid->IsBoss());
    writer.open(config_file);
    writer.writeScidacFieldRecord(U, MData);
    writer.close();

    std::cout << GridLogMessage << "Written Scidac Configuration on " << config_file
              << " checksum " << std::hex << nersc_csum << "/"
              << scidac_csuma << "/" << scidac_csumb
              << std::dec << std::endl;
  }

  // Reload the RNG state and the field for trajectory traj; both file names
  // are passed through check_filename first.
  void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
                         GridParallelRNG &pRNG) {
    std::string config_file, rng_file;
    this->build_filenames(traj, Params, config_file, rng_file);
    this->check_filename(rng_file);
    this->check_filename(config_file);

    // Checksums are filled in by the RNG read and reported in the log below.
    uint32_t nersc_csum, scidac_csuma, scidac_csumb;
    BinaryIO::readRNG(sRNG, pRNG, rng_file, 0, nersc_csum, scidac_csuma, scidac_csumb);

    // Metadata found in the file is read into a local and not used further here.
    Metadata md_content;
    ScidacReader reader;
    reader.open(config_file);
    reader.readScidacFieldRecord(U, md_content);  // format from the header
    reader.close();

    std::cout << GridLogMessage << "Read Scidac Configuration from " << config_file
              << " checksum " << std::hex
              << nersc_csum << "/"
              << scidac_csuma << "/"
              << scidac_csumb
              << std::dec << std::endl;
  }
};
}
}
#endif // HAVE_LIME
#endif // SCIDAC_CHECKPOINTER

View File

@ -114,18 +114,26 @@ class Integrator {
// input U actually not used in the fundamental case
// Fundamental updates, include smearing
for (int a = 0; a < as[level].actions.size(); ++a) {
for (int a = 0; a < as[level].actions.size(); ++a) {
double start_full = usecond();
Field force(U._grid);
conformable(U._grid, Mom._grid);
Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
double start_force = usecond();
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
force = FieldImplementation::projectForce(force); // Ta for gauge fields
double end_force = usecond();
Real force_abs = std::sqrt(norm2(force)/U._grid->gSites());
std::cout << GridLogIntegrator << "Force average: " << force_abs << std::endl;
std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] Force average: " << force_abs << std::endl;
Mom -= force * ep;
double end_full = usecond();
double time_full = (end_full - start_full) / 1e3;
double time_force = (end_force - start_force) / 1e3;
std::cout << GridLogIntegrator << "["<<level<<"]["<<a<<"] P update elapsed time: " << time_full << " ms (force: " << time_force << " ms)" << std::endl;
}
// Force from the other representations

View File

@ -23,6 +23,7 @@ class AdjointRep {
typedef typename SU_Adjoint<ncolour>::LatticeAdjMatrix LatticeMatrix;
typedef typename SU_Adjoint<ncolour>::LatticeAdjField LatticeField;
static const int Dimension = ncolour * ncolour - 1;
static const bool isFundamental = false;
LatticeField U;

View File

@ -19,6 +19,7 @@ template <int ncolour>
class FundamentalRep {
public:
static const int Dimension = ncolour;
static const bool isFundamental = true;
// typdef to be used by the Representations class in HMC to get the
// types for the higher representation fields

View File

@ -29,6 +29,7 @@ class TwoIndexRep {
typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexMatrix LatticeMatrix;
typedef typename SU_TwoIndex<ncolour, S>::LatticeTwoIndexField LatticeField;
static const int Dimension = ncolour * (ncolour + S) / 2;
static const bool isFundamental = false;
LatticeField U;

View File

@ -6,30 +6,33 @@
#ifndef GAUGE_CONFIG_
#define GAUGE_CONFIG_
namespace Grid {
namespace Grid
{
namespace QCD {
namespace QCD
{
//trivial class for no smearing
template< class Impl >
class NoSmearing {
//trivial class for no smearing
template <class Impl>
class NoSmearing
{
public:
INHERIT_FIELD_TYPES(Impl);
Field* ThinField;
Field *ThinField;
NoSmearing(): ThinField(NULL) {}
NoSmearing() : ThinField(NULL) {}
void set_Field(Field& U) { ThinField = &U; }
void set_Field(Field &U) { ThinField = &U; }
void smeared_force(Field&) const {}
void smeared_force(Field &) const {}
Field& get_SmearedU() { return *ThinField; }
Field &get_SmearedU() { return *ThinField; }
Field& get_U(bool smeared = false) {
Field &get_U(bool smeared = false)
{
return *ThinField;
}
};
/*!
@ -44,32 +47,36 @@ public:
It stores a list of smeared configurations.
*/
template <class Gimpl>
class SmearedConfiguration {
public:
class SmearedConfiguration
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
private:
private:
const unsigned int smearingLevels;
Smear_Stout<Gimpl> StoutSmearing;
std::vector<GaugeField> SmearedSet;
// Member functions
//====================================================================
void fill_smearedSet(GaugeField& U) {
ThinLinks = &U; // attach the smearing routine to the field U
void fill_smearedSet(GaugeField &U)
{
ThinLinks = &U; // attach the smearing routine to the field U
// check the pointer is not null
if (ThinLinks == NULL)
std::cout << GridLogError
<< "[SmearedConfiguration] Error in ThinLinks pointer\n";
if (smearingLevels > 0) {
if (smearingLevels > 0)
{
std::cout << GridLogDebug
<< "[SmearedConfiguration] Filling SmearedSet\n";
GaugeField previous_u(ThinLinks->_grid);
previous_u = *ThinLinks;
for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl) {
for (int smearLvl = 0; smearLvl < smearingLevels; ++smearLvl)
{
StoutSmearing.smear(SmearedSet[smearLvl], previous_u);
previous_u = SmearedSet[smearLvl];
@ -81,9 +88,10 @@ class SmearedConfiguration {
}
}
//====================================================================
GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
const GaugeField& GaugeK) const {
GridBase* grid = GaugeK._grid;
GaugeField AnalyticSmearedForce(const GaugeField &SigmaKPrime,
const GaugeField &GaugeK) const
{
GridBase *grid = GaugeK._grid;
GaugeField C(grid), SigmaK(grid), iLambda(grid);
GaugeLinkField iLambda_mu(grid);
GaugeLinkField iQ(grid), e_iQ(grid);
@ -94,7 +102,8 @@ class SmearedConfiguration {
SigmaK = zero;
iLambda = zero;
for (int mu = 0; mu < Nd; mu++) {
for (int mu = 0; mu < Nd; mu++)
{
Cmu = peekLorentz(C, mu);
GaugeKmu = peekLorentz(GaugeK, mu);
SigmaKPrime_mu = peekLorentz(SigmaKPrime, mu);
@ -104,20 +113,22 @@ class SmearedConfiguration {
pokeLorentz(iLambda, iLambda_mu, mu);
}
StoutSmearing.derivative(SigmaK, iLambda,
GaugeK); // derivative of SmearBase
GaugeK); // derivative of SmearBase
return SigmaK;
}
/*! @brief Returns smeared configuration at level 'Level' */
const GaugeField& get_smeared_conf(int Level) const {
const GaugeField &get_smeared_conf(int Level) const
{
return SmearedSet[Level];
}
//====================================================================
void set_iLambda(GaugeLinkField& iLambda, GaugeLinkField& e_iQ,
const GaugeLinkField& iQ, const GaugeLinkField& Sigmap,
const GaugeLinkField& GaugeK) const {
GridBase* grid = iQ._grid;
void set_iLambda(GaugeLinkField &iLambda, GaugeLinkField &e_iQ,
const GaugeLinkField &iQ, const GaugeLinkField &Sigmap,
const GaugeLinkField &GaugeK) const
{
GridBase *grid = iQ._grid;
GaugeLinkField iQ2(grid), iQ3(grid), B1(grid), B2(grid), USigmap(grid);
GaugeLinkField unity(grid);
unity = 1.0;
@ -206,15 +217,15 @@ class SmearedConfiguration {
}
//====================================================================
public:
GaugeField*
ThinLinks; /*!< @brief Pointer to the thin
links configuration */
public:
GaugeField *
ThinLinks; /* Pointer to the thin links configuration */
/*! @brief Standard constructor */
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
Smear_Stout<Gimpl>& Stout)
: smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL) {
/* Standard constructor */
/*!
 * @brief Standard constructor: allocates one gauge field per smearing level.
 * @param UGrid  grid on which every smeared field is created
 * @param Nsmear number of smearing levels (size of SmearedSet)
 * @param Stout  stout smearing object, copied into StoutSmearing
 */
SmearedConfiguration(GridCartesian *UGrid, unsigned int Nsmear,
                     Smear_Stout<Gimpl> &Stout)
    : smearingLevels(Nsmear), StoutSmearing(Stout), ThinLinks(NULL)
{
  SmearedSet.reserve(smearingLevels);
  // The vector stores fields by value, so push a temporary: heap-allocating
  // the field first would leave that allocation unowned and leaked.
  for (unsigned int i = 0; i < smearingLevels; ++i)
    SmearedSet.push_back(GaugeField(UGrid));
}
@ -223,21 +234,29 @@ class SmearedConfiguration {
SmearedConfiguration()
: smearingLevels(0), StoutSmearing(), SmearedSet(), ThinLinks(NULL) {}
// attach the smeared routines to the thin links U and fill the smeared set
void set_Field(GaugeField& U) { fill_smearedSet(U); }
// Attach to the thin links U, rebuild the smeared set and log how long the
// rebuild took (usecond() difference reported in milliseconds).
void set_Field(GaugeField &U)
{
  const double t_begin = usecond();
  fill_smearedSet(U);
  const double t_finish = usecond();

  const double elapsed_ms = (t_finish - t_begin) / 1e3;
  std::cout << GridLogMessage << "Smearing in " << elapsed_ms << " ms" << std::endl;
}
//====================================================================
void smeared_force(GaugeField& SigmaTilde) const {
if (smearingLevels > 0) {
void smeared_force(GaugeField &SigmaTilde) const
{
if (smearingLevels > 0)
{
double start = usecond();
GaugeField force = SigmaTilde; // actually = U*SigmaTilde
GaugeLinkField tmp_mu(SigmaTilde._grid);
for (int mu = 0; mu < Nd; mu++) {
for (int mu = 0; mu < Nd; mu++)
{
// to get just SigmaTilde
tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) *
peekLorentz(force, mu);
tmp_mu = adj(peekLorentz(SmearedSet[smearingLevels - 1], mu)) * peekLorentz(force, mu);
pokeLorentz(force, tmp_mu, mu);
}
@ -246,33 +265,43 @@ class SmearedConfiguration {
force = AnalyticSmearedForce(force, *ThinLinks);
for (int mu = 0; mu < Nd; mu++) {
for (int mu = 0; mu < Nd; mu++)
{
tmp_mu = peekLorentz(*ThinLinks, mu) * peekLorentz(force, mu);
pokeLorentz(SigmaTilde, tmp_mu, mu);
}
} // if smearingLevels = 0 do nothing
double end = usecond();
double time = (end - start)/ 1e3;
std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;
} // if smearingLevels = 0 do nothing
}
//====================================================================
GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
GaugeField &get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
GaugeField& get_U(bool smeared = false) {
GaugeField &get_U(bool smeared = false)
{
// get the config, thin links by default
if (smeared) {
if (smearingLevels) {
if (smeared)
{
if (smearingLevels)
{
RealD impl_plaq =
WilsonLoops<Gimpl>::avgPlaquette(SmearedSet[smearingLevels - 1]);
std::cout << GridLogDebug << "getting Usmr Plaq: " << impl_plaq
<< std::endl;
return get_SmearedU();
} else {
}
else
{
RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
<< std::endl;
return *ThinLinks;
}
} else {
}
else
{
RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(*ThinLinks);
std::cout << GridLogDebug << "getting Thin Plaq: " << impl_plaq
<< std::endl;

View File

@ -173,8 +173,8 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
std::cout << "Time to evolve " << diff.count() << " s\n";
#endif
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " "
<< energyDensityPlaquette(step,out) << std::endl;
<< step << " " << tau(step) << " "
<< energyDensityPlaquette(step,out) << std::endl;
if( step % measure_interval == 0){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : "
<< step << " "
@ -193,8 +193,8 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
evolve_step_adaptive(out, maxTau);
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " "
<< energyDensityPlaquette(out) << std::endl;
<< step << " " << taus << " "
<< energyDensityPlaquette(out) << std::endl;
if( step % measure_interval == 0){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : "
<< step << " "

View File

@ -212,6 +212,7 @@ public:
// For the force term
/*
static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
GridBase *grid = Umu._grid;
std::vector<GaugeMat> U(Nd, grid);
@ -225,7 +226,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
for (int nu = 0; nu < Nd; nu++) {
if (nu != mu) {
// this is ~10% faster than the Staple
// this is ~10% faster than the Staple -- PAB: so what it gives the WRONG answers for other BC's!
tmp1 = Cshift(U[nu], mu, 1);
tmp2 = Cshift(U[mu], nu, 1);
staple += tmp1* adj(U[nu]*tmp2);
@ -235,7 +236,7 @@ static void StapleMult(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
}
staple = U[mu]*staple;
}
*/
//////////////////////////////////////////////////
// the sum over all staples on each site
//////////////////////////////////////////////////

View File

@ -31,113 +31,10 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk>
#define GRID_SERIALISATION_ABSTRACT_READER_H
#include <type_traits>
#include <Grid/tensors/Tensors.h>
#include <Grid/serialisation/VectorUtils.h>
namespace Grid {
// Vector IO utilities ///////////////////////////////////////////////////////
// helper function to read space-separated values
// Parse whitespace-separated values of type T from s.
// Returns the values in order of appearance; parsing stops at the first
// token that cannot be read as a T. Empty or all-whitespace input yields an
// empty vector.
template <typename T>
std::vector<T> strToVec(const std::string s)
{
  std::istringstream sstr(s);
  T buf;
  std::vector<T> v;
  // Test the extraction itself rather than eof(): that way a failed read
  // (trailing whitespace, empty input, malformed token) never appends a
  // stale value and cannot loop forever.
  while (sstr >> buf)
  {
    v.push_back(buf);
  }
  return v;
}
// output to streams for vectors
// Stream a vector as "[e0 e1 ... ]": every element is followed by a space,
// and for a non-empty vector a backspace character is emitted before the
// closing bracket.
template <class T>
inline std::ostream &operator<<(std::ostream &os, const std::vector<T> &v)
{
  os << "[";
  typedef typename std::vector<T>::const_iterator Iter;
  for (Iter it = v.begin(); it != v.end(); ++it)
  {
    os << *it << " ";
  }
  if (!v.empty())
  {
    os << "\b";
  }
  return os << "]";
}
// Vector element trait //////////////////////////////////////////////////////
// Trait giving the innermost element type of (possibly nested) std::vector.
// Primary template: T is not a vector, so it is its own element type and is
// never flagged as a number container.
template <typename T>
struct element
{
  using type = T;
  static constexpr bool is_number = false;
};

// std::vector<T>: recurse into T for the element type. is_number is true when
// T is arithmetic, is complex, or is itself a vector with is_number set.
template <typename T>
struct element<std::vector<T>>
{
  using type = typename element<T>::type;
  static constexpr bool is_number =
      std::is_arithmetic<T>::value || is_complex<T>::value || element<T>::is_number;
};
// Vector flattening utility class ////////////////////////////////////////////
// Class to flatten a multidimensional std::vector
// Flattens a nested std::vector V into a single vector of its innermost
// elements, and records one size per nesting level.
template <typename V>
class Flatten
{
public:
// Innermost element type of V, as given by the element<> trait.
typedef typename element<V>::type Element;
public:
// Flattens `vector` immediately; only a reference to it is stored, so the
// argument must outlive this object.
explicit Flatten(const V &vector);
// The original nested vector passed to the constructor.
const V & getVector(void);
// All innermost elements in traversal order.
const std::vector<Element> & getFlatVector(void);
// One size per nesting level, taken along the first element at each level.
const std::vector<size_t> & getDim(void);
private:
// Leaf case: append a single element to the flat vector.
void accumulate(const Element &e);
// Container case: recurse into every element of v.
template <typename W>
void accumulate(const W &v);
// Leaf case: nothing to record.
void accumulateDim(const Element &e);
// Container case: record v.size() and descend into v[0].
template <typename W>
void accumulateDim(const W &v);
private:
const V &vector_;
std::vector<Element> flatVector_;
std::vector<size_t> dim_;
};
// Class to reconstruct a multidimensional std::vector
// Rebuilds a nested std::vector V from a flat element vector plus a list of
// per-level sizes (the inverse of Flatten).
template <typename V>
class Reconstruct
{
public:
// Innermost element type of V, as given by the element<> trait.
typedef typename element<V>::type Element;
public:
// Builds the nested vector immediately. `flatVector` is kept by reference
// (must outlive this object); `dim` is copied.
Reconstruct(const std::vector<Element> &flatVector,
const std::vector<size_t> &dim);
// The reconstructed nested vector.
const V & getVector(void);
// The flat vector passed to the constructor.
const std::vector<Element> & getFlatVector(void);
// The per-level sizes passed to the constructor.
const std::vector<size_t> & getDim(void);
private:
// Leaf case: copy consecutive flat elements into v, advancing ind_.
void fill(std::vector<Element> &v);
// Container case: recurse into every element of v.
template <typename W>
void fill(W &v);
// Leaf case: size v to dim_[dim].
void resize(std::vector<Element> &v, const unsigned int dim);
// Container case: size v to dim_[dim], then resize each element at dim + 1.
template <typename W>
void resize(W &v, const unsigned int dim);
private:
V vector_;
const std::vector<Element> &flatVector_;
std::vector<size_t> dim_;
// Read cursor into flatVector_ used by fill().
size_t ind_{0};
unsigned int dimInd_{0};
};
// Pair IO utilities /////////////////////////////////////////////////////////
// helper function to parse input in the format "<obj1 obj2>"
template <typename T1, typename T2>
@ -151,15 +48,15 @@ namespace Grid {
do
{
is.get(c);
} while (c != '<' && !is.eof());
if (c == '<')
} while (c != '(' && !is.eof());
if (c == '(')
{
int start = is.tellg();
do
{
is.get(c);
} while (c != '>' && !is.eof());
if (c == '>')
} while (c != ')' && !is.eof());
if (c == ')')
{
int end = is.tellg();
int psize = end - start - 1;
@ -182,7 +79,7 @@ namespace Grid {
template <class T1, class T2>
inline std::ostream & operator<<(std::ostream &os, const std::pair<T1, T2> &p)
{
os << "<" << p.first << " " << p.second << ">";
os << "(" << p.first << " " << p.second << ")";
return os;
}
@ -205,6 +102,12 @@ namespace Grid {
template <typename U>
typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
write(const std::string& s, const U &output);
template <typename U>
void write(const std::string &s, const iScalar<U> &output);
template <typename U, int N>
void write(const std::string &s, const iVector<U, N> &output);
template <typename U, int N>
void write(const std::string &s, const iMatrix<U, N> &output);
private:
T *upcast;
};
@ -224,6 +127,12 @@ namespace Grid {
template <typename U>
typename std::enable_if<!std::is_base_of<Serializable, U>::value, void>::type
read(const std::string& s, U &output);
template <typename U>
void read(const std::string &s, iScalar<U> &output);
template <typename U, int N>
void read(const std::string &s, iVector<U, N> &output);
template <typename U, int N>
void read(const std::string &s, iMatrix<U, N> &output);
protected:
template <typename U>
void fromString(U &output, const std::string &s);
@ -237,203 +146,9 @@ namespace Grid {
};
template<typename T> struct isWriter {
static const bool value = false;
};
// Generic writer interface
// serializable base class
// Base class marking a type as serializable. The static write/read hooks and
// the stream operator defined here are all no-ops.
class Serializable
{
public:
// No-op write hook: nothing is written for a bare Serializable.
template <typename T>
static inline void write(Writer<T> &WR,const std::string &s,
const Serializable &obj)
{}
// No-op read hook: obj is left untouched.
template <typename T>
static inline void read(Reader<T> &RD,const std::string &s,
Serializable &obj)
{}
// Streams nothing and returns os unchanged.
friend inline std::ostream & operator<<(std::ostream &os,
const Serializable &obj)
{
return os;
}
};
// Flatten class template implementation /////////////////////////////////////
// Leaf case: append one innermost element to the flat vector.
template <typename V>
void Flatten<V>::accumulate(const Element &e)
{
  flatVector_.push_back(e);
}

// Container case: visit every element of v in order.
template <typename V>
template <typename W>
void Flatten<V>::accumulate(const W &v)
{
  for (auto &e: v)
  {
    accumulate(e);
  }
}

// Leaf case: an innermost element contributes no dimension.
template <typename V>
void Flatten<V>::accumulateDim(const Element &e) {}

// Container case: record this level's size, then descend along the first
// element to pick up the sizes of the deeper levels.
template <typename V>
template <typename W>
void Flatten<V>::accumulateDim(const W &v)
{
  dim_.push_back(v.size());
  // An empty level has no first element to descend into; indexing v[0]
  // there would be undefined behaviour, so the recursion stops here.
  if (!v.empty())
  {
    accumulateDim(v[0]);
  }
}

// Flatten `vector` and record its per-level sizes. Only a reference to
// `vector` is kept.
template <typename V>
Flatten<V>::Flatten(const V &vector)
: vector_(vector)
{
  accumulate(vector_);
  accumulateDim(vector_);
}

// The nested vector given at construction.
template <typename V>
const V & Flatten<V>::getVector(void)
{
  return vector_;
}

// All innermost elements in traversal order.
template <typename V>
const std::vector<typename Flatten<V>::Element> &
Flatten<V>::getFlatVector(void)
{
  return flatVector_;
}

// One size per nesting level reached along the first-element path.
template <typename V>
const std::vector<size_t> & Flatten<V>::getDim(void)
{
  return dim_;
}
// Reconstruct class template implementation /////////////////////////////////
// Leaf case: copy the next v.size() flat elements into v, advancing the
// read cursor ind_ by one per element.
template <typename V>
void Reconstruct<V>::fill(std::vector<Element> &v)
{
  for (size_t i = 0; i < v.size(); ++i)
  {
    v[i] = flatVector_[ind_];
    ++ind_;
  }
}

// Container case: fill each sub-container in order.
template <typename V>
template <typename W>
void Reconstruct<V>::fill(W &v)
{
  for (size_t i = 0; i < v.size(); ++i)
  {
    fill(v[i]);
  }
}

// Leaf case: give the innermost vector the size recorded for level `dim`.
template <typename V>
void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
{
  v.resize(dim_[dim]);
}

// Container case: size this level, then size every child at the next level.
template <typename V>
template <typename W>
void Reconstruct<V>::resize(W &v, const unsigned int dim)
{
  v.resize(dim_[dim]);
  const unsigned int next_level = dim + 1;
  for (size_t i = 0; i < v.size(); ++i)
  {
    resize(v[i], next_level);
  }
}

// Shape the nested vector from `dim`, then populate it from `flatVector`.
// `flatVector` is held by reference; `dim` is copied.
template <typename V>
Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
                            const std::vector<size_t> &dim)
: flatVector_(flatVector)
, dim_(dim)
{
  resize(vector_, 0);
  fill(vector_);
}

// The reconstructed nested vector.
template <typename V>
const V & Reconstruct<V>::getVector(void)
{
  return vector_;
}

// The flat input vector given at construction.
template <typename V>
const std::vector<typename Reconstruct<V>::Element> &
Reconstruct<V>::getFlatVector(void)
{
  return flatVector_;
}

// The per-level sizes given at construction.
template <typename V>
const std::vector<size_t> & Reconstruct<V>::getDim(void)
{
  return dim_;
}
// Generic writer interface //////////////////////////////////////////////////
// Free-function form of Writer<T>::push for a std::string name.
template <typename T>
inline void push(Writer<T> &w, const std::string &s) {
w.push(s);
}
// Same as above for a C string; the name is converted to std::string first.
template <typename T>
inline void push(Writer<T> &w, const char *s)
{
w.push(std::string(s));
}
// Free-function form of Writer<T>::pop.
template <typename T>
inline void pop(Writer<T> &w)
{
w.pop();
}
// Free-function form of Writer<T>::write: writes `output` under the name s.
template <typename T, typename U>
inline void write(Writer<T> &w, const std::string& s, const U &output)
{
w.write(s, output);
}
// Generic reader interface
// Free-function form of Reader<T>::push for a std::string name; returns
// whatever r.push returns.
template <typename T>
inline bool push(Reader<T> &r, const std::string &s)
{
return r.push(s);
}
// Same as above for a C string; the name is converted to std::string first.
template <typename T>
inline bool push(Reader<T> &r, const char *s)
{
return r.push(std::string(s));
}
// Free-function form of Reader<T>::pop.
template <typename T>
inline void pop(Reader<T> &r)
{
r.pop();
}
// Free-function form of Reader<T>::read: reads the entry named s into `output`.
template <typename T, typename U>
inline void read(Reader<T> &r, const std::string &s, U &output)
{
r.read(s, output);
}
// Writer template implementation ////////////////////////////////////////////
// Writer template implementation
template <typename T>
Writer<T>::Writer(void)
{
@ -467,6 +182,27 @@ namespace Grid {
{
upcast->writeDefault(s, output);
}
// Tensor overloads of Writer<T>::write. Each converts the tensor with
// tensorToVec and hands the result to the concrete writer's writeDefault.

// iScalar<U> overload.
template <typename T>
template <typename U>
void Writer<T>::write(const std::string &s, const iScalar<U> &output)
{
upcast->writeDefault(s, tensorToVec(output));
}
// iVector<U, N> overload.
template <typename T>
template <typename U, int N>
void Writer<T>::write(const std::string &s, const iVector<U, N> &output)
{
upcast->writeDefault(s, tensorToVec(output));
}
// iMatrix<U, N> overload.
template <typename T>
template <typename U, int N>
void Writer<T>::write(const std::string &s, const iMatrix<U, N> &output)
{
upcast->writeDefault(s, tensorToVec(output));
}
// Reader template implementation
template <typename T>
@ -502,7 +238,37 @@ namespace Grid {
{
upcast->readDefault(s, output);
}
// Tensor overloads of Reader<T>::read. Each reads into a temporary of type
// TensorToVec<...>::type through the concrete reader's readDefault, then
// converts that temporary into `output` with vecToTensor.

// iScalar<U> overload.
template <typename T>
template <typename U>
void Reader<T>::read(const std::string &s, iScalar<U> &output)
{
typename TensorToVec<iScalar<U>>::type v;
upcast->readDefault(s, v);
vecToTensor(output, v);
}
// iVector<U, N> overload.
template <typename T>
template <typename U, int N>
void Reader<T>::read(const std::string &s, iVector<U, N> &output)
{
typename TensorToVec<iVector<U, N>>::type v;
upcast->readDefault(s, v);
vecToTensor(output, v);
}
// iMatrix<U, N> overload.
template <typename T>
template <typename U, int N>
void Reader<T>::read(const std::string &s, iMatrix<U, N> &output)
{
typename TensorToVec<iMatrix<U, N>>::type v;
upcast->readDefault(s, v);
vecToTensor(output, v);
}
template <typename T>
template <typename U>
void Reader<T>::fromString(U &output, const std::string &s)
@ -521,6 +287,76 @@ namespace Grid {
abort();
}
}
// serializable base class ///////////////////////////////////////////////////
// Base class marking a type as serializable. The static write/read hooks and
// the stream operator defined here are all no-ops.
class Serializable
{
public:
// No-op write hook: nothing is written for a bare Serializable.
template <typename T>
static inline void write(Writer<T> &WR,const std::string &s,
const Serializable &obj)
{}
// No-op read hook: obj is left untouched.
template <typename T>
static inline void read(Reader<T> &RD,const std::string &s,
Serializable &obj)
{}
// Streams nothing and returns os unchanged.
friend inline std::ostream & operator<<(std::ostream &os,
const Serializable &obj)
{
return os;
}
};
// Generic writer interface //////////////////////////////////////////////////
// Free-function form of Writer<T>::push for a std::string name.
template <typename T>
inline void push(Writer<T> &w, const std::string &s) {
w.push(s);
}
// Same as above for a C string; the name is converted to std::string first.
template <typename T>
inline void push(Writer<T> &w, const char *s)
{
w.push(std::string(s));
}
// Free-function form of Writer<T>::pop.
template <typename T>
inline void pop(Writer<T> &w)
{
w.pop();
}
// Free-function form of Writer<T>::write: writes `output` under the name s.
template <typename T, typename U>
inline void write(Writer<T> &w, const std::string& s, const U &output)
{
w.write(s, output);
}
// Generic reader interface //////////////////////////////////////////////////
// Free-function form of Reader<T>::push for a std::string name; returns
// whatever r.push returns.
template <typename T>
inline bool push(Reader<T> &r, const std::string &s)
{
return r.push(s);
}
// Same as above for a C string; the name is converted to std::string first.
template <typename T>
inline bool push(Reader<T> &r, const char *s)
{
return r.push(std::string(s));
}
// Free-function form of Reader<T>::pop.
template <typename T>
inline void pop(Reader<T> &r)
{
r.pop();
}
// Free-function form of Reader<T>::read: reads the entry named s into `output`.
template <typename T, typename U>
inline void read(Reader<T> &r, const std::string &s, U &output)
{
r.read(s, output);
}
}
#endif

View File

@ -55,6 +55,11 @@ void Hdf5Writer::writeDefault(const std::string &s, const char *x)
writeDefault(s, sx);
}
// Returns a mutable reference to the writer's group_ member.
Group & Hdf5Writer::getGroup(void)
{
return group_;
}
// Reader implementation ///////////////////////////////////////////////////////
Hdf5Reader::Hdf5Reader(const std::string &fileName)
: fileName_(fileName)
@ -103,3 +108,8 @@ void Hdf5Reader::readDefault(const std::string &s, std::string &x)
x.resize(strType.getSize());
attribute.read(strType, &(x[0]));
}
// Returns a mutable reference to the reader's group_ member.
Group & Hdf5Reader::getGroup(void)
{
return group_;
}

View File

@ -5,6 +5,7 @@
#include <string>
#include <vector>
#include <H5Cpp.h>
#include <Grid/tensors/Tensors.h>
#include "Hdf5Type.h"
#ifndef H5_NO_NAMESPACE
@ -37,6 +38,7 @@ namespace Grid
template <typename U>
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
writeDefault(const std::string &s, const std::vector<U> &x);
H5NS::Group & getGroup(void);
private:
template <typename U>
void writeSingleAttribute(const U &x, const std::string &name,
@ -64,6 +66,7 @@ namespace Grid
template <typename U>
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
readDefault(const std::string &s, std::vector<U> &x);
H5NS::Group & getGroup(void);
private:
template <typename U>
void readSingleAttribute(U &x, const std::string &name,

Some files were not shown because too many files have changed in this diff Show More