1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-16 14:57:05 +01:00

Compare commits

..

273 Commits

Author SHA1 Message Date
b820076b91 Merge branch 'develop' into feature/mpi3 2016-10-25 06:02:33 +01:00
09f66100d3 MPI 3 compile on non-linux 2016-10-25 06:01:12 +01:00
d7d92af09d Travis fail fix attempt 2016-10-25 01:45:53 +01:00
460d0753a1 Merge branch 'develop' into feature/mpi3
Conflicts:
	lib/simd/Grid_avx512.h
2016-10-25 01:08:51 +01:00
8f8058f8a5 More random bits on parallel seeding 2016-10-25 01:05:52 +01:00
d97a27f483 Verbose 2016-10-25 01:05:31 +01:00
7c3363b91e Compiles all comms targets 2016-10-25 00:04:17 +01:00
b94478fa51 mpi, mpi3, shmem all compile.
mpi, mpi3 pass single node multi-rank
2016-10-24 23:45:31 +01:00
13bf0482e3 FFT optimisation 2016-10-24 19:25:40 +01:00
a795b5705e memory optimisation 2016-10-24 19:25:15 +01:00
392e064513 fast local peek-poke 2016-10-24 19:24:21 +01:00
b6a65059a2 Update to use shared memory to contain the stencil comms buffers
Tested on 2.1.1.1 1.2.1.1 4.1.1.1 1.4.1.1 2.2.1.1 subnode decompositions
2016-10-24 17:30:43 +01:00
ea25a4d9ac Works 2016-10-23 06:10:05 +01:00
c190221fd3 Internal SHM comms in non-simd directions working
Need to fix simd directions
2016-10-22 18:14:27 +01:00
0fcd2e7188 Simplify the comms structure prior to implementing Shared memory direct bounces 2016-10-21 22:44:10 +01:00
910b8dd6a1 use simd type 2016-10-21 22:35:29 +01:00
75ebd3a0d1 Typo fixes and rotate for CLANG 2016-10-21 22:34:29 +01:00
09fd5c43a7 Reasonably fast version 2016-10-21 15:17:39 +01:00
f22317748f Merge branch 'feature/mpi3' of https://github.com/paboyle/Grid into feature/mpi3 2016-10-21 13:36:35 +01:00
6a9eae6b6b Reporting improvements 2016-10-21 13:36:18 +01:00
fad96cf250 StencilBufs 2016-10-21 13:36:00 +01:00
f331809c27 Use variable type for loop 2016-10-21 13:35:37 +01:00
2c54a53d0a Compile verbose reduce 2016-10-21 12:12:14 +01:00
306160ad9a bcopy threaded 2016-10-21 12:07:28 +01:00
20a091c3ed Intel vs. Clang intrinsics differences absorbed 2016-10-21 09:08:36 +01:00
202078eb1b Cray / OpenSHMEM ordering differs 2016-10-21 09:07:20 +01:00
a762b1fb71 MPI3 working with a bounce through shared memory on my laptop.
Longer term plan: make the "u_comm_buf" in Stencil point to the shared region and avoid the
send between ranks on same node.
2016-10-21 09:03:26 +01:00
5b5925b8e5 Forgot to add 2016-10-20 17:09:40 +01:00
b58adc6a4b commVector 2016-10-20 17:00:15 +01:00
f9d5e95d72 allocator template typedefs moved to AlignedAllocator 2016-10-20 16:59:39 +01:00
4f8e636a43 commVector 2016-10-20 16:59:16 +01:00
9b39f35ae6 commVector different for SHMEM compat 2016-10-20 16:58:53 +01:00
5fe2b85cbd MPI3 and shared memory support 2016-10-20 16:58:01 +01:00
c7cccaaa69 Comm vector for shmem 2016-10-20 16:57:31 +01:00
cbcfea466f MPI3 2016-10-20 16:57:14 +01:00
4955672fc3 MPI3 2016-10-20 16:57:00 +01:00
39f1c880b8 mpi3 2016-10-20 16:56:40 +01:00
8c043da5b7 SHMEM and comms allocator made different 2016-10-20 16:56:05 +01:00
3cbe974eb4 Layout 2016-10-20 16:55:21 +01:00
7af9b87318 Cache face tables to improve performance.
Extract merge now looking poor.
2016-10-18 09:51:37 +01:00
811ca45473 GNU clang hack for AVX512 since there are missing reduce intrinsics in Clang 3.9 and GCC-6 AVX512 support 2016-10-17 16:23:21 +01:00
bc1a4d40ba Faster integer handling avoid push_back 2016-10-17 16:16:44 +01:00
c8079e6621 Time the face gather in x-dir more carefully 2016-10-13 22:28:50 +01:00
8b0d171c9a 32bit issue on the KNL code variant where byte offsets were stored 2016-10-12 17:49:32 +01:00
1f293b76b4 Merge branch 'feature/knl-stats' into develop 2016-10-12 13:47:58 +01:00
8bbd9ebc27 Reversing changes to Stencil class 2016-10-12 13:47:20 +01:00
6472b431f0 __rdpmc needed for gcc, clang++ 2016-10-12 12:29:08 +01:00
bd205a3293 Fixing for non x86 and non KNL 2016-10-12 12:09:15 +01:00
496beffa88 Fix non-KNL build 2016-10-12 12:06:08 +01:00
9b63e97108 align not absolutely required and confuses clang++ 2016-10-12 11:51:21 +01:00
81f2aeaece KNL streaming stores, and KNL performance counters 2016-10-12 11:45:22 +01:00
2d4a45c758 Typecast pointer 2016-10-12 09:14:15 +01:00
0f182f033b Drop macos with gcc 2016-10-11 22:29:06 +01:00
7240d73184 Parallelise the x faces; fix the segv on KNL with comms 2016-10-11 22:21:07 +01:00
42cd148f5e Base pointer for comms buffer under AVX512 assembly 2016-10-11 16:06:06 +01:00
611b5d74ba Fix for AVX+FMA3 compilation 2016-10-10 15:26:17 +01:00
b56c9ffa52 Fix for AVXFMA 2016-10-10 14:43:37 +01:00
70c32fa49b Merge branch 'develop' of github.com:paboyle/Grid into develop 2016-10-09 12:55:46 +01:00
77c8a94dae AVXFMA4 flag fix for Intel Compiler 2016-10-09 12:55:12 +01:00
2e453dfbf5 Added some instrumentation to benchmark the force computation 2016-10-06 17:52:45 +01:00
4089984431 Timing hooks 2016-10-06 09:25:12 +01:00
98439847cf configure portability fix 2016-10-05 14:57:20 +01:00
c78bbd0f8c Fix ASM compilation 2016-10-04 15:37:32 +01:00
7ea4b959a4 hopefully more portable configure output 2016-09-27 11:54:37 +01:00
536e2ff073 *.inc removed: please don't commit these files either! 2016-09-27 11:54:03 +01:00
798ff34d7e configure removed: please don't commit configure! 2016-09-27 11:29:31 +01:00
04a437c92c Minor modification to the filelist script 2016-09-23 11:12:45 +01:00
5c190a1b8c Merge branch 'develop' into feature/hirep 2016-09-23 11:06:06 +01:00
15d8f5c88c Small change to the configure.ac to include the canonical names 2016-09-23 11:05:36 +01:00
c4ac6e7e8f Consolidating HMC interface
Uniformed interface for standard action in fundamental rep and Hirep
2016-09-23 10:47:42 +01:00
510e340e16 Debugged last commit for the Two index representation 2016-09-22 22:16:21 +01:00
6ffadca153 Restored number of colours to 3 2016-09-22 14:22:54 +01:00
b6597b74e7 Added support for the Two index Symmetric and Antisymmetric representations
Tested for HMC convergence: OK
Added also a test file showing an example for mixed representations
2016-09-22 14:17:37 +01:00
d2573189d8 build system: FFTW fix 2016-09-20 12:30:24 +01:00
65ca174dbb gitignore update 2016-09-20 11:25:06 +01:00
0724f7af75 QPX single precision implementation 2016-09-19 18:09:12 +01:00
2e74520821 removed libtool use (BG/Q compatibility) 2016-09-16 15:25:49 +01:00
6dd75ad9e5 Merge branch 'develop' of github.com:paboyle/Grid into feature/bgq 2016-09-16 15:07:54 +01:00
fda408ee6f Added first lines for supporting Two Index representations 2016-09-13 10:43:30 +01:00
b9c80318a2 Merge branch 'develop' into feature/hirep 2016-09-13 10:01:51 +01:00
5df5d52d41 Fix for the Intel compiler 2016-09-12 17:17:20 +01:00
f76f281e58 Cleaning files after fix 2016-09-09 11:34:25 +01:00
aa20cc8b52 Fixing compilation error with AVX512 flag 2016-09-09 02:58:52 -07:00
0fd179fb33 Merge branch 'develop' into feature/hirep 2016-09-01 12:59:53 +01:00
f45ef8d114 Minor modification in ActionBase.h 2016-09-01 11:46:46 +01:00
fd5614738d Merge branch 'develop' into feature/hirep 2016-08-30 18:21:36 +01:00
005dcc51aa Reset travis 2016-08-30 14:44:10 +01:00
655c893f86 Another test on travis 2016-08-30 14:38:42 +01:00
843f5783b4 Again travis test separating single and double 2016-08-30 14:29:09 +01:00
8986c9fedd Single and double precision travis matrix 2016-08-30 14:25:24 +01:00
c80a1d427c Retest original version of travis yaml 2016-08-30 14:05:05 +01:00
ae57032500 Separate single and double builds in travis 2016-08-30 14:00:34 +01:00
f75468728f Another error on travis 2016-08-30 13:56:23 +01:00
5acd856663 Correction of error in travis 2016-08-30 13:49:49 +01:00
b0d3e4bb2c Separating travis builds 2016-08-30 13:44:07 +01:00
b512ccbee6 HMC for Adjoint fermions works
Accepts and reproduces known results

Check initial instability of inverters
when starting from hot configurations
2016-08-30 11:31:25 +01:00
8c89391c02 FFTW unresolved fixed when no fftw3.h 2016-08-24 16:41:47 +01:00
bfac5195b8 tidy up 2016-08-24 16:38:36 +01:00
a782ca3238 Merge branch 'feature/fft-flop-count' into develop 2016-08-24 15:06:17 +01:00
744691097f Printing 2016-08-24 15:05:56 +01:00
ff6da364e8 FFT double and single precision gives good performance now in multithreaded code. 2016-08-24 15:05:00 +01:00
4d11a6f5f2 first commit for QPX intrinsics 2016-08-23 14:41:44 +01:00
88be3b39bb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2016-08-22 18:29:36 +01:00
8a02824e08 Merge branch 'feature/FFT' into develop 2016-08-22 16:25:04 +01:00
356e7940fd fftw can be switched off 2016-08-22 16:24:49 +01:00
73ce476890 Include fftw headers 2016-08-22 16:24:21 +01:00
29c4ef41de Adding a test for libfftw3 2016-08-22 16:21:01 +01:00
e423a09974 FFT improved and test_FFT passing under MPI 8 processes, 8^4 for LatticeComplexD and LatticeSpinMatrixD 2016-08-18 02:23:21 +01:00
17097a93ec FFTW test ran over 4 mpi processes. 2016-08-17 01:33:55 +01:00
94a6373a7f Merge branch 'feature/eigen-cleanup' into develop 2016-08-15 23:58:34 +01:00
4ab7dbfd57 Instantiate 2016-08-15 23:00:40 +01:00
90e70790f3 Feature for z-Mobius prep 2016-08-15 22:31:29 +01:00
9c2e8d5e28 Nc=3 just to let all the test pass in Travis 2016-08-09 15:46:57 +01:00
147e2025b9 Added unit tests on the representation transformations
Status: Passing all tests
2016-08-08 16:54:22 +01:00
573b8c6020 build system: -O3 is not overridden by env CXXFLAGS 2016-08-06 01:26:24 +01:00
15218ec57f more Travis MPI fix 2016-08-06 00:49:14 +01:00
ec68e08dd2 Travis MPI fix 2016-08-06 00:36:05 +01:00
fc25d2295c fftw download 2016-08-06 00:28:52 +01:00
8dc2cfcedb Adding fftw header pulling 2016-08-06 00:28:28 +01:00
836f93780c first try at including MPI tests in Travis 2016-08-05 13:41:52 +01:00
5a68715be3 Richards sweep test 2016-08-05 10:51:57 +01:00
32bc7a6ab8 MPI back out of change that hangs
AVX2 for clang, gcc needs the -mfma flag.
2016-08-05 10:36:00 +01:00
b65e72e521 Merge pull request #43 from rprollins/bench/output-format
Benchmark_dwf_sweep and Benchmark_zmm output formats
2016-08-04 16:47:01 +01:00
d1aaff65e8 README update 2016-08-04 16:27:02 +01:00
93d29bb699 build system improvements after discussion with Peter 2016-08-04 16:19:59 +01:00
3b376ed54e build system: error if MPI not found 2016-08-03 15:23:38 +01:00
d5c1f614ba gitignore update 2016-08-03 15:14:33 +01:00
2edc24225d untracking ltmain.sh 2016-08-03 15:12:44 +01:00
629283726b build system: local Grid link flag moved to configure.ac 2016-08-03 15:07:42 +01:00
6adb66dd08 build system: finer management of GMP/MPFR dependence 2016-08-03 15:06:45 +01:00
5be92bb708 link fix in README 2016-08-03 12:40:56 +01:00
f4c049ea6d README update 2016-08-03 12:38:54 +01:00
bc092ad30f build system fix 2016-08-03 11:47:38 +01:00
dad642ed1b various build system fixes and improvements 2016-08-03 11:39:20 +01:00
63ae39abc7 proper propagation of OpenMP flags 2016-08-02 17:41:32 +01:00
9e5b934d21 improved LAPACK configuration 2016-08-02 17:26:54 +01:00
a7b483d67a Tests in subdirectories are not built by default 2016-08-02 12:14:28 +01:00
bb99ce0680 bootstrap script fix 2016-08-01 09:51:06 +01:00
83307df1af travis update for new build system 2016-08-01 09:38:40 +01:00
49b5c49851 Checked the hermiticity of the op in derivative, ok
Still CG fails to converge
2016-07-31 12:37:33 +01:00
e9f30cab2c first working version for the new build system 2016-07-30 17:53:18 +01:00
089f0ab582 Debugged HMC for Creutz relation 2016-07-28 16:44:41 +01:00
df6c9f55d1 Use common benchmark output format for dwf_sweep and zmm 2016-07-20 17:38:56 +01:00
b93e18ed50 Modified the Dirac Kernel class to compile with different number of colours
Added the general push_back functionality to accommodate all defined representations

Compiles, not tested
2016-07-18 16:36:28 +01:00
9c77bb69a5 Added all elements for Hirep HMC
TODO: Test and debug
2016-07-18 12:05:23 +01:00
27f3ecc833 Merge branch 'feature/bugfix-ck-cj' into develop 2016-07-16 01:59:52 +01:00
f9e90eeb1f Sign error on the force for 4d fields fixed 2016-07-16 01:52:44 +01:00
fad5c675eb sign error on the 4d gparity force 2016-07-16 01:51:56 +01:00
4908b77d46 Fixed conflicts. PLEASE avoid making wholesale cosmetic only changes, this created
a HUGE amount of difficult to resolve and understand conflicts .

Wholesale formatting, reordering functions etc... in a central file like Tensor_class
or Grid_vector_types while others are also editing without making substantial functionality
changes creates pain.
2016-07-15 20:59:07 +01:00
f4dd5062d7 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2016-07-15 19:26:06 +01:00
da34d75841 Merge branch 'feature/Ls-vectorised-actions' into develop 2016-07-15 19:09:47 +01:00
980ff18956 Solving the instantiation no compile issue 2016-07-15 17:19:44 +01:00
7edf4c6c04 Added HMC utilities for the higher representations
TODO: Inherit types for the pseudofermions, Debugging, testing
2016-07-15 13:39:47 +01:00
1a6c7204ac Disable instantiation; Use cache version instead 2016-07-15 00:34:39 +01:00
49310fbab3 Done with red black change over 2016-07-15 00:08:43 +01:00
6049d5ac47 Update 2016-07-15 00:08:32 +01:00
35d0d35238 Updated file list 2016-07-15 00:02:53 +01:00
c0e878705e Updated file list 2016-07-15 00:02:39 +01:00
5c0c8efb9e Updated file list 2016-07-15 00:02:11 +01:00
dfd714e1ef Multiple implementations for the 5d hopping terms, depending on cache friendly
ops and/or the 5th direction being vectorised
All use 4d redblack.
2016-07-15 00:00:09 +01:00
79a8ca1a62 Rewrite for performance. Impl dependent instantiations give
4d linalg impls of the 5d hopping terms (and inverse)
Cache friendly loop orderings of the above
Dense matrix stored and apply to the above

-- Switch to Ls vectorised, and use dense matrix approach for the MooeeInv
   and rotate/shift of the Mooee M5D routines.
2016-07-14 23:58:15 +01:00
fb45eb2eb2 5d ls vec rename of impl class 2016-07-14 23:57:26 +01:00
a307274c96 Fermion impl rename for ls vectorised 5d approaches 2016-07-14 23:56:13 +01:00
3f2c44a5fe Updating the class to 5d selection based on impl type 2016-07-14 23:55:26 +01:00
48fb1cdc11 Update domain 5d vectorised impl type, move the type over to 4d redblack with
the dense OO inverse
2016-07-14 23:54:35 +01:00
8a79e93cc2 Rename the 5d domain wall fermion vectorised Ls impl class 2016-07-14 23:53:00 +01:00
3493b51879 Modest updates 2016-07-14 23:52:13 +01:00
de3e79d300 red black for Ls vectorised is 4d red black. Update accordingly now I've made this choice 2016-07-14 23:49:42 +01:00
dd62a61c5c Added broadcast and rotation of simd vectors 2016-07-14 23:49:00 +01:00
8f47d0b5ab Rotation needed for hopping term in fifth dim with Ls vectorised fields 2016-07-14 23:45:36 +01:00
42af132dab Fix for chris kellys request to peek poke on checkerboarded fields 2016-07-14 23:44:48 +01:00
9db2c6525d updating benchmarks for red black 4d for Ls vectorised code 2016-07-14 23:44:02 +01:00
adbc7c1188 Adding files for multiple implementations (cache opt) and Ls vectorisation
of the 5D cayley form chiral fermions for the 5d matrix. With Ls entirely
in the vector direction, s-hopping terms involve rotations.

The serial dependence of the LDU inversion for Mobius and 4d even odd
checkerboarding is removed by simply applying Ls^2 operations (vectorised
many ways) as a dense matrix operation.

This should give similar throughput but high flops (non-compulsory flops)
but enable use of the KNL cache friendly kernels throughout the code.

Ls is still constrained to be a multiple of Nsimd, which is as much as 8 for AVX512
with single precision.
2016-07-14 22:59:21 +01:00
9dc345e8e8 Debugged smearing and adding HMC functions for hirep 2016-07-13 17:51:18 +01:00
8b9301a74c Merge branch 'feature/bugfixes' into develop 2016-07-13 12:31:34 -04:00
6f47fbb1e2 Disabled parallel for loops in ExtractSlice and InsertSlice due to race conditions. Likely will need to do so for localConvert too. 2016-07-13 10:49:18 -04:00
a9ae30f868 Added representations definitions for the HMC 2016-07-12 13:36:10 +01:00
a3c0fb79b6 Fix to iVector and iMatrix pokeIndex and checkerboard local site indexing. 2016-07-11 17:15:22 -04:00
62601bb649 Bug fix 2016-07-08 20:46:29 +01:00
ef97e32152 Adding persistent communicators 2016-07-08 17:16:08 +01:00
daea5297ee Wrote the projector in the adjoint representation algebra 2016-07-08 16:14:16 +01:00
5028969d4b Added generators for the adjoint representation 2016-07-08 15:40:11 +01:00
c667d9fdcc Trying to make compile clean on travis; seem to have a make -j 4 problem with fftw 2016-07-07 23:26:39 +01:00
7dbb94bab2 Update 2016-07-07 22:51:37 +01:00
236dcc820b typo fix 2016-07-07 22:46:11 +01:00
a42a441a6a Rename the reconfigure script to ./autogen.sh 2016-07-07 22:35:45 +01:00
a0676beeb1 Open up dependency on Eigen and FFTW 2016-07-07 22:31:07 +01:00
c5106d0c03 Bugfix 2016-07-07 16:06:30 -04:00
fbf96b1bbb Merge branch 'develop' into feature/hirep 2016-07-07 14:20:10 +01:00
3c49ddfaa4 Merge branch 'temporary-smearing' into develop 2016-07-07 14:04:59 +01:00
ffb8b3116c Tested smeared RHMC Wilson1p1, accepting 2016-07-07 11:49:36 +01:00
290493e162 Merge branch 'feature/multi_prec' into develop 2016-07-06 19:29:57 -04:00
dd8cfff111 Another fix for pedantic compilers 2016-07-06 18:22:15 -04:00
184642adb0 Fix for pedantic compilers 2016-07-06 18:15:15 -04:00
4774a3bcd2 Generalized HotConfiguration and functions it calls to accept gauge fields with precision other than the default. 2016-07-06 18:01:08 -04:00
25fafa9a89 Comment 2016-07-06 16:19:41 -04:00
713520d3d2 Added tester for mixed CG 2016-07-06 16:18:19 -04:00
85ed8175cb Implemented mixed precision CG. Fixed filelist to exclude lib/Old directory and include Config.h. 2016-07-06 15:57:04 -04:00
df5c788ef2 Merge branch 'develop' into feature/multi_prec 2016-07-06 14:52:28 -04:00
15f22425c8 Added option to prevent CG from exiting when it fails to converge 2016-07-06 14:50:01 -04:00
e87182cf98 Debugged the copy constructor of the Lattice class 2016-07-06 15:31:00 +01:00
e3d5319470 Debugged the real() and imag() functions and added tests to Test_Simd 2016-07-06 14:16:03 +01:00
ffedeb1c58 Minor modifications 2016-07-06 11:41:27 +01:00
3e3b367aa9 Small changes in the Log files 2016-07-05 15:05:28 +01:00
3e80947c2b Cleaned up HMC output. Tested smeared HMCs for single precision (OK) 2016-07-05 12:03:54 +01:00
fdfbf11c6d Merge branch 'develop' into temporary-smearing 2016-07-04 18:45:10 +01:00
9cb90f714e Merge remote-tracking branch 'origin/develop' into temporary-smearing 2016-07-04 17:28:40 +01:00
6ce174cd60 Testing smearing for RHMC routines 2016-07-04 16:36:49 +01:00
17ca5240f7 Tested smeared EOWilsonRatio, accepts 2016-07-04 16:25:15 +01:00
2daffdf95d Tested smeared WilsonRatio action, accepts 2016-07-04 16:17:28 +01:00
149f826601 Tested smearing for Nf2 WilsonFermionAction, non EO: accepts 2016-07-04 16:09:19 +01:00
cd8ee27080 Simple change in iGamma for smearing 2016-07-04 16:02:57 +01:00
0fa66e8f3c Debugged smearing for EOWilson, accepts 2016-07-04 15:35:37 +01:00
8dd099267d Corrected a bug in the Expression Templates (acos and asin were wrong) 2016-07-03 12:28:25 +01:00
1a6d65c6a4 Converted set_uw and set_fj to all complex functions 2016-07-03 10:27:43 +01:00
fc4a043663 Colors and banner clean up 2016-07-02 16:15:38 +01:00
61ba50665e Merge branch 'hotfix/v0.5.1' into develop 2016-07-01 16:34:30 +01:00
bfe14000a9 Double compile fix 2016-07-01 16:33:51 +01:00
092fa0d8da Debugged set_fj,
to be fixed: BUG in imag()
2016-07-01 16:06:20 +01:00
1ceff48133 Merge branch 'release/v0.5.0' into develop 2016-06-30 15:15:59 -07:00
680645f849 Merge branch 'release/v0.5.0' 2016-06-30 15:15:03 -07:00
3fc6e03ad1 Version file 2016-06-30 14:44:09 -07:00
2d6614f3a1 Merge branch 'feature/knl-cache-opt' into develop 2016-06-30 14:36:20 -07:00
4e041b5103 Merge branch 'feature/knl-cache-opt' of https://github.com/paboyle/Grid into feature/knl-cache-opt 2016-06-30 14:36:08 -07:00
712b9a3489 Asm only for avx512 2016-06-30 14:35:02 -07:00
bdaa5b1767 Updated to have perfect prefetching for the s-vectorised kernel with any cache blocking. 2016-06-30 14:35:02 -07:00
8fcefc021a Improved the prefetching when using cache blocking codes 2016-06-30 14:35:02 -07:00
1445189361 Control the prefetch strategy 2016-06-30 14:35:02 -07:00
05c884a62a Prefetch change 2016-06-30 14:35:01 -07:00
a25bec87d9 Prefetch during save 2016-06-30 14:35:01 -07:00
2d8bb4c594 Tweaks 2016-06-30 14:35:01 -07:00
51cb2d4328 update file lists 2016-06-30 14:35:01 -07:00
6d58cb2a68 Enable reordering of the loops in the assembler for cache friendly.
This gets in the way of L2 prefetching however. Do next next link in stencil
prefetching.
2016-06-30 14:35:01 -07:00
c8b35d960c Merge branch 'develop' of https://github.com/paboyle/Grid into feature/knl-cache-opt 2016-06-30 14:30:49 -07:00
532f41dd61 Asm only for avx512 2016-06-30 14:00:34 -07:00
661b0ab45d Updated to have perfect prefetching for the s-vectorised kernel with any cache blocking. 2016-06-30 13:07:42 -07:00
565e9329ba Changed the colouring classes 2016-06-30 16:51:03 +01:00
4bc08ed995 Improved the prefetching when using cache blocking codes 2016-06-26 12:54:14 -07:00
b2933a0557 Control the prefetch strategy 2016-06-25 12:55:25 -07:00
db057cc276 Prefetch change 2016-06-25 12:54:50 -07:00
22e88eaf54 Prefetch during save 2016-06-25 12:54:14 -07:00
09fe3caebd Tweaks 2016-06-25 11:08:05 -07:00
5e02392f9c Fixed compilation error for benchmark_dwf
Some parts were assuming floating point precision
2016-06-20 12:30:51 +01:00
17a8f51a9b update file lists 2016-06-19 11:59:10 -07:00
1b7f88dd00 Enable reordering of the loops in the assembler for cache friendly.
This gets in the way of L2 prefetching however. Do next next link in stencil
prefetching.
2016-06-19 11:45:58 -07:00
d6737e4bd8 Travis fix for Linux clang builds 2016-06-14 19:15:08 +01:00
d539888e57 Merge pull request #37 from rprollins/fix/mpi_communicator
Removed write to stdout in constructor for MPI CartesianCommunicator
2016-06-14 17:25:40 +01:00
86187d7cca Removed write to stdout in constructor for MPI CartesianCommunicator 2016-06-14 15:34:20 +01:00
87418e7df1 Slightly faster prefetching perf. 2016-06-13 02:32:52 -07:00
55f65b81b5 Improvements to the assembler interface that let us move chunks of the
site and s loop into the kernels. This will save on function call overhead and
guarantee L2 prefetching strategy is right since OMP can't distribute the
sub-chunks of work.
2016-06-09 01:12:36 -07:00
d9408893b3 Prefetching in the normal kernel implementation. 2016-06-08 05:43:48 -07:00
05acc22920 placeholder for non temporal loads optimisation 2016-06-07 13:18:21 -07:00
8ac021de73 Added a test and fixed it for red black precon Ls innermost vectorised DWF 2016-06-07 13:16:56 -07:00
e503ef5590 Cleaned up 2016-06-07 00:11:36 +01:00
a7682b0060 Only instantiate the one routine to avoid duplicate symbol under g++5/MacOS 2016-06-06 23:48:21 +01:00
d4c9d71fc8 Merge branch 'master' of https://github.com/paboyle/Grid 2016-06-06 07:06:54 -07:00
786ca52c43 Problems remain in the red black preconditioning of the Ls vectorisation 2016-06-06 07:05:51 -07:00
048ac04abc Update Benchmark_dwf.cc 2016-06-03 13:44:41 +01:00
f78d89bcbe Update Lebesgue.cc
kill verbose
2016-06-03 13:33:42 +01:00
53d06046b0 Compiling updates for KNL 2016-06-03 03:47:54 -07:00
5d3a1a025d timers flag 2016-06-03 03:25:38 -07:00
139cc5f1ae Large change with KNL preparation 2016-06-03 03:24:26 -07:00
1c0e922585 Merge pull request #35 from aportelli/master
empty SIMD fix
2016-05-27 16:49:13 +01:00
9d5f693cbe empty SIMD fix 2016-05-24 10:56:27 +01:00
neo
339be37dba Debugging smeared HMC 2016-04-13 17:00:14 +09:00
neo
a87b744621 HMC runs but does not accept with smearing on 2016-04-07 16:45:11 +09:00
97d0d56bcb Debugging Smearing routines (set_fj) 2016-04-06 17:58:43 +09:00
7c7ea35ffb Putting the Traceless Antihermitian part outside the deriv in pseudofermion actions 2016-04-05 16:28:09 +09:00
4b1cf580e0 Debugging the Smearing routines 2016-04-05 16:19:30 +09:00
2d8bb356e3 Smearing routines compile (still untested) 2016-02-25 02:43:59 +09:00
a7251f28c7 Stout smearing compiles (untested) 2016-02-24 03:16:50 +09:00
neo
c1b1b89d17 More on smearing routines, writing APEsmear (dev) 2016-02-19 17:15:27 +09:00
neo
771235017d Adding smearing routines (development) 2016-02-19 15:30:41 +09:00
275 changed files with 19335 additions and 9267 deletions

30
.gitignore vendored
View File

@ -5,7 +5,6 @@
*.o *.o
*.obj *.obj
# Editor files # # Editor files #
################ ################
*~ *~
@ -48,6 +47,7 @@ Config.h.in
config.log config.log
config.status config.status
.deps .deps
*.inc
# http://www.gnu.org/software/autoconf # # http://www.gnu.org/software/autoconf #
######################################## ########################################
@ -62,19 +62,8 @@ stamp-h1
config.sub config.sub
config.guess config.guess
INSTALL INSTALL
.dirstamp
# Packages # ltmain.sh
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
# Logs and databases # # Logs and databases #
###################### ######################
@ -100,3 +89,16 @@ build*/*
##################### #####################
*.xcodeproj/* *.xcodeproj/*
build.sh build.sh
# Eigen source #
################
lib/Eigen/*
# FFTW source #
################
lib/fftw/*
# libtool macros #
##################
m4/lt*
m4/libtool.m4

View File

@ -1,14 +1,14 @@
language: cpp language: cpp
cache:
directories:
- clang
matrix: matrix:
include: include:
- os: osx - os: osx
osx_image: xcode7.2 osx_image: xcode7.2
compiler: clang compiler: clang
- os: osx
osx_image: xcode7.2
compiler: gcc
env: VERSION=-5
- compiler: gcc - compiler: gcc
addons: addons:
apt: apt:
@ -19,6 +19,8 @@ matrix:
- libmpfr-dev - libmpfr-dev
- libgmp-dev - libgmp-dev
- libmpc-dev - libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev - binutils-dev
env: VERSION=-4.9 env: VERSION=-4.9
- compiler: gcc - compiler: gcc
@ -31,6 +33,8 @@ matrix:
- libmpfr-dev - libmpfr-dev
- libgmp-dev - libgmp-dev
- libmpc-dev - libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev - binutils-dev
env: VERSION=-5 env: VERSION=-5
- compiler: clang - compiler: clang
@ -38,42 +42,65 @@ matrix:
apt: apt:
sources: sources:
- ubuntu-toolchain-r-test - ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.7
packages: packages:
- clang-3.7 - g++-4.8
- libmpfr-dev - libmpfr-dev
- libgmp-dev - libgmp-dev
- libmpc-dev - libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev - binutils-dev
env: VERSION=-3.7 env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- compiler: clang - compiler: clang
addons: addons:
apt: apt:
sources: sources:
- ubuntu-toolchain-r-test - ubuntu-toolchain-r-test
- llvm-toolchain-precise-3.8
packages: packages:
- clang-3.8 - g++-4.8
- libmpfr-dev - libmpfr-dev
- libgmp-dev - libgmp-dev
- libmpc-dev - libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev - binutils-dev
env: VERSION=-3.8 env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
before_install: before_install:
- export GRIDDIR=`pwd`
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
install: install:
- export CC=$CC$VERSION - export CC=$CC$VERSION
- export CXX=$CXX$VERSION - export CXX=$CXX$VERSION
- echo $PATH
- which $CC
- $CC --version
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
script: script:
- ./scripts/reconfigure_script - ./bootstrap.sh
- mkdir build - mkdir build
- cd build - cd build
- ../configure CXXFLAGS="-msse4.2 -O3 -std=c++11" LIBS="-lmpfr -lgmp" --enable-precision=single --enable-simd=SSE4 --enable-comms=none - ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
- make -j4 - make -j4
- ./benchmarks/Benchmark_dwf --threads 1 - ./benchmarks/Benchmark_dwf --threads 1
- echo make clean
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1
- echo make clean
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
- make -j4
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi

View File

@ -1,5 +1,5 @@
# additional include paths necessary to compile the C++ library # additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/ SUBDIRS = lib benchmarks tests
SUBDIRS = lib tests benchmarks
filelist: $(SUBDIRS) AM_CXXFLAGS += -I$(top_builddir)/include
ACLOCAL_AMFLAGS = -I m4

104
README.md
View File

@ -1,8 +1,28 @@
# Grid [![Build Status](https://travis-ci.org/paboyle/Grid.svg?branch=master)](https://travis-ci.org/paboyle/Grid) # Grid
Data parallel C++ mathematical object library <table>
<tr>
<td>Last stable release</td>
<td><a href="https://travis-ci.org/paboyle/Grid">
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
</td>
</tr>
<tr>
<td>Development branch</td>
<td><a href="https://travis-ci.org/paboyle/Grid">
<img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
</td>
</tr>
</table>
Last update 2015/7/30 **Data parallel C++ mathematical object library.**
Please send all pull requests to the `develop` branch.
License: GPL v2.
Last update 2016/08/03.
### Description
This library provides data parallel C++ container classes with internal memory layout This library provides data parallel C++ container classes with internal memory layout
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
are provided, similar to HPF and cmfortran, and user control is given over the mapping of are provided, similar to HPF and cmfortran, and user control is given over the mapping of
@ -22,37 +42,75 @@ optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a signifi
for most programmers. for most programmers.
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture. The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported (ARM NEON on the way). Presently SSE4 (128 bit) AVX, AVX2 (256 bit) and IMCI and AVX512 (512 bit) targets are supported (ARM NEON and BG/Q QPX on the way).
These are presented as These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
vRealF, vRealD, vComplexF, vComplexD
internal vector data types. These may be useful in themselves for other programmers.
The corresponding scalar types are named
RealF, RealD, ComplexF, ComplexD
MPI, OpenMP, and SIMD parallelism are present in the library. MPI, OpenMP, and SIMD parallelism are present in the library.
Please see https://arxiv.org/abs/1512.03487 for more detail.
You can give `configure' initial values for configuration parameters ### Installation
by setting variables in the command line or in the environment. Here First, start by cloning the repository:
are examples:
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -msse4" --enable-simd=SSE4 ``` bash
git clone https://github.com/paboyle/Grid.git
```
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx" --enable-simd=AVX Then enter the cloned directory and set up the build system:
./configure CXX=clang++ CXXFLAGS="-std=c++11 -O3 -mavx2" --enable-simd=AVX2 ``` bash
cd Grid
./bootstrap.sh
```
./configure CXX=icpc CXXFLAGS="-std=c++11 -O3 -mmic" --enable-simd=AVX512 --host=none Now you can execute the `configure` script to generate makefiles (here from a build directory):
Note: Before running configure it could be necessary to execute the script ``` bash
mkdir build; cd build
../configure --enable-precision=double --enable-simd=AVX --enable-comms=mpi-auto --prefix=<path>
```
script/filelist where `--enable-precision=` set the default precision (`single` or `double`),
`--enable-simd=` set the SIMD type (see possible values below), `--enable-
comms=` set the protocol used for communications (`none`, `mpi`, `mpi-auto` or
`shmem`), and `<path>` should be replaced by the prefix path where you want to
install Grid. The `mpi-auto` communication option lets `configure` determine
automatically how to link to MPI. Other options are available; use `configure
--help` to display them. As with any other program using GNU autotools, the
`CXX`, `CXXFLAGS`, `LDFLAGS`, ... environment variables can be modified to
customise the build.
Finally, you can build and install Grid:
``` bash
make; make install
```
For developers: To minimise the build time, only the tests at the root of the `tests` directory are built by default. If you want to build tests in the sub-directory `<subdir>` you can execute:
Use reconfigure_script in the scripts/ directory to create the autotools environment
``` bash
make -C tests/<subdir> tests
```
### Possible SIMD types
The following options can be used with the `--enable-simd=` option to target different SIMD instruction sets:
| String | Description |
| ----------- | -------------------------------------- |
| `GEN` | generic portable vector code |
| `SSE4` | SSE 4.2 (128 bit) |
| `AVX` | AVX (256 bit) |
| `AVXFMA4` | AVX (256 bit) + FMA |
| `AVX2` | AVX 2 (256 bit) |
| `AVX512` | AVX 512 bit |
| `AVX512MIC` | AVX 512 bit for Intel MIC architecture |
| `ICMI` | Intel ICMI instructions (512 bit) |
Alternatively, some CPU codenames can be directly used:
| String | Description |
| ----------- | -------------------------------------- |
| `KNC` | [Intel Knights Corner](http://ark.intel.com/products/codename/57721/Knights-Corner) |
| `KNL` | [Intel Knights Landing](http://ark.intel.com/products/codename/48999/Knights-Landing) |

4
VERSION Normal file
View File

@ -0,0 +1,4 @@
Version : 0.5.0
- AVX512, AVX2, AVX, SSE good
- Clang 3.5 and above, ICPC v16 and above, GCC 4.9 and above

View File

@ -25,7 +25,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid/Grid.h>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
@ -194,7 +194,128 @@ int main (int argc, char ** argv)
} }
} }
#if 0
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
for(int lat=4;lat<=32;lat+=2){
for(int Ls=1;Ls<=16;Ls*=2){
std::vector<int> latt_size ({lat,lat,lat,lat});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
int ncomm;
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
std::vector<CartesianCommunicator::CommsRequest_t> empty;
std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
for(int mu=0;mu<4;mu++){
ncomm=0;
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc;
int xmit_to_rank;
int recv_from_rank;
comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromInit(requests_fwd[mu],
(void *)&xbuf[mu][0],
xmit_to_rank,
(void *)&rbuf[mu][0],
recv_from_rank,
bytes);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.SendToRecvFromInit(requests_bwd[mu],
(void *)&xbuf[mu+4][0],
xmit_to_rank,
(void *)&rbuf[mu+4][0],
recv_from_rank,
bytes);
}
}
{
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
Grid.SendToRecvFromBegin(requests_fwd[mu]);
Grid.SendToRecvFromComplete(requests_fwd[mu]);
Grid.SendToRecvFromBegin(requests_bwd[mu]);
Grid.SendToRecvFromComplete(requests_bwd[mu]);
}
}
Grid.Barrier();
}
double stop=usecond();
double dbytes = bytes;
double xbytes = Nloop*dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
double time = stop-start;
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
}
{
double start=usecond();
for(int i=0;i<Nloop;i++){
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
Grid.SendToRecvFromBegin(requests_fwd[mu]);
Grid.SendToRecvFromBegin(requests_bwd[mu]);
Grid.SendToRecvFromComplete(requests_fwd[mu]);
Grid.SendToRecvFromComplete(requests_bwd[mu]);
}
}
Grid.Barrier();
}
double stop=usecond();
double dbytes = bytes;
double xbytes = Nloop*dbytes*2.0*ncomm;
double rbytes = xbytes;
double bidibytes = xbytes+rbytes;
double time = stop-start;
std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
}
}
}
#endif
Grid_finalize(); Grid_finalize();
} }

View File

@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid/Grid.h>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
@ -45,6 +45,10 @@ struct scal {
}; };
bool overlapComms = false; bool overlapComms = false;
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
@ -58,12 +62,18 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt(); std::vector<int> latt4 = GridDefaultLatt();
const int Ls=8; const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4}); std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8}); std::vector<int> seeds5({5,6,7,8});
@ -76,9 +86,9 @@ int main (int argc, char ** argv)
LatticeFermion tmp(FGrid); LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid); LatticeFermion err(FGrid);
ColourMatrix cm = Complex(1.0,0.0); LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
LatticeGaugeField Umu(UGrid); random(RNG4,Umu);
LatticeGaugeField Umu5d(FGrid); LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension // replicate across fifth dimension
@ -114,19 +124,25 @@ int main (int argc, char ** argv)
RealD mass=0.1; RealD mass=0.1;
RealD M5 =1.8; RealD M5 =1.8;
typename DomainWallFermionR::ImplParams params;
params.overlapCommsCompute = overlapComms;
RealD NP = UGrid->_Nprocessors; RealD NP = UGrid->_Nprocessors;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); for(int doasm=1;doasm<2;doasm++){
QCD::WilsonKernelsStatic::AsmOpt=doasm;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
std::cout << GridLogMessage<< "Calling Dw"<<std::endl; std::cout << GridLogMessage<< "Calling Dw"<<std::endl;
int ncall =100; int ncall =100;
{ if (1) {
Dw.ZeroCounters();
double t0=usecond(); double t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
__SSC_START;
Dw.Dhop(src,result,0); Dw.Dhop(src,result,0);
__SSC_STOP;
} }
double t1=usecond(); double t1=usecond();
@ -137,13 +153,140 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result; err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert (norm2(err)< 1.0e-5 );
Dw.Report(); Dw.Report();
} }
exit(0); if (1)
{
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
LatticeFermion ssrc(sFGrid);
LatticeFermion sref(sFGrid);
LatticeFermion sresult(sFGrid);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVector tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
std::cout<<GridLogMessage<< "src norms "<< norm2(src)<<" " <<norm2(ssrc)<<std::endl;
double t0=usecond();
sDw.ZeroCounters();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.Dhop(ssrc,sresult,0);
__SSC_STOP;
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
sDw.Report();
if(0){
for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
sDw.Dhop(ssrc,sresult,0);
PerformanceCounter Counter(i);
Counter.Start();
sDw.Dhop(ssrc,sresult,0);
Counter.Stop();
Counter.Report();
}
}
std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
RealD sum=0;
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVector normal, simd;
peekSite(normal,result,site);
peekSite(simd,sresult,site);
sum=sum+norm2(normal-simd);
if (norm2(normal-simd) > 1.0e-6 ) {
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" "<<norm2(normal-simd)<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" normal "<<normal<<std::endl;
std::cout << "site "<<x<<","<<y<<","<<z<<","<<t<<","<<s<<" simd "<<simd<<std::endl;
}
}}}}}
std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
assert (sum< 1.0e-5 );
if (1) {
LatticeFermion sr_eo(sFGrid);
LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid);
LatticeFermion sr_e (sFrbGrid);
LatticeFermion sr_o (sFrbGrid);
pickCheckerboard(Even,ssrc_e,ssrc);
pickCheckerboard(Odd,ssrc_o,ssrc);
setCheckerboard(sr_eo,ssrc_o);
setCheckerboard(sr_eo,ssrc_e);
sr_e = zero;
sr_o = zero;
sDw.ZeroCounters();
sDw.stat.init("DhopEO");
double t0=usecond();
for (int i = 0; i < ncall; i++) {
sDw.DhopEO(ssrc_o, sr_e, DaggerNo);
}
double t1=usecond();
sDw.stat.print();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "sDeo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "sDeo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
sDw.Report();
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
sDw.DhopOE(ssrc_e,sr_o,DaggerNo);
sDw.Dhop (ssrc ,sresult,DaggerNo);
pickCheckerboard(Even,ssrc_e,sresult);
pickCheckerboard(Odd ,ssrc_o,sresult);
ssrc_e = ssrc_e - sr_e;
RealD error = norm2(ssrc_e);
std::cout<<GridLogMessage << "sE norm diff "<< norm2(ssrc_e)<< " vec nrm"<<norm2(sr_e) <<std::endl;
ssrc_o = ssrc_o - sr_o;
error+= norm2(ssrc_o);
std::cout<<GridLogMessage << "sO norm diff "<< norm2(ssrc_o)<< " vec nrm"<<norm2(sr_o) <<std::endl;
if(error>1.0e-5) {
setCheckerboard(ssrc,ssrc_o);
setCheckerboard(ssrc,ssrc_e);
std::cout<< ssrc << std::endl;
}
}
}
if (1) if (1)
{ // Naive wilson dag implementation { // Naive wilson dag implementation
@ -165,12 +308,13 @@ int main (int argc, char ** argv)
ref = -0.5*ref; ref = -0.5*ref;
} }
Dw.Dhop(src,result,1); Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl; std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
err = ref-result; err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert(norm2(err)<1.0e-5);
LatticeFermion src_e (FrbGrid); LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid); LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid); LatticeFermion r_e (FrbGrid);
@ -186,6 +330,7 @@ int main (int argc, char ** argv)
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl; std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
{ {
Dw.ZeroCounters();
double t0=usecond(); double t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo); Dw.DhopEO(src_o,r_e,DaggerNo);
@ -196,7 +341,8 @@ int main (int argc, char ** argv)
double flops=(1344.0*volume*ncall)/2; double flops=(1344.0*volume*ncall)/2;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
Dw.Report();
} }
Dw.DhopEO(src_o,r_e,DaggerNo); Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo); Dw.DhopOE(src_e,r_o,DaggerNo);
@ -211,11 +357,17 @@ int main (int argc, char ** argv)
err = r_eo-result; err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl; std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert(norm2(err)<1.0e-5);
pickCheckerboard(Even,src_e,err); pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err); pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl; std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl; std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-5);
assert(norm2(src_o)<1.0e-5);
}
Grid_finalize(); Grid_finalize();
} }

View File

@ -0,0 +1,153 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
// Minimal aggregate holding a single value of type T in its `internal` member.
template<typename T>
struct scal {
  T internal;
};
// Gamma-matrix selectors in the order X, Y, Z, T. main() indexes this table by
// the direction mu when it builds the naive Wilson reference result.
Gamma::GammaMatrix Gmu [] = {
Gamma::GammaX,
Gamma::GammaY,
Gamma::GammaZ,
Gamma::GammaT
};
// Set to true by main() when the --asynch command-line flag is present; it is
// then copied into the fermion operator's params.overlapCommsCompute.
bool overlapComms = false;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
overlapComms = true;
}
int threads = GridThread::GetThreads();
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
std::vector<int> latt4 = GridDefaultLatt();
const int Ls=16;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src (FGrid); random(RNG5,src);
LatticeFermion result(FGrid); result=zero;
LatticeFermion ref(FGrid); ref=zero;
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
if (1)
{
ref = zero;
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
typename DomainWallFermionR::ImplParams params;
params.overlapCommsCompute = overlapComms;
RealD NP = UGrid->_Nprocessors;
QCD::WilsonKernelsStatic::AsmOpt=1;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall =50;
if (1) {
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
// Dw.Report();
}
Grid_finalize();
}

View File

@ -0,0 +1,364 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
// Minimal aggregate holding a single value of type T in its `internal` member.
template<typename T>
struct scal {
  T internal;
};
// Gamma-matrix selectors in the order X, Y, Z, T. In this file the table is
// only referenced by the CHECK-guarded reference computation inside benchDw.
Gamma::GammaMatrix Gmu [] = {
Gamma::GammaX,
Gamma::GammaY,
Gamma::GammaZ,
Gamma::GammaT
};
// Forward declarations of the two benchmark kernels defined after main().
// L is the 4-d lattice extent per dimension, Ls the fifth-dimension extent.
// report == 0 prints throughput columns; a non-zero report prints the
// PerformanceCounter report instead.
void benchDw(std::vector<int> & L, int Ls, int threads, int report =0 );
void benchsDw(std::vector<int> & L, int Ls, int threads, int report=0 );
// Volume sweep driver for the DWF benchmarks.
//
// Environment variables:
//   ASMOPT - if set (any value), WilsonKernelsStatic::AsmOpt is 1, else 0.
//   LMAX   - largest starting extent L of the sweep (default 32).
//   DMIN   - lower bound of the dimension-doubling loop (default 0); values
//            below -1 are clamped to -1, see below.
//
// For every L = 8, 16, ... <= LMAX the lattice starts as L^4 and then has its
// dimensions 3, 2, 1, ... doubled one at a time; each resulting volume is
// passed to benchDw and benchsDw, which print one table row between them.
// Finally both kernels are rerun on 16^4 with report=1.
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  const int Ls=8;
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  if ( getenv("ASMOPT") ) {
    QCD::WilsonKernelsStatic::AsmOpt=1;
  } else {
    QCD::WilsonKernelsStatic::AsmOpt=0;
  }

  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\tProcs \t Dw \t eoDw \t sDw \t eosDw (Mflop/s) "<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;

  int Lmax=32;
  int dmin=0;
  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
  // The sweep indexes latt4[d] for every d in (dmin,3]. d == 0 is the last
  // valid index, so anything below -1 would read and write latt4[-1], ...
  if ( dmin < -1 ) dmin = -1;

  for (int L=8;L<=Lmax;L*=2){
    std::vector<int> latt4(4,L);
    for(int d=4;d>dmin;d--){
      // d == 4 benchmarks the plain L^4 volume; each later pass doubles one
      // more dimension and keeps the earlier doublings.
      if ( d<=3 ) latt4[d]*=2;

      std::cout << GridLogMessage <<"\t";
      for(int mu=0;mu<Nd;mu++){
        std::cout<<latt4[mu]<<"x";
      }
      std::cout <<Ls<<"\t" ;
      benchDw (latt4,Ls,threads,0);
      benchsDw(latt4,Ls,threads,0);
      std::cout<<std::endl;
    }
  }
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;

  {
    std::vector<int> latt4(4,16);
    std::cout<<GridLogMessage << "16^4 Dw miss rate"<<std::endl;
    benchDw (latt4,Ls,threads,1);
    std::cout<<GridLogMessage << "16^4 sDw miss rate"<<std::endl;
    benchsDw(latt4,Ls,threads,1);
  }

  Grid_finalize();
}
#undef CHECK
void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
{
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#ifdef CHECK
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src (FGrid); random(RNG5,src);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
#else
LatticeFermion src (FGrid); src=zero;
LatticeGaugeField Umu(UGrid); Umu=zero;
#endif
LatticeFermion result(FGrid); result=zero;
LatticeFermion ref(FGrid); ref=zero;
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<LatticeColourMatrix> U(4,FGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
}
#ifdef CHECK
if (1) {
ref = zero;
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
#endif
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
double t0=usecond();
Dw.Dhop(src,result,0);
double t1=usecond();
#ifdef TIMERS_OFF
int ncall =10;
#else
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif
if (ncall < 5 ) exit(0);
Dw.Dhop(src,result,0);
PerformanceCounter Counter(8);
Counter.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
t1=usecond();
Counter.Stop();
if ( report ) {
Counter.Report();
}
if ( ! report ) {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout <<"\t"<<NP<< "\t"<<flops/(t1-t0)<< "\t";
}
#ifdef CHECK
err = ref-result;
RealD errd = norm2(err);
if ( errd> 1.0e-4 ) {
std::cout<<GridLogMessage << "oops !!! norm diff "<< norm2(err)<<std::endl;
exit(-1);
}
#endif
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
{
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
if(!report){
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<< flops/(t1-t0);
}
}
}
#define CHECK_SDW
void benchsDw(std::vector<int> & latt4, int Ls, int threads, int report )
{
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#ifdef CHECK_SDW
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeFermion src (FGrid); random(RNG5,src);
LatticeGaugeField Umu(UGrid);
random(RNG4,Umu);
#else
LatticeFermion src (FGrid); src=zero;
LatticeGaugeField Umu(UGrid); Umu=zero;
#endif
LatticeFermion result(FGrid); result=zero;
LatticeFermion ref(FGrid); ref=zero;
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
ColourMatrix cm = Complex(1.0,0.0);
LatticeGaugeField Umu5d(FGrid);
// replicate across fifth dimension
for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
}
}
RealD mass=0.1;
RealD M5 =1.8;
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
LatticeFermion ssrc(sFGrid);
LatticeFermion sref(sFGrid);
LatticeFermion sresult(sFGrid);
WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
for(int x=0;x<latt4[0];x++){
for(int y=0;y<latt4[1];y++){
for(int z=0;z<latt4[2];z++){
for(int t=0;t<latt4[3];t++){
for(int s=0;s<Ls;s++){
std::vector<int> site({s,x,y,z,t});
SpinColourVector tmp;
peekSite(tmp,src,site);
pokeSite(tmp,ssrc,site);
}}}}}
double t0=usecond();
sDw.Dhop(ssrc,sresult,0);
double t1=usecond();
#ifdef TIMERS_OFF
int ncall =10;
#else
int ncall =1+(int) ((5.0*1000*1000)/(t1-t0));
#endif
PerformanceCounter Counter(8);
Counter.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
sDw.Dhop(ssrc,sresult,0);
}
t1=usecond();
Counter.Stop();
if ( report ) {
Counter.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=1344*volume*ncall;
std::cout<<"\t"<< flops/(t1-t0);
}
LatticeFermion sr_eo(sFGrid);
LatticeFermion serr(sFGrid);
LatticeFermion ssrc_e (sFrbGrid);
LatticeFermion ssrc_o (sFrbGrid);
LatticeFermion sr_e (sFrbGrid);
LatticeFermion sr_o (sFrbGrid);
pickCheckerboard(Even,ssrc_e,ssrc);
pickCheckerboard(Odd,ssrc_o,ssrc);
setCheckerboard(sr_eo,ssrc_o);
setCheckerboard(sr_eo,ssrc_e);
sr_e = zero;
sr_o = zero;
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
PerformanceCounter CounterSdw(8);
CounterSdw.Start();
t0=usecond();
for(int i=0;i<ncall;i++){
__SSC_START;
sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
__SSC_STOP;
}
t1=usecond();
CounterSdw.Stop();
if ( report ) {
CounterSdw.Report();
} else {
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(1344.0*volume*ncall)/2;
std::cout<<"\t"<< flops/(t1-t0);
}
}

View File

@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid/Grid.h>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;

View File

@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid/Grid.h>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;

View File

@ -26,7 +26,7 @@ Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid/Grid.h>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;

View File

@ -26,7 +26,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid/Grid.h>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;

View File

@ -0,0 +1,117 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_wilson.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Richard Rollins <rprollins@users.noreply.github.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;
// Minimal aggregate wrapping a single value of the template type.
// Nothing in the visible part of this benchmark refers to it.
template<typename T>
struct scal
{
  T internal; // the wrapped value
};
// Gamma-matrix selectors listed in the order X, Y, Z, T.
// NOTE(review): not referenced anywhere in the visible part of this file.
Gamma::GammaMatrix Gmu[4] = { Gamma::GammaX, Gamma::GammaY, Gamma::GammaZ, Gamma::GammaT };
// Set to true in main() when "--asynch" is present on the command line; main()
// copies it into the Wilson operator's ImplParams::overlapCommsCompute.
bool overlapComms = false;
// Forward declaration; the definition follows main().
// Calls Dw.Dhop(src,result,dag) a fixed number of times and prints the measured
// rate (1344 * volume * calls / elapsed usecond() ticks) to std::cout.
//   src    - input fermion field handed to Dhop
//   result - output fermion field written by Dhop
//   Dw     - Wilson operator under test
//   volume - number of lattice sites, used only for the flop estimate
//   dag    - dagger selector forwarded to Dhop (main passes DaggerNo / DaggerYes)
void bench_wilson (
LatticeFermion & src,
LatticeFermion & result,
WilsonFermionR & Dw,
double const volume,
int const dag );
// Benchmark driver for the Wilson Dhop kernel.
//
// For every L = 8, 16, ... up to Lmax the lattice starts as L^4; on each pass of
// the inner loop with d <= 3, dimension d is doubled, so a sequence of
// progressively larger volumes is timed per L.  Each volume is benchmarked with
// DaggerNo and DaggerYes and the two rates are printed on one line.
//
// Command line : --asynch  sets params.overlapCommsCompute
// Environment  : LMAX      largest base lattice extent (default 32)
//                DMIN      inner loop runs d = 4 .. DMIN+1 (default 0)
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){ overlapComms = true; }

  typename WilsonFermionR::ImplParams params;
  params.overlapCommsCompute = overlapComms;

  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  std::vector<int> seeds({1,2,3,4});
  RealD mass = 0.1;

  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  std::cout<<GridLogMessage << "Volume\t\t\tWilson/MFLOPs\tWilsonDag/MFLOPs" << std::endl;
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;

  int Lmax = 32;
  int dmin = 0;
  if ( getenv("LMAX") ) Lmax=atoi(getenv("LMAX"));
  if ( getenv("DMIN") ) dmin=atoi(getenv("DMIN"));
  // d is used as an index into the 4-entry latt_size below and the loop stops
  // at d == dmin+1, so anything below -1 would index out of bounds.
  if ( dmin < -1 ) dmin = -1;

  for (int L=8; L<=Lmax; L*=2)
  {
    std::vector<int> latt_size = std::vector<int>(4,L);
    for(int d=4; d>dmin; d--)
    {
      // First pass (d==4) benchmarks the plain L^4 lattice; later passes
      // double one more dimension each time.
      if ( d<=3 ) { latt_size[d] *= 2; }

      // Print the extents as "AxBxCxD".
      std::cout << GridLogMessage;
      std::copy( latt_size.begin(), latt_size.end()-1, std::ostream_iterator<int>( std::cout, "x" ) );
      std::cout << latt_size.back() << "\t\t";

      GridCartesian         Grid(latt_size,simd_layout,mpi_layout);
      GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);

      GridParallelRNG   pRNG(&Grid);   pRNG.SeedFixedIntegers(seeds);
      LatticeGaugeField Umu(&Grid);    random(pRNG,Umu);
      LatticeFermion    src(&Grid);    random(pRNG,src);
      LatticeFermion    result(&Grid); result=zero;

      // Accumulate the site count in double: with an int seed and
      // std::multiplies<int> the product overflows once it exceeds INT_MAX,
      // which a large LMAX can reach.
      double volume = std::accumulate(latt_size.begin(),latt_size.end(),1.0,std::multiplies<double>());

      WilsonFermionR Dw(Umu,Grid,RBGrid,mass,params);

      bench_wilson(src,result,Dw,volume,DaggerNo);
      bench_wilson(src,result,Dw,volume,DaggerYes);
      std::cout << std::endl;
    }
  }

  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  Grid_finalize();
}
// Time a fixed number of Dhop applications and print the resulting rate.
//
//   src    - input fermion field
//   result - field overwritten by every Dhop call
//   Dw     - Wilson operator whose Dhop is timed
//   volume - number of lattice sites; enters only the flop estimate
//   dag    - dagger selector forwarded unchanged to Dhop
//
// Prints 1344 * volume * calls divided by the elapsed usecond() interval,
// followed by two tabs and no newline (the caller terminates the row).
void bench_wilson (
  LatticeFermion & src,
  LatticeFermion & result,
  WilsonFermionR & Dw,
  double const volume,
  int const dag )
{
  const int iterations = 1000;

  const double start = usecond();
  int remaining = iterations;
  while ( remaining-- > 0 ) {
    Dw.Dhop(src,result,dag);
  }
  const double stop = usecond();

  // 1344 is the per-site, per-call flop count this formula assumes.
  const double flops   = 1344 * volume * iterations;
  const double elapsed = stop - start;
  std::cout << flops/elapsed << "\t\t";
}

View File

@ -25,8 +25,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid/Grid.h>
#include <PerfCount.h>
using namespace Grid; using namespace Grid;
@ -41,14 +40,20 @@ int main(int argc,char **argv)
std::ofstream os("zmm.dat"); std::ofstream os("zmm.dat");
os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl; os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
std::cout<<GridLogMessage << "====================================================================="<<std::endl;
for(int L=4;L<=32;L+=4){ for(int L=4;L<=32;L+=4){
for(int m=1;m<=2;m++){ for(int m=1;m<=2;m++){
for(int Ls=8;Ls<=16;Ls+=8){ for(int Ls=8;Ls<=16;Ls+=8){
std::vector<int> grid({L,L,m*L,m*L}); std::vector<int> grid({L,L,m*L,m*L});
std::cout << GridLogMessage <<"\t";
for(int i=0;i<4;i++) { for(int i=0;i<4;i++) {
std::cout << grid[i]<<"x"; std::cout << grid[i]<<"x";
} }
std::cout << Ls<<std::endl; std::cout << Ls<<"\t\t";
bench(os,grid,Ls); bench(os,grid,Ls);
} }
} }
@ -105,7 +110,6 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
RealD M5 =1.8; RealD M5 =1.8;
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
int ncall=50; int ncall=50;
double t0=usecond(); double t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
@ -117,16 +121,16 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
double flops=1344*volume/2; double flops=1344*volume/2;
mfc = flops*ncall/(t1-t0); mfc = flops*ncall/(t1-t0);
std::cout<<GridLogMessage << "Called C++ Dw"<< " mflop/s = "<< mfc<<std::endl; std::cout<<mfc<<"\t\t";
QCD::WilsonFermion5DStatic::AsmOptDslash=1; QCD::WilsonKernelsStatic::AsmOpt=1;
t0=usecond(); t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
Dw.DhopOE(srce,resulta,0); Dw.DhopOE(srce,resulta,0);
} }
t1=usecond(); t1=usecond();
mfa = flops*ncall/(t1-t0); mfa = flops*ncall/(t1-t0);
std::cout<<GridLogMessage << "Called ASM Dw"<< " mflop/s = "<< mfa<<std::endl; std::cout<<mfa<<"\t\t";
/* /*
int dag=DaggerNo; int dag=DaggerNo;
t0=usecond(); t0=usecond();
@ -164,8 +168,7 @@ int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
//resulta = (-0.5) * resulta; //resulta = (-0.5) * resulta;
diff = resulto-resulta; diff = resulto-resulta;
std::cout<<GridLogMessage << "diff "<< norm2(diff)<<std::endl; std::cout<<norm2(diff)<<std::endl;
std::cout<<std::endl;
return 0; return 0;
} }

View File

@ -1,31 +0,0 @@
bin_PROGRAMS = Benchmark_comms Benchmark_dwf Benchmark_memory_asynch Benchmark_memory_bandwidth Benchmark_su3 Benchmark_wilson Benchmark_zmm
Benchmark_comms_SOURCES=Benchmark_comms.cc
Benchmark_comms_LDADD=-lGrid
Benchmark_dwf_SOURCES=Benchmark_dwf.cc
Benchmark_dwf_LDADD=-lGrid
Benchmark_memory_asynch_SOURCES=Benchmark_memory_asynch.cc
Benchmark_memory_asynch_LDADD=-lGrid
Benchmark_memory_bandwidth_SOURCES=Benchmark_memory_bandwidth.cc
Benchmark_memory_bandwidth_LDADD=-lGrid
Benchmark_su3_SOURCES=Benchmark_su3.cc
Benchmark_su3_LDADD=-lGrid
Benchmark_wilson_SOURCES=Benchmark_wilson.cc
Benchmark_wilson_LDADD=-lGrid
Benchmark_zmm_SOURCES=Benchmark_zmm.cc
Benchmark_zmm_LDADD=-lGrid

View File

@ -1,8 +1 @@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/lib
AM_LDFLAGS = -L$(top_builddir)/lib
#
# Test code
#
include Make.inc include Make.inc

19
bootstrap.sh Executable file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env bash
#
# Prepare a fresh checkout for building:
#   1. download the Eigen archive and pass it to scripts/update_eigen.sh
#   2. download the FFTW archive and pass it to scripts/update_fftw.sh
#   3. regenerate the Make.inc file lists via scripts/filelist
#   4. generate the configure script with autoreconf
#
# Must be run from the top-level source directory (all paths are relative).

EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz

# Local file names wget will create for the two archives.
EIGEN_ARCHIVE="$(basename "${EIGEN_URL}")"
FFTW_ARCHIVE="$(basename "${FFTW_URL}")"

echo "-- deploying Eigen source..."
# NOTE(review): --no-check-certificate disables TLS certificate verification
# for this download; kept for compatibility, but consider dropping it.
# Stop here on a failed download instead of running the later steps against a
# missing archive.
wget "${EIGEN_URL}" --no-check-certificate || { echo "error: failed to download ${EIGEN_URL}" >&2; exit 1; }
./scripts/update_eigen.sh "${EIGEN_ARCHIVE}"
rm "${EIGEN_ARCHIVE}"

echo "-- copying fftw prototypes..."
wget "${FFTW_URL}" || { echo "error: failed to download ${FFTW_URL}" >&2; exit 1; }
./scripts/update_fftw.sh "${FFTW_ARCHIVE}"
rm "${FFTW_ARCHIVE}"

echo '-- generating Make.inc files...'
./scripts/filelist
echo '-- generating configure script...'
autoreconf -fvi

View File

@ -1,193 +1,269 @@
# -*- Autoconf -*-
# Process this file with autoconf to produce a configure script.
#
# Project Grid package
#
# Time-stamp: <2015-07-10 17:46:21 neo>
AC_PREREQ([2.63]) AC_PREREQ([2.63])
AC_INIT([Grid], [1.0], [paboyle@ph.ed.ac.uk]) AC_INIT([Grid], [0.5.1-dev], [https://github.com/paboyle/Grid], [Grid])
AC_CANONICAL_SYSTEM AC_CANONICAL_BUILD
AC_CANONICAL_HOST
AC_CANONICAL_TARGET
AM_INIT_AUTOMAKE(subdir-objects) AM_INIT_AUTOMAKE(subdir-objects)
AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_MACRO_DIR([m4])
AC_CONFIG_SRCDIR([lib/Grid.h]) AC_CONFIG_SRCDIR([lib/Grid.h])
AC_CONFIG_HEADERS([lib/Config.h]) AC_CONFIG_HEADERS([lib/Config.h])
m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_MSG_NOTICE([
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: ############### Checks for programs
Configuring $PACKAGE v$VERSION for $host
:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
])
# Checks for programs.
AC_LANG(C++) AC_LANG(C++)
CXXFLAGS="-O3 $CXXFLAGS"
AC_PROG_CXX AC_PROG_CXX
AC_OPENMP
AC_PROG_RANLIB AC_PROG_RANLIB
#AX_CXX_COMPILE_STDCXX_11(noext, mandatory)
AX_EXT
# Checks for libraries. ############ openmp ###############
#AX_GCC_VAR_ATTRIBUTE(aligned) AC_OPENMP
# Checks for header files. ac_openmp=no
if test "${OPENMP_CXXFLAGS}X" != "X"; then
ac_openmp=yes
AM_CXXFLAGS="$OPENMP_CXXFLAGS $AM_CXXFLAGS"
AM_LDFLAGS="$OPENMP_CXXFLAGS $AM_LDFLAGS"
fi
############### Checks for header files
AC_CHECK_HEADERS(stdint.h) AC_CHECK_HEADERS(stdint.h)
AC_CHECK_HEADERS(mm_malloc.h) AC_CHECK_HEADERS(mm_malloc.h)
AC_CHECK_HEADERS(malloc/malloc.h) AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h) AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h) AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h) AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(gmp.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]]) AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
# Checks for typedefs, structures, and compiler characteristics. ############### Checks for typedefs, structures, and compiler characteristics
AC_TYPE_SIZE_T AC_TYPE_SIZE_T
AC_TYPE_UINT32_T AC_TYPE_UINT32_T
AC_TYPE_UINT64_T AC_TYPE_UINT64_T
# Checks for library functions. ############### GMP and MPFR #################
echo AC_ARG_WITH([gmp],
echo Checking libraries [AS_HELP_STRING([--with-gmp=prefix],
echo ::::::::::::::::::::::::::::::::::::::::::: [try this for a non-standard install prefix of the GMP library])],
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
AC_ARG_WITH([mpfr],
[AS_HELP_STRING([--with-mpfr=prefix],
[try this for a non-standard install prefix of the MPFR library])],
[AM_CXXFLAGS="-I$with_mpfr/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_mpfr/lib $AM_LDFLAGS"])
################## lapack ####################
AC_ARG_ENABLE([lapack],
[AC_HELP_STRING([--enable-lapack=yes|no|prefix], [enable LAPACK])],
[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
case ${ac_LAPACK} in
no)
;;
yes)
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
*)
AM_CXXFLAGS="-I$ac_LAPACK/include $AM_CXXFLAGS"
AM_LDFLAGS="-L$ac_LAPACK/lib $AM_LDFLAGS"
AC_DEFINE([USE_LAPACK],[1],[use LAPACK])
esac
################## first-touch ####################
# --enable-numa selects first-touch NUMA initialisation by defining GRID_NUMA.
# Autoconf stores the option value in the shell variable enable_numa (lower
# case, matching the feature name).  Shell variables are case sensitive, so
# reading enable_NUMA always gave an empty string, which fell through to the
# catch-all branch below and defined GRID_NUMA even for --disable-numa.
AC_ARG_ENABLE([numa],
    [AC_HELP_STRING([--enable-numa=yes|no|prefix], [enable first touch numa opt])],
    [ac_NUMA=${enable_numa}],[ac_NUMA=no])
case ${ac_NUMA} in
    no)
        ;;
    yes)
        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
    *)
        # Any other value (e.g. a prefix) also just enables the feature.
        AC_DEFINE([GRID_NUMA],[1],[First touch numa locality]);;
esac
################## FFTW3 ####################
AC_ARG_WITH([fftw],
[AS_HELP_STRING([--with-fftw=prefix],
[try this for a non-standard install prefix of the FFTW3 library])],
[AM_CXXFLAGS="-I$with_fftw/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_fftw/lib $AM_LDFLAGS"])
################ Get compiler information
AC_LANG([C++])
AX_CXX_COMPILE_STDCXX_11([noext],[mandatory])
AX_COMPILER_VENDOR
AC_DEFINE_UNQUOTED([CXX_COMP_VENDOR],["$ax_cv_cxx_compiler_vendor"],
[vendor of C++ compiler that will compile the code])
AX_GXX_VERSION
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
[version of g++ that will compile the code])
############### Checks for library functions
CXXFLAGS_CPY=$CXXFLAGS
LDFLAGS_CPY=$LDFLAGS
CXXFLAGS="$AM_CXXFLAGS $CXXFLAGS"
LDFLAGS="$AM_LDFLAGS $LDFLAGS"
AC_CHECK_FUNCS([gettimeofday]) AC_CHECK_FUNCS([gettimeofday])
AC_CHECK_LIB([gmp],[__gmpf_init],
[AC_CHECK_LIB([mpfr],[mpfr_init],
[AC_DEFINE([HAVE_LIBMPFR], [1], [Define to 1 if you have the `MPFR' library (-lmpfr).])]
[have_mpfr=true]
[LIBS="$LIBS -lmpfr"],
[AC_MSG_ERROR([MPFR library not found])])]
[AC_DEFINE([HAVE_LIBGMP], [1], [Define to 1 if you have the `GMP' library (-lgmp).])]
[have_gmp=true]
[LIBS="$LIBS -lgmp"],
[AC_MSG_WARN([**** GMP library not found, Grid can still compile but RHMC will not work ****])])
if test "${ac_LAPACK}x" != "nox"; then
AC_CHECK_LIB([lapack],[LAPACKE_sbdsdc],[],
[AC_MSG_ERROR("LAPACK enabled but library not found")])
fi
AC_CHECK_LIB([fftw3],[fftw_execute],
[AC_DEFINE([HAVE_FFTW],[1],[Define to 1 if you have the `FFTW' library (-lfftw3).])]
[have_fftw=true]
[LIBS="$LIBS -lfftw3 -lfftw3f"],
[AC_MSG_WARN([**** FFTW library not found, Grid can still compile but FFT-based routines will not work ****])])
CXXFLAGS=$CXXFLAGS_CPY
LDFLAGS=$LDFLAGS_CPY
# ############### SIMD instruction selection
# SIMD instructions selection AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVXFMA|AVX2|AVX512|AVX512MIC|IMCI|KNL|KNC],\
#
AC_ARG_ENABLE([simd],[AC_HELP_STRING([--enable-simd=SSE4|AVX|AVXFMA4|AVX2|AVX512|IMCI],\
[Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\ [Select instructions to be SSE4.0, AVX 1.0, AVX 2.0+FMA, AVX 512, IMCI])],\
[ac_SIMD=${enable_simd}],[ac_SIMD=DEBUG]) [ac_SIMD=${enable_simd}],[ac_SIMD=GEN])
supported=no
ac_ZMM=no;
case ${ax_cv_cxx_compiler_vendor} in
clang|gnu)
case ${ac_SIMD} in case ${ac_SIMD} in
SSE4) SSE4)
echo Configuring for SSE4 AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
AC_DEFINE([SSE4],[1],[SSE4 Intrinsics] ) SIMD_FLAGS='-msse4.2';;
if test x"$ax_cv_support_ssse3_ext" = x"yes"; then dnl minimal support for SSE4
supported=yes
else
AC_MSG_WARN([Your processor does not support SSE4 instructions])
fi
;;
AVX) AVX)
echo Configuring for AVX AC_DEFINE([AVX1],[1],[AVX intrinsics])
AC_DEFINE([AVX1],[1],[AVX Intrinsics] ) SIMD_FLAGS='-mavx';;
if test x"$ax_cv_support_avx_ext" = x"yes"; then dnl minimal support for AVX
supported=yes
else
AC_MSG_WARN([Your processor does not support AVX instructions])
fi
;;
AVXFMA4) AVXFMA4)
echo Configuring for AVX AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
AC_DEFINE([AVXFMA4],[1],[AVX Intrinsics with FMA4] ) SIMD_FLAGS='-mavx -mfma4';;
if test x"$ax_cv_support_avx_ext" = x"yes"; then dnl minimal support for AVX AVXFMA)
supported=yes AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA3])
else SIMD_FLAGS='-mavx -mfma';;
AC_MSG_WARN([Your processor does not support AVX instructions])
fi
;;
AVX2) AVX2)
echo Configuring for AVX2 AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
AC_DEFINE([AVX2],[1],[AVX2 Intrinsics] ) SIMD_FLAGS='-mavx2 -mfma';;
if test x"$ax_cv_support_avx2_ext" = x"yes"; then dnl minimal support for AVX2 AVX512|AVX512MIC|KNL)
supported=yes AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
else SIMD_FLAGS='-mavx512f -mavx512pf -mavx512er -mavx512cd';;
AC_MSG_WARN([Your processor does not support AVX2 instructions]) IMCI|KNC)
fi AC_DEFINE([IMCI],[1],[IMCI intrinsics for Knights Corner])
;; SIMD_FLAGS='';;
AVX512) GEN)
echo Configuring for AVX512 AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
AC_DEFINE([AVX512],[1],[AVX512 Intrinsics for Knights Landing] ) SIMD_FLAGS='';;
supported="cross compilation" QPX|BGQ)
ac_ZMM=yes; AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
;; SIMD_FLAGS='';;
IMCI) *)
echo Configuring for IMCI AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the GCC/Clang compiler"]);;
AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner] ) esac;;
supported="cross compilation" intel)
ac_ZMM=no; case ${ac_SIMD} in
;; SSE4)
NEONv8) AC_DEFINE([SSE4],[1],[SSE4 intrinsics])
echo Configuring for experimental ARMv8a support SIMD_FLAGS='-msse4.2 -xsse4.2';;
AC_DEFINE([NEONv8],[1],[NEON ARMv8 Experimental support ] ) AVX)
supported="cross compilation" AC_DEFINE([AVX1],[1],[AVX intrinsics])
;; SIMD_FLAGS='-mavx -xavx';;
DEBUG) AVXFMA4)
echo Configuring without SIMD support - only for compiler DEBUGGING! AC_DEFINE([AVXFMA4],[1],[AVX intrinsics with FMA4])
AC_DEFINE([EMPTY_SIMD],[1],[EMPTY_SIMD only for DEBUGGING] ) SIMD_FLAGS='-mavx -mfma';;
;; AVXFMA)
AC_DEFINE([AVXFMA],[1],[AVX intrinsics with FMA4])
SIMD_FLAGS='-mavx -mfma';;
AVX2)
AC_DEFINE([AVX2],[1],[AVX2 intrinsics])
SIMD_FLAGS='-march=core-avx2 -xcore-avx2';;
AVX512)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
SIMD_FLAGS='-xcore-avx512';;
AVX512MIC|KNL)
AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
SIMD_FLAGS='-xmic-avx512';;
IMCI|KNC)
AC_DEFINE([IMCI],[1],[IMCI Intrinsics for Knights Corner])
SIMD_FLAGS='';;
GEN)
AC_DEFINE([GENERIC_VEC],[1],[generic vector code])
SIMD_FLAGS='';;
*)
AC_MSG_ERROR(["SIMD option ${ac_SIMD} not supported by the Intel compiler"]);;
esac;;
*)
AC_MSG_WARN([Compiler unknown, using generic vector code])
AC_DEFINE([GENERIC_VEC],[1],[generic vector code]);;
esac
AM_CXXFLAGS="$SIMD_FLAGS $AM_CXXFLAGS"
AM_CFLAGS="$SIMD_FLAGS $AM_CFLAGS"
case ${ac_SIMD} in
AVX512|AVX512MIC|KNL)
AC_DEFINE([TEST_ZMM],[1],[compile ZMM test]);;
*) *)
AC_MSG_ERROR([${ac_SIMD} flag unsupported as --enable-simd option\nRun ./configure --help for the list of options]);
;; ;;
esac esac
case ${ac_ZMM} in ############### precision selection
yes)
echo Enabling ZMM source code
;;
no)
echo Disabling ZMM source code
;;
esac
AM_CONDITIONAL(BUILD_ZMM,[ test "X${ac_ZMM}X" == "XyesX" ])
AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double]) AC_ARG_ENABLE([precision],[AC_HELP_STRING([--enable-precision=single|double],[Select default word size of Real])],[ac_PRECISION=${enable_precision}],[ac_PRECISION=double])
case ${ac_PRECISION} in case ${ac_PRECISION} in
single) single)
echo default precision is single
AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] ) AC_DEFINE([GRID_DEFAULT_PRECISION_SINGLE],[1],[GRID_DEFAULT_PRECISION is SINGLE] )
;; ;;
double) double)
echo default precision is double
AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] ) AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
;; ;;
esac esac
# ############### communication type selection
# Comms selection AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|shmem],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
#
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
case ${ac_COMMS} in case ${ac_COMMS} in
none) none)
echo Configuring for NO communications
AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
;; ;;
mpi-auto)
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
LX_FIND_MPI
if test "x$have_CXX_mpi" = 'xno'; then AC_MSG_ERROR(["MPI not found"]); fi
AM_CXXFLAGS="$MPI_CXXFLAGS $AM_CXXFLAGS"
AM_CFLAGS="$MPI_CFLAGS $AM_CFLAGS"
AM_LDFLAGS="`echo $MPI_CXXLDFLAGS | sed -E 's/-l@<:@^ @:>@+//g'` $AM_LDFLAGS"
LIBS="`echo $MPI_CXXLDFLAGS | sed -E 's/-L@<:@^ @:>@+//g'` $LIBS"
;;
mpi) mpi)
echo Configuring for MPI communications
AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] ) AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
;; ;;
mpi3)
AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
;;
shmem) shmem)
echo Configuring for SHMEM communications
AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] ) AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
;; ;;
*) *)
AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]); AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
;; ;;
esac esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ]) AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ]) AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ]) AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
# ############### RNG selection
# RNG selection
#
AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\ AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\
[Select Random Number Generator to be used])],\ [Select Random Number Generator to be used])],\
[ac_RNG=${enable_rng}],[ac_RNG=ranlux48]) [ac_RNG=${enable_rng}],[ac_RNG=ranlux48])
case ${ac_RNG} in case ${ac_RNG} in
ranlux48) ranlux48)
AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] ) AC_DEFINE([RNG_RANLUX],[1],[RNG_RANLUX] )
@ -199,89 +275,92 @@ case ${ac_RNG} in
AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]); AC_MSG_ERROR([${ac_RNG} unsupported --enable-rng option]);
;; ;;
esac esac
#
# Chroma regression tests
#
AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
case ${ac_CHROMA} in ############### timer option
AC_ARG_ENABLE([timers],[AC_HELP_STRING([--enable-timers],\
[Enable system dependent high res timers])],\
[ac_TIMERS=${enable_timers}],[ac_TIMERS=yes])
case ${ac_TIMERS} in
yes) yes)
echo Enabling tests regressing to Chroma AC_DEFINE([TIMERS_ON],[1],[TIMERS_ON] )
;; ;;
no) no)
echo Disabling tests regressing to Chroma AC_DEFINE([TIMERS_OFF],[1],[TIMERS_OFF] )
;;
*)
AC_MSG_ERROR([${ac_TIMERS} unsupported --enable-timers option]);
;;
esac
############### Chroma regression test
AC_ARG_ENABLE([chroma],[AC_HELP_STRING([--enable-chroma],[Expect chroma compiled under c++11 ])],ac_CHROMA=yes,ac_CHROMA=no)
case ${ac_CHROMA} in
yes|no)
;; ;;
*) *)
AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]); AC_MSG_ERROR([${ac_CHROMA} unsupported --enable-chroma option]);
;; ;;
esac esac
AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ]) AM_CONDITIONAL(BUILD_CHROMA_REGRESSION,[ test "X${ac_CHROMA}X" == "XyesX" ])
# ############### Doxygen
# Lapack AC_PROG_DOXYGEN
#
AC_ARG_ENABLE([lapack],[AC_HELP_STRING([--enable-lapack],[Enable lapack yes/no ])],[ac_LAPACK=${enable_lapack}],[ac_LAPACK=no])
case ${ac_LAPACK} in if test -n "$DOXYGEN"
yes) then
echo Enabling lapack AC_CONFIG_FILES([docs/doxy.cfg])
;; fi
no)
echo Disabling lapack
;;
*)
echo Enabling lapack at ${ac_LAPACK}
;;
esac
AM_CONDITIONAL(USE_LAPACK,[ test "X${ac_LAPACK}X" != "XnoX" ]) ############### Output
AM_CONDITIONAL(USE_LAPACK_LIB,[ test "X${ac_LAPACK}X" != "XyesX" ]) cwd=`pwd -P`; cd ${srcdir}; abs_srcdir=`pwd -P`; cd ${cwd}
AM_CXXFLAGS="-I${abs_srcdir}/include $AM_CXXFLAGS"
################################################################### AM_CFLAGS="-I${abs_srcdir}/include $AM_CFLAGS"
# Checks for doxygen support AM_LDFLAGS="-L${cwd}/lib $AM_LDFLAGS"
# if present enables the "make doxyfile" command AC_SUBST([AM_CFLAGS])
#echo AC_SUBST([AM_CXXFLAGS])
#echo Checking doxygen support AC_SUBST([AM_LDFLAGS])
#echo :::::::::::::::::::::::::::::::::::::::::::
#AC_PROG_DOXYGEN
#if test -n "$DOXYGEN"
#then
#AC_CONFIG_FILES([docs/doxy.cfg])
#fi
echo
echo Creating configuration files
echo :::::::::::::::::::::::::::::::::::::::::::
AC_CONFIG_FILES(Makefile) AC_CONFIG_FILES(Makefile)
AC_CONFIG_FILES(lib/Makefile) AC_CONFIG_FILES(lib/Makefile)
AC_CONFIG_FILES(tests/Makefile) AC_CONFIG_FILES(tests/Makefile)
AC_CONFIG_FILES(tests/IO/Makefile)
AC_CONFIG_FILES(tests/core/Makefile)
AC_CONFIG_FILES(tests/debug/Makefile)
AC_CONFIG_FILES(tests/forces/Makefile)
AC_CONFIG_FILES(tests/hmc/Makefile)
AC_CONFIG_FILES(tests/solver/Makefile)
AC_CONFIG_FILES(tests/qdpxx/Makefile) AC_CONFIG_FILES(tests/qdpxx/Makefile)
AC_CONFIG_FILES(benchmarks/Makefile) AC_CONFIG_FILES(benchmarks/Makefile)
AC_OUTPUT AC_OUTPUT
echo " echo "
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Summary of configuration for $PACKAGE v$VERSION Summary of configuration for $PACKAGE v$VERSION
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The following features are enabled: ----- PLATFORM ----------------------------------------
- architecture (build) : $build_cpu - architecture (build) : $build_cpu
- os (build) : $build_os - os (build) : $build_os
- architecture (target) : $target_cpu - architecture (target) : $target_cpu
- os (target) : $target_os - os (target) : $target_os
- compiler vendor : ${ax_cv_cxx_compiler_vendor}
- compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS -----------------------------------
- SIMD : ${ac_SIMD}
- Threading : ${ac_openmp}
- Communications type : ${ac_COMMS}
- Default precision : ${ac_PRECISION}
- RNG choice : ${ac_RNG}
- GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
- LAPACK : ${ac_LAPACK}
- FFTW : `if test "x$have_fftw" = xtrue; then echo yes; else echo no; fi`
- build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi` - build DOXYGEN documentation : `if test "x$enable_doc" = xyes; then echo yes; else echo no; fi`
- graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi` - graphs and diagrams : `if test "x$enable_dot" = xyes; then echo yes; else echo no; fi`
- Supported SIMD flags : $SIMD_FLAGS ----- BUILD FLAGS -------------------------------------
---------------------------------------------------------- - CXXFLAGS:
- enabled simd support : ${ac_SIMD} (config macro says supported: $supported ) `echo ${AM_CXXFLAGS} ${CXXFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
- communications type : ${ac_COMMS} - LDFLAGS:
- default precision : ${ac_PRECISION} `echo ${AM_LDFLAGS} ${LDFLAGS} | tr ' ' '\n' | sed 's/^-/ -/g'`
- RNG choice : ${ac_RNG} - LIBS:
- LAPACK : ${ac_LAPACK} `echo ${LIBS} | tr ' ' '\n' | sed 's/^-/ -/g'`
-------------------------------------------------------
" "

1
include/Grid Symbolic link
View File

@ -0,0 +1 @@
../lib

View File

@ -29,27 +29,28 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H #ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H #define GRID_ALGORITHMS_H
#include <algorithms/SparseMatrix.h> #include <Grid/algorithms/SparseMatrix.h>
#include <algorithms/LinearOperator.h> #include <Grid/algorithms/LinearOperator.h>
#include <algorithms/Preconditioner.h> #include <Grid/algorithms/Preconditioner.h>
#include <algorithms/approx/Zolotarev.h> #include <Grid/algorithms/approx/Zolotarev.h>
#include <algorithms/approx/Chebyshev.h> #include <Grid/algorithms/approx/Chebyshev.h>
#include <algorithms/approx/Remez.h> #include <Grid/algorithms/approx/Remez.h>
#include <algorithms/approx/MultiShiftFunction.h> #include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <algorithms/iterative/ConjugateGradient.h> #include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <algorithms/iterative/ConjugateResidual.h> #include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <algorithms/iterative/NormalEquations.h> #include <Grid/algorithms/iterative/NormalEquations.h>
#include <algorithms/iterative/SchurRedBlack.h> #include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <algorithms/iterative/ConjugateGradientMultiShift.h> #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
// Lanczos support // Lanczos support
#include <algorithms/iterative/MatrixUtils.h> #include <Grid/algorithms/iterative/MatrixUtils.h>
#include <algorithms/iterative/ImplicitlyRestartedLanczos.h> #include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <algorithms/CoarsenedMatrix.h> #include <Grid/algorithms/CoarsenedMatrix.h>
// Eigen/lanczos // Eigen/lanczos
// EigCg // EigCg

View File

@ -40,14 +40,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <mm_malloc.h> #include <mm_malloc.h>
#endif #endif
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#endif
namespace Grid { namespace Grid {
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
@ -65,28 +57,85 @@ public:
typedef _Tp value_type; typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; }; template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };
alignedAllocator() throw() { } alignedAllocator() throw() { }
alignedAllocator(const alignedAllocator&) throw() { } alignedAllocator(const alignedAllocator&) throw() { }
template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { } template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
~alignedAllocator() throw() { } ~alignedAllocator() throw() { }
pointer address(reference __x) const { return &__x; } pointer address(reference __x) const { return &__x; }
// const_pointer address(const_reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); } size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0) pointer allocate(size_type __n, const void* _p= 0)
{ {
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif
_Tp tmp;
#ifdef GRID_NUMA
#pragma omp parallel for schedule(static)
for(int i=0;i<__n;i++){
ptr[i]=tmp;
}
#endif
return ptr;
}
void deallocate(pointer __p, size_type) {
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
extern "C" {
_Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64); #include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP #define PARANOID_SYMMETRIC_HEAP
#endif
template<typename _Tp>
class commAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
commAllocator() throw() { }
commAllocator(const commAllocator&) throw() { }
template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
~commAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
#endif
#ifdef PARANOID_SYMMETRIC_HEAP #ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast; static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE]; static long psync[_SHMEM_REDUCE_SYNC_SIZE];
@ -96,55 +145,47 @@ public:
if ( bcast != ptr ) { if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout); std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
BACKTRACEFILE(); // BACKTRACEFILE();
exit(0); exit(0);
} }
assert( bcast == (void *) ptr); assert( bcast == (void *) ptr);
#endif #endif
return ptr;
}
void deallocate(pointer __p, size_type) {
shmem_free((void *)__p);
}
#else #else
pointer allocate(size_type __n, const void* _p= 0)
{
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
#else #else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif #endif
#endif
_Tp tmp;
#undef FIRST_TOUCH_OPTIMISE
#ifdef FIRST_TOUCH_OPTIMISE
#pragma omp parallel for
for(int i=0;i<__n;i++){
ptr[i]=tmp;
}
#endif
return ptr; return ptr;
} }
void deallocate(pointer __p, size_type) { void deallocate(pointer __p, size_type) {
#ifdef GRID_COMMS_SHMEM
shmem_free((void *)__p);
#else
#ifdef HAVE_MM_MALLOC_H #ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p); _mm_free((void *)__p);
#else #else
free((void *)__p); free((void *)__p);
#endif
#endif #endif
} }
#endif
void construct(pointer __p, const _Tp& __val) { }; void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { }; void construct(pointer __p) { };
void destroy(pointer __p) { }; void destroy(pointer __p) { };
}; };
template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
template<typename _Tp> inline bool ////////////////////////////////////////////////////////////////////////////////
operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; } // Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<typename _Tp> inline bool template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; } template<class T> using commVector = std::vector<T,commAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
}; // namespace Grid }; // namespace Grid
#endif #endif

View File

@ -28,8 +28,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CARTESIAN_H #ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H #define GRID_CARTESIAN_H
#include <cartesian/Cartesian_base.h> #include <Grid/cartesian/Cartesian_base.h>
#include <cartesian/Cartesian_full.h> #include <Grid/cartesian/Cartesian_full.h>
#include <cartesian/Cartesian_red_black.h> #include <Grid/cartesian/Cartesian_red_black.h>
#endif #endif

View File

@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_COMMUNICATOR_H #ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H #define GRID_COMMUNICATOR_H
#include <communicator/Communicator_base.h> #include <Grid/communicator/Communicator_base.h>
#endif #endif

View File

@ -28,17 +28,21 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_CSHIFT_H_ #ifndef _GRID_CSHIFT_H_
#define _GRID_CSHIFT_H_ #define _GRID_CSHIFT_H_
#include <cshift/Cshift_common.h> #include <Grid/cshift/Cshift_common.h>
#ifdef GRID_COMMS_NONE #ifdef GRID_COMMS_NONE
#include <cshift/Cshift_none.h> #include <Grid/cshift/Cshift_none.h>
#endif #endif
#ifdef GRID_COMMS_MPI #ifdef GRID_COMMS_MPI
#include <cshift/Cshift_mpi.h> #include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_MPI3
#include <Grid/cshift/Cshift_mpi.h>
#endif #endif
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
#include <cshift/Cshift_mpi.h> // uses same implementation of communicator #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif #endif
#endif #endif

271
lib/FFT.h Normal file
View File

@ -0,0 +1,271 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/FFT.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_FFT_H_
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#include <fftw3.h>
#endif
namespace Grid {
// Primary template: intentionally empty. Only the explicit specialisations
// provide FFTW_scalar / FFTW_plan and the wrapper functions.
template<class scalar> struct FFTW { };
#ifdef HAVE_FFTW
// Binding for ComplexD: every member forwards, argument for argument, to the
// global fftw_* routine of the same name.
template<> struct FFTW<ComplexD> {
  using FFTW_scalar = fftw_complex;
  using FFTW_plan   = fftw_plan;

  // Create a plan; all arguments are handed to ::fftw_plan_many_dft unchanged.
  static FFTW_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
                                      FFTW_scalar *in,  const int *inembed, int istride, int idist,
                                      FFTW_scalar *out, const int *onembed, int ostride, int odist,
                                      int sign, unsigned flags)
  {
    return ::fftw_plan_many_dft(rank, n, howmany,
                                in,  inembed, istride, idist,
                                out, onembed, ostride, odist,
                                sign, flags);
  }

  // Query the operation counts (adds, multiplies, fused multiply-adds) of a plan.
  static void fftw_flops(const FFTW_plan plan, double *add, double *mul, double *fmas)
  {
    ::fftw_flops(plan, add, mul, fmas);
  }

  // Execute an existing plan on the given in/out arrays.
  static void fftw_execute_dft(const FFTW_plan plan, FFTW_scalar *in, FFTW_scalar *out)
  {
    ::fftw_execute_dft(plan, in, out);
  }

  // Release a plan.
  static void fftw_destroy_plan(const FFTW_plan plan)
  {
    ::fftw_destroy_plan(plan);
  }
};
// Binding for ComplexF: every member forwards, argument for argument, to the
// corresponding global fftwf_* routine.
template<> struct FFTW<ComplexF> {
  using FFTW_scalar = fftwf_complex;
  using FFTW_plan   = fftwf_plan;

  // Create a plan; all arguments are handed to ::fftwf_plan_many_dft unchanged.
  static FFTW_plan fftw_plan_many_dft(int rank, const int *n, int howmany,
                                      FFTW_scalar *in,  const int *inembed, int istride, int idist,
                                      FFTW_scalar *out, const int *onembed, int ostride, int odist,
                                      int sign, unsigned flags)
  {
    return ::fftwf_plan_many_dft(rank, n, howmany,
                                 in,  inembed, istride, idist,
                                 out, onembed, ostride, odist,
                                 sign, flags);
  }

  // Query the operation counts (adds, multiplies, fused multiply-adds) of a plan.
  static void fftw_flops(const FFTW_plan plan, double *add, double *mul, double *fmas)
  {
    ::fftwf_flops(plan, add, mul, fmas);
  }

  // Execute an existing plan on the given in/out arrays.
  static void fftw_execute_dft(const FFTW_plan plan, FFTW_scalar *in, FFTW_scalar *out)
  {
    ::fftwf_execute_dft(plan, in, out);
  }

  // Release a plan.
  static void fftw_destroy_plan(const FFTW_plan plan)
  {
    ::fftwf_destroy_plan(plan);
  }
};
#endif
// Fallback sign constants, defined only when FFTW_FORWARD is not already
// defined, so that code referring to FFTW_FORWARD / FFTW_BACKWARD still compiles.
#ifndef FFTW_FORWARD
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#endif
// One-dimensional FFT along a chosen lattice dimension, built on the FFTW<>
// wrappers. For each call the data along `dim` is gathered from all ranks in
// that direction into a node-local "pencil" grid, transformed with FFTW, and
// the locally owned slice of the result is copied back.
class FFT {
private:

  GridCartesian *vgrid;   // grid supplied by the caller (not owned)
  GridCartesian *sgrid;   // grid with simd layout all 1, same dimensions/processors (owned)

  int Nd;
  double flops;           // accumulated flop count over all FFT_dim calls
  double flops_call;      // flop count reported by FFTW for the most recent plan
  uint64_t usec;          // accumulated time spent executing FFTW plans

  std::vector<int> dimensions;
  std::vector<int> processors;
  std::vector<int> processor_coor;

public:

  static const int forward=FFTW_FORWARD;
  static const int backward=FFTW_BACKWARD;

  // Accumulated flop count.
  double Flops(void) const {return flops;}
  // Accumulated flops divided by accumulated microseconds; 0 when nothing has
  // been timed yet (avoids a division by zero).
  double MFlops(void) const {return usec ? flops/usec : 0.0;}

  FFT ( GridCartesian * grid ) :
    vgrid(grid),
    sgrid(nullptr),
    Nd(grid->_ndimension),
    flops(0),
    flops_call(0),
    usec(0),
    dimensions(grid->_fdimensions),
    processors(grid->_processors),
    processor_coor(grid->_processor_coor)
  {
    std::vector<int> layout(Nd,1);
    sgrid = new GridCartesian(dimensions,layout,processors);
  }

  // sgrid is owned through a raw pointer: copying would delete it twice.
  FFT(const FFT &) = delete;
  FFT &operator=(const FFT &) = delete;

  ~FFT ( void) {
    delete sgrid;
  }

  // Transform `source` along dimension `dim` and write the outcome to `result`.
  // A non-zero `inverse` selects FFTW_BACKWARD, otherwise FFTW_FORWARD is used.
  // Both lattices must be conformable with the grid given to the constructor.
  // Without HAVE_FFTW this function asserts.
  template<class vobj>
  void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int inverse){

    conformable(result._grid,vgrid);
    conformable(source._grid,vgrid);

    int L = vgrid->_ldimensions[dim];
    int G = vgrid->_fdimensions[dim];

    std::vector<int> layout(Nd,1);
    std::vector<int> pencil_gd(vgrid->_fdimensions);

    pencil_gd[dim] = G*processors[dim];

    // Pencil global vol LxLxGxLxL per node
    GridCartesian pencil_g(pencil_gd,layout,processors);

    // Construct pencils
    typedef typename vobj::scalar_object sobj;
    typedef typename sobj::scalar_type   scalar;

    Lattice<vobj> ssource(vgrid); ssource =source;
    Lattice<sobj> pgsource(&pencil_g);
    Lattice<sobj> pgresult(&pencil_g); pgresult=zero;

#ifndef HAVE_FFTW
    assert(0);
#else
    typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
    typedef typename FFTW<scalar>::FFTW_plan   FFTW_plan;

    {
      int Ncomp = sizeof(sobj)/sizeof(scalar);
      int Nlow  = 1;
      for(int d=0;d<dim;d++){
        Nlow*=vgrid->_ldimensions[d];
      }

      int rank = 1;     /* 1d transforms */
      int n[]  = {G};   /* 1d transforms of length G */
      int howmany = Ncomp;
      int odist,idist,istride,ostride;
      idist   = odist   = 1;          /* Distance between consecutive FT's */
      istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
      int *inembed = n, *onembed = n;

      int sign = FFTW_FORWARD;
      if (inverse) sign = FFTW_BACKWARD;

      FFTW_plan plan;
      {
        FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[0];
        FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[0];
        plan = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
                                                in,inembed,
                                                istride,idist,
                                                out,onembed,
                                                ostride, odist,
                                                sign,FFTW_ESTIMATE);
      }

      std::vector<int> lcoor(Nd), gcoor(Nd);

      // Barrel shift and collect global pencil
      for(int shift=0;shift<processors[dim];shift++) {

        for(int idx=0;idx<sgrid->lSites();idx++) {

          sgrid->LocalIndexToLocalCoor(idx,lcoor);

          sobj s;

          peekLocalSite(s,ssource,lcoor);

          lcoor[dim]+=shift*L;

          pokeLocalSite(s,pgsource,lcoor);
        }

        ssource = Cshift(ssource,dim,L);
      }

      // Loop over orthog coords
      int NN=pencil_g.lSites();
      GridStopWatch timer;
      timer.Start();

PARALLEL_FOR_LOOP
      for(int idx=0;idx<NN;idx++) {
        // The coordinate buffer is declared inside the loop so that each
        // iteration has its own copy; sharing one vector between threads
        // would be a data race.
        std::vector<int> pcoor(Nd);
        pencil_g.LocalIndexToLocalCoor(idx,pcoor);

        if ( pcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
          FFTW_scalar *in = (FFTW_scalar *)&pgsource._odata[idx];
          FFTW_scalar *out= (FFTW_scalar *)&pgresult._odata[idx];
          FFTW<scalar>::fftw_execute_dft(plan,in,out);
        }
      }

      timer.Stop();

      double add,mul,fma;
      FFTW<scalar>::fftw_flops(plan,&add,&mul,&fma);
      flops_call = add+mul+2.0*fma;
      usec += timer.useconds();
      // NOTE(review): the plan is executed only on the pcoor[dim]==0 plane,
      // yet the count is scaled by all NN sites - confirm intended normalisation.
      flops+= flops_call*NN;

      int pc = processor_coor[dim];
      for(int idx=0;idx<sgrid->lSites();idx++) {
        sgrid->LocalIndexToLocalCoor(idx,lcoor);
        gcoor = lcoor;
        // extract the result
        sobj s;
        gcoor[dim] = lcoor[dim]+L*pc;
        peekLocalSite(s,pgresult,gcoor);
        pokeLocalSite(s,result,lcoor);
      }

      FFTW<scalar>::fftw_destroy_plan(plan);
    }
#endif

  }

};
}
#endif

View File

@ -59,29 +59,31 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/////////////////// ///////////////////
// Grid headers // Grid headers
/////////////////// ///////////////////
#include <serialisation/Serialisation.h> #include <Grid/serialisation/Serialisation.h>
#include <Config.h> #include "Config.h"
#include <Timer.h> #include <Grid/Timer.h>
#include <PerfCount.h> #include <Grid/PerfCount.h>
#include <Log.h> #include <Grid/Log.h>
#include <AlignedAllocator.h> #include <Grid/AlignedAllocator.h>
#include <Simd.h> #include <Grid/Simd.h>
#include <Threads.h> #include <Grid/Threads.h>
#include <Lexicographic.h> #include <Grid/Lexicographic.h>
#include <Communicator.h> #include <Grid/Init.h>
#include <Cartesian.h> #include <Grid/Communicator.h>
#include <Tensors.h> #include <Grid/Cartesian.h>
#include <Lattice.h> #include <Grid/Tensors.h>
#include <Cshift.h> #include <Grid/Lattice.h>
#include <Stencil.h> #include <Grid/Cshift.h>
#include <Algorithms.h> #include <Grid/Stencil.h>
#include <parallelIO/BinaryIO.h> #include <Grid/Algorithms.h>
#include <qcd/QCD.h> #include <Grid/parallelIO/BinaryIO.h>
#include <parallelIO/NerscIO.h> #include <Grid/qcd/QCD.h>
#include <Init.h> #include <Grid/parallelIO/NerscIO.h>
#include <qcd/hmc/NerscCheckpointer.h> #include <Grid/FFT.h>
#include <qcd/hmc/HmcRunner.h>
#include <Grid/qcd/hmc/NerscCheckpointer.h>
#include <Grid/qcd/hmc/HmcRunner.h>

View File

@ -153,6 +153,7 @@ void GridParseLayout(char **argv,int argc,
assert(ompthreads.size()==1); assert(ompthreads.size()==1);
GridThread::SetThreads(ompthreads[0]); GridThread::SetThreads(ompthreads[0]);
} }
if( GridCmdOptionExists(argv,argv+argc,"--cores") ){ if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
std::vector<int> cores(0); std::vector<int> cores(0);
arg= GridCmdOptionPayload(argv,argv+argc,"--cores"); arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
@ -170,14 +171,17 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// //
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
static int Grid_is_initialised = 0;
void Grid_init(int *argc,char ***argv) void Grid_init(int *argc,char ***argv)
{ {
GridLogger::StopWatch.Start();
CartesianCommunicator::Init(argc,argv); CartesianCommunicator::Init(argc,argv);
// Parse command line args. // Parse command line args.
GridLogger::StopWatch.Start();
std::string arg; std::string arg;
std::vector<std::string> logstreams; std::vector<std::string> logstreams;
std::string defaultLog("Error,Warning,Message,Performance"); std::string defaultLog("Error,Warning,Message,Performance");
@ -193,7 +197,7 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl; std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl; std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl;
std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl; std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl;
std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug"<<std::endl; std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
@ -203,7 +207,6 @@ void Grid_init(int *argc,char ***argv)
GridLogConfigure(logstreams); GridLogConfigure(logstreams);
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init(); Grid_debug_handler_init();
} }
@ -211,17 +214,19 @@ void Grid_init(int *argc,char ***argv)
Grid_quiesce_nodes(); Grid_quiesce_nodes();
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
QCD::WilsonFermionStatic::HandOptDslash=1; QCD::WilsonKernelsStatic::HandOpt=1;
QCD::WilsonFermion5DStatic::HandOptDslash=1;
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1; LebesgueOrder::UseLebesgueOrder=1;
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking"); arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block); GridCmdOptionIntVector(arg,LebesgueOrder::Block);
} }
if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
GridLogTimestamp(1);
}
GridParseLayout(*argv,*argc, GridParseLayout(*argv,*argc,
Grid_default_latt, Grid_default_latt,
Grid_default_mpi); Grid_default_mpi);
@ -235,26 +240,34 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl; std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
} }
std::string COL_RED = GridLogColours.colour["RED"];
std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
std::string COL_BLACK = GridLogColours.colour["BLACK"];
std::string COL_GREEN = GridLogColours.colour["GREEN"];
std::string COL_BLUE = GridLogColours.colour["BLUE"];
std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
std::cout <<std::endl; std::cout <<std::endl;
std::cout <<Logger::RED << "__|__|__|__|__"<< "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl; std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<Logger::RED << "__|__|__|__|__"<< "|__|__|_"<<Logger::PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl; std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<Logger::RED << "__|__| | | "<< "| | | "<<Logger::PURPLE<<" | | |"<< " | | | _|__"<<std::endl; std::cout <<COL_RED << "__|_ | | | "<< "| | | "<<COL_PURPLE<<" | | |"<< " | | | _|__"<<std::endl;
std::cout <<Logger::RED << "__|__ "<< " "<<Logger::PURPLE<<" "<< " _|__"<<std::endl; std::cout <<COL_RED << "__|_ "<< " "<<COL_PURPLE<<" "<< " _|__"<<std::endl;
std::cout <<Logger::RED << "__|_ "<<Logger::GREEN<<" GGGG "<<Logger::RED<<" RRRR "<<Logger::BLUE <<" III "<<Logger::PURPLE<<"DDDD "<<Logger::PURPLE<<" _|__"<<std::endl; std::cout <<COL_RED << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" RRRR "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_PURPLE<<" _|__"<<std::endl;
std::cout <<Logger::RED << "__|_ "<<Logger::GREEN<<"G "<<Logger::RED<<" R R "<<Logger::BLUE <<" I "<<Logger::PURPLE<<"D D "<<Logger::PURPLE<<" _|__"<<std::endl; std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_PURPLE<<" _|__"<<std::endl;
std::cout <<Logger::RED << "__|_ "<<Logger::GREEN<<"G "<<Logger::RED<<" R R "<<Logger::BLUE <<" I "<<Logger::PURPLE<<"D D"<<Logger::PURPLE<<" _|__"<<std::endl; std::cout <<COL_RED << "__|_ "<<COL_GREEN<<"G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_PURPLE<<" _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|_ "<<Logger::GREEN<<"G GG "<<Logger::RED<<" RRRR "<<Logger::BLUE <<" I "<<Logger::PURPLE<<"D D"<<Logger::GREEN <<" _|__"<<std::endl; std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G GG "<<COL_RED<<" RRRR "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D"<<COL_GREEN <<" _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|_ "<<Logger::GREEN<<"G G "<<Logger::RED<<" R R "<<Logger::BLUE <<" I "<<Logger::PURPLE<<"D D "<<Logger::GREEN <<" _|__"<<std::endl; std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<"G G "<<COL_RED<<" R R "<<COL_BLUE <<" I "<<COL_PURPLE<<"D D "<<COL_GREEN <<" _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|_ "<<Logger::GREEN<<" GGGG "<<Logger::RED<<" R R "<<Logger::BLUE <<" III "<<Logger::PURPLE<<"DDDD "<<Logger::GREEN <<" _|__"<<std::endl; std::cout <<COL_BLUE << "__|_ "<<COL_GREEN<<" GGGG "<<COL_RED<<" R R "<<COL_BLUE <<" III "<<COL_PURPLE<<"DDDD "<<COL_GREEN <<" _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|__ "<< " "<<Logger::GREEN <<" "<< " _|__"<<std::endl; std::cout <<COL_BLUE << "__|_ "<< " "<<COL_GREEN <<" "<< " _|__"<<std::endl;
std::cout <<Logger::BLUE << "__|__|__|__|__"<< "|__|__|_"<<Logger::GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl; std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<Logger::BLUE << "__|__|__|__|__"<< "|__|__|_"<<Logger::GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl; std::cout <<COL_BLUE << "__|__|__|__|__"<< "|__|__|_"<<COL_GREEN <<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
std::cout <<Logger::BLUE << " | | | | "<< "| | | "<<Logger::GREEN <<" | | |"<< " | | | | "<<std::endl; std::cout <<COL_BLUE << " | | | | "<< "| | | "<<COL_GREEN <<" | | |"<< " | | | | "<<std::endl;
std::cout << std::endl; std::cout << std::endl;
std::cout << std::endl; std::cout << std::endl;
std::cout <<Logger::YELLOW<< std::endl; std::cout <<COL_YELLOW<< std::endl;
std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl; std::cout << "Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors"<<std::endl;
std::cout << "Colours by Tadahito Boyle "<<std::endl;
std::cout << std::endl; std::cout << std::endl;
std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl; std::cout << "This program is free software; you can redistribute it and/or modify"<<std::endl;
std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl; std::cout << "it under the terms of the GNU General Public License as published by"<<std::endl;
@ -265,22 +278,20 @@ void Grid_init(int *argc,char ***argv)
std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl; std::cout << "but WITHOUT ANY WARRANTY; without even the implied warranty of"<<std::endl;
std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl; std::cout << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the"<<std::endl;
std::cout << "GNU General Public License for more details."<<std::endl; std::cout << "GNU General Public License for more details."<<std::endl;
std::cout << Logger::BLACK <<std::endl; std::cout << COL_BACKGROUND <<std::endl;
std::cout << std::endl;
Grid_is_initialised = 1;
} }
void Grid_finalize(void) void Grid_finalize(void)
{ {
#ifdef GRID_COMMS_MPI #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
MPI_Finalize(); MPI_Finalize();
Grid_unquiesce_nodes(); Grid_unquiesce_nodes();
#endif #endif
} }
double usecond(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
void * Grid_backtrace_buffer[_NBACKTRACE]; void * Grid_backtrace_buffer[_NBACKTRACE];

View File

@ -33,6 +33,7 @@ namespace Grid {
void Grid_init(int *argc,char ***argv); void Grid_init(int *argc,char ***argv);
void Grid_finalize(void); void Grid_finalize(void);
// internal, controled with --handle // internal, controled with --handle
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void); void Grid_debug_handler_init(void);
@ -44,6 +45,7 @@ namespace Grid {
const std::vector<int> &GridDefaultMpi(void); const std::vector<int> &GridDefaultMpi(void);
const int &GridThreads(void) ; const int &GridThreads(void) ;
void GridSetThreads(int t) ; void GridSetThreads(int t) ;
void GridLogTimestamp(int);
// Common parsing chores // Common parsing chores
std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option); std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);

View File

@ -28,6 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_H #ifndef GRID_LATTICE_H
#define GRID_LATTICE_H #define GRID_LATTICE_H
#include <lattice/Lattice_base.h> #include <Grid/lattice/Lattice_base.h>
#endif #endif

View File

@ -25,7 +25,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid.h>
@ -33,77 +34,51 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
GridStopWatch Logger::StopWatch; GridStopWatch Logger::StopWatch;
int Logger::timestamp;
std::ostream Logger::devnull(0); std::ostream Logger::devnull(0);
std::string Logger::BLACK("\033[30m");
std::string Logger::RED("\033[31m");
std::string Logger::GREEN("\033[32m");
std::string Logger::YELLOW("\033[33m");
std::string Logger::BLUE("\033[34m");
std::string Logger::PURPLE("\033[35m");
std::string Logger::CYAN("\033[36m");
std::string Logger::WHITE("\033[37m");
std::string Logger::NORMAL("\033[0;39m");
std::string EMPTY("");
#if 0 void GridLogTimestamp(int on){
GridLogger GridLogError (1,"Error",Logger::RED); Logger::Timestamp(on);
GridLogger GridLogWarning (1,"Warning",Logger::YELLOW); }
GridLogger GridLogMessage (1,"Message",Logger::BLACK);
GridLogger GridLogDebug (1,"Debug",Logger::PURPLE);
GridLogger GridLogPerformance(1,"Performance",Logger::GREEN);
GridLogger GridLogIterative (1,"Iterative",Logger::BLUE);
GridLogger GridLogIntegrator (1,"Integrator",Logger::BLUE);
#else
GridLogger GridLogError (1,"Error",EMPTY);
GridLogger GridLogWarning (1,"Warning",EMPTY);
GridLogger GridLogMessage (1,"Message",EMPTY);
GridLogger GridLogDebug (1,"Debug",EMPTY);
GridLogger GridLogPerformance(1,"Performance",EMPTY);
GridLogger GridLogIterative (1,"Iterative",EMPTY);
GridLogger GridLogIntegrator (1,"Integrator",EMPTY);
#endif
void GridLogConfigure(std::vector<std::string> &logstreams) Colours GridLogColours(0);
{ GridLogger GridLogError(1, "Error", GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
GridLogger GridLogDebug(1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogIterative(1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator(1, "Integrator", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0); GridLogError.Active(0);
GridLogWarning.Active(0); GridLogWarning.Active(0);
GridLogMessage.Active(0); GridLogMessage.Active(1); // at least the messages should be always on
GridLogIterative.Active(0); GridLogIterative.Active(0);
GridLogDebug.Active(0); GridLogDebug.Active(0);
GridLogPerformance.Active(0); GridLogPerformance.Active(0);
GridLogIntegrator.Active(0); GridLogIntegrator.Active(0);
GridLogColours.Active(0);
int blackAndWhite = 1;
if(blackAndWhite){
Logger::BLACK = std::string("");
Logger::RED =Logger::BLACK;
Logger::GREEN =Logger::BLACK;
Logger::YELLOW =Logger::BLACK;
Logger::BLUE =Logger::BLACK;
Logger::PURPLE =Logger::BLACK;
Logger::CYAN =Logger::BLACK;
Logger::WHITE =Logger::BLACK;
Logger::NORMAL =Logger::BLACK;
}
for (int i = 0; i < logstreams.size(); i++) { for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1); if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1); if (logstreams[i] == std::string("Warning")) GridLogWarning.Active(1);
if ( logstreams[i]== std::string("Message") ) GridLogMessage.Active(1); if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if ( logstreams[i]== std::string("Performance") ) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Performance"))
GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1); if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
} }
} }
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Verbose limiter on MPI tasks // Verbose limiter on MPI tasks
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) void Grid_quiesce_nodes(void) {
{
int me = 0; int me = 0;
#ifdef GRID_COMMS_MPI #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
MPI_Comm_rank(MPI_COMM_WORLD, &me); MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif #endif
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
@ -114,13 +89,9 @@ void Grid_quiesce_nodes(void)
} }
} }
void Grid_unquiesce_nodes(void) void Grid_unquiesce_nodes(void) {
{
#ifdef GRID_COMMS_MPI #ifdef GRID_COMMS_MPI
std::cout.clear(); std::cout.clear();
#endif #endif
} }
} }

View File

@ -27,6 +27,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <map>
#ifndef GRID_LOG_H #ifndef GRID_LOG_H
#define GRID_LOG_H #define GRID_LOG_H
@ -36,43 +39,86 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dress the output; use std::chrono for time stamping via the StopWatch class // Dress the output; use std::chrono for time stamping via the StopWatch class
int Rank(void); // used for early stage debug before library init //////////////////////////////////////////////////////////////////////////////////////////////////
// Palette of ANSI escape sequences used to colour log output.
// The sequences are exposed through the public `colour` map, keyed by colour
// name; while the palette is inactive every key maps to the empty string.
class Colours{
protected:
  bool is_active;
public:
  std::map<std::string, std::string> colour;

  // Build the palette in the requested state (inactive unless asked otherwise).
  Colours(bool activate=false){
    Active(activate);
  };

  // Switch the palette on or off by (re)writing all nine named entries.
  void Active(bool activate){
    // { key, escape sequence used when active }
    static const char *const palette[][2] = {
      {"BLACK",  "\033[30m"},
      {"RED",    "\033[31m"},
      {"GREEN",  "\033[32m"},
      {"YELLOW", "\033[33m"},
      {"BLUE",   "\033[34m"},
      {"PURPLE", "\033[35m"},
      {"CYAN",   "\033[36m"},
      {"WHITE",  "\033[37m"},
      {"NORMAL", "\033[0;39m"},
    };
    is_active = activate;
    for (const auto &entry : palette) {
      colour[entry[0]] = is_active ? entry[1] : "";
    }
  };
};
class Logger { class Logger {
protected: protected:
Colours &Painter;
int active; int active;
std::string name, topName, COLOUR; static int timestamp;
std::string name, topName;
std::string COLOUR;
public: public:
static GridStopWatch StopWatch; static GridStopWatch StopWatch;
static std::ostream devnull; static std::ostream devnull;
static std::string BLACK; std::string background() {return Painter.colour["NORMAL"];}
static std::string RED ; std::string evidence() {return Painter.colour["YELLOW"];}
static std::string GREEN; std::string colour() {return Painter.colour[COLOUR];}
static std::string YELLOW;
static std::string BLUE ;
static std::string PURPLE;
static std::string CYAN ;
static std::string WHITE ;
static std::string NORMAL;
Logger(std::string topNm, int on, std::string nm,std::string col) Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col) : active(on),
: active(on), name(nm), topName(topNm), COLOUR(col) {}; name(nm),
topName(topNm),
Painter(col_class),
COLOUR(col) {} ;
void Active(int on) {active = on;}; void Active(int on) {active = on;};
int isActive(void) {return active;}; int isActive(void) {return active;};
static void Timestamp(int on) {timestamp = on;};
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
friend std::ostream& operator<< (std::ostream& stream, const Logger& log){
if ( log.active ) { if ( log.active ) {
stream << log.background()<< log.topName << log.background()<< " : ";
stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
if ( log.timestamp ) {
StopWatch.Stop(); StopWatch.Stop();
GridTime now = StopWatch.Elapsed(); GridTime now = StopWatch.Elapsed();
StopWatch.Start(); StopWatch.Start();
stream << BLACK <<std::setw(8) << std::left << log.topName << BLACK<< " : "; stream << log.evidence()<< now << log.background() << " : " ;
stream << log.COLOUR <<std::setw(11) << log.name << BLACK << " : "; }
stream << YELLOW <<std::setw(6) << now <<BLACK << " : " ; stream << log.colour();
stream << log.COLOUR;
return stream; return stream;
} else { } else {
return devnull; return devnull;
@ -83,7 +129,8 @@ public:
class GridLogger: public Logger { class GridLogger: public Logger {
public: public:
GridLogger(int on, std::string nm, std::string col = Logger::BLACK): Logger("Grid", on, nm, col){}; GridLogger(int on, std::string nm, Colours&col_class, std::string col_key = "NORMAL"):
Logger("Grid", on, nm, col_class, col_key){};
}; };
void GridLogConfigure(std::vector<std::string> &logstreams); void GridLogConfigure(std::vector<std::string> &logstreams);
@ -95,6 +142,7 @@ extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance; extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ; extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ; extern GridLogger GridLogIntegrator ;
extern Colours GridLogColours;
#define _NBACKTRACE (256) #define _NBACKTRACE (256)
@ -102,7 +150,7 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACEFILE() {\ #define BACKTRACEFILE() {\
char string[20]; \ char string[20]; \
std::sprintf(string,"backtrace.%d",Rank()); \ std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \
std::FILE * fp = std::fopen(string,"w"); \ std::FILE * fp = std::fopen(string,"w"); \
BACKTRACEFP(fp)\ BACKTRACEFP(fp)\
std::fclose(fp); \ std::fclose(fp); \
@ -128,5 +176,6 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACE() BACKTRACEFP(stdout) #define BACKTRACE() BACKTRACEFP(stdout)
} }
#endif #endif

File diff suppressed because one or more lines are too long

View File

@ -1,32 +1,32 @@
# additional include paths necessary to compile the C++ library
AM_CXXFLAGS = -I$(top_srcdir)/
extra_sources= extra_sources=
if BUILD_COMMS_MPI if BUILD_COMMS_MPI
extra_sources+=communicator/Communicator_mpi.cc extra_sources+=communicator/Communicator_mpi.cc
extra_sources+=communicator/Communicator_base.cc
endif
if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_mpi3.cc
extra_sources+=communicator/Communicator_base.cc
endif endif
if BUILD_COMMS_SHMEM if BUILD_COMMS_SHMEM
extra_sources+=communicator/Communicator_shmem.cc extra_sources+=communicator/Communicator_shmem.cc
extra_sources+=communicator/Communicator_base.cc
endif endif
if BUILD_COMMS_NONE if BUILD_COMMS_NONE
extra_sources+=communicator/Communicator_none.cc extra_sources+=communicator/Communicator_none.cc
extra_sources+=communicator/Communicator_base.cc
endif endif
# #
# Libraries # Libraries
# #
include Make.inc include Make.inc
include Eigen.inc
lib_LIBRARIES = libGrid.a lib_LIBRARIES = libGrid.a
libGrid_a_SOURCES = $(CCFILES) $(extra_sources) libGrid_a_SOURCES = $(CCFILES) $(extra_sources)
libGrid_adir = $(pkgincludedir)
nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) Config.h
# qcd/action/fermion/PartialFractionFermion5D.cc\ \
#
# Include files
#
nobase_include_HEADERS=$(HFILES)

View File

@ -32,28 +32,44 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16)) #define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = { const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::PerformanceCounterConfigs [] = {
#ifdef __linux__ #ifdef __linux__
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." }, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." }, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." , CACHE_REFERENCES},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES , "CACHE_REFERENCES..." }, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES , "CACHE_MISSES......." }, { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......"}, // 4
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS...."}, #ifdef AVX512
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS....."}, { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS..."}, { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS.."}, { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS"}, { PERF_TYPE_RAW, RawConfig(0x02,0x04), "L2_HIT_LOADS.......", L1D_READ_ACCESS },
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS......."}, { PERF_TYPE_RAW, RawConfig(0x04,0x04), "L2_MISS_LOADS......", L1D_READ_ACCESS },
// { PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS....."}, { PERF_TYPE_RAW, RawConfig(0x10,0x04), "UTLB_MISS_LOADS....", L1D_READ_ACCESS },
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......"}, { PERF_TYPE_RAW, RawConfig(0x08,0x04), "DTLB_MISS_LOADS....", L1D_READ_ACCESS },
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS...."}, // 11
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS..."}, #else
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS."}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,ACCESS) , "L1D_READ_ACCESS....",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......"}, { PERF_TYPE_HW_CACHE, CacheControl(L1D,READ,MISS) , "L1D_READ_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS...."} { PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,MISS) , "L1D_WRITE_MISS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,WRITE,ACCESS) , "L1D_WRITE_ACCESS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,MISS) , "L1D_PREFETCH_MISS..",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1D,PREFETCH,ACCESS) , "L1D_PREFETCH_ACCESS",L1D_READ_ACCESS},
// 11
#endif #endif
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,MISS) , "LL_READ_MISS.......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,READ,ACCESS) , "LL_READ_ACCESS.....",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,MISS) , "LL_WRITE_MISS......",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,WRITE,ACCESS) , "LL_WRITE_ACCESS....",L1D_READ_ACCESS},
//15
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,MISS) , "LL_PREFETCH_MISS...",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(LL,PREFETCH,ACCESS) , "LL_PREFETCH_ACCESS.",L1D_READ_ACCESS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,MISS) , "L1I_READ_MISS......",INSTRUCTIONS},
{ PERF_TYPE_HW_CACHE, CacheControl(L1I,READ,ACCESS) , "L1I_READ_ACCESS....",INSTRUCTIONS}
//19
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" }, // { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif
}; };
} }

View File

@ -58,6 +58,27 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
} }
#endif #endif
#ifdef TIMERS_OFF
inline uint64_t cyclecount(void){
return 0;
}
#define __SSC_MARK(mark) __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(mark):"%ebx")
#define __SSC_STOP __SSC_MARK(0x110)
#define __SSC_START __SSC_MARK(0x111)
#else
#define __SSC_MARK(mark)
#define __SSC_STOP
#define __SSC_START
/*
* cycle counters arch dependent
*/
#ifdef __bgq__ #ifdef __bgq__
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
uint64_t tmp; uint64_t tmp;
@ -65,18 +86,20 @@ inline uint64_t cyclecount(void){
return tmp; return tmp;
} }
#elif defined __x86_64__ #elif defined __x86_64__
#include <immintrin.h>
#ifndef __INTEL_COMPILER
#include <x86intrin.h> #include <x86intrin.h>
#endif
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
return __rdtsc(); return __rdtsc();
// unsigned int dummy;
// return __rdtscp(&dummy);
} }
#else #else
#warning No cycle counter implemented for this architecture
inline uint64_t cyclecount(void){ inline uint64_t cyclecount(void){
return 0; return 0;
} }
#endif
#endif #endif
class PerformanceCounter { class PerformanceCounter {
@ -87,6 +110,7 @@ private:
uint32_t type; uint32_t type;
uint64_t config; uint64_t config;
const char *name; const char *name;
int normalisation;
} PerformanceCounterConfig; } PerformanceCounterConfig;
static const PerformanceCounterConfig PerformanceCounterConfigs []; static const PerformanceCounterConfig PerformanceCounterConfigs [];
@ -94,26 +118,12 @@ private:
public: public:
enum PerformanceCounterType { enum PerformanceCounterType {
CPUCYCLES=0, CACHE_REFERENCES=0,
INSTRUCTIONS, CACHE_MISSES=1,
// STALL_CYCLES, CPUCYCLES=2,
CACHE_REFERENCES, INSTRUCTIONS=3,
CACHE_MISSES, L1D_READ_ACCESS=4,
L1D_READ_MISS, PERFORMANCE_COUNTER_NUM_TYPES=19
L1D_READ_ACCESS,
L1D_WRITE_MISS,
L1D_WRITE_ACCESS,
L1D_PREFETCH_MISS,
L1D_PREFETCH_ACCESS,
LL_READ_MISS,
// LL_READ_ACCESS,
LL_WRITE_MISS,
LL_WRITE_ACCESS,
LL_PREFETCH_MISS,
LL_PREFETCH_ACCESS,
L1I_READ_MISS,
L1I_READ_ACCESS,
PERFORMANCE_COUNTER_NUM_TYPES
}; };
public: public:
@ -121,7 +131,9 @@ public:
int PCT; int PCT;
long long count; long long count;
long long cycles;
int fd; int fd;
int cyclefd;
unsigned long long elapsed; unsigned long long elapsed;
uint64_t begin; uint64_t begin;
@ -134,7 +146,9 @@ public:
assert(_pct>=0); assert(_pct>=0);
assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES); assert(_pct<PERFORMANCE_COUNTER_NUM_TYPES);
fd=-1; fd=-1;
cyclefd=-1;
count=0; count=0;
cycles=0;
PCT =_pct; PCT =_pct;
Open(); Open();
#endif #endif
@ -159,6 +173,15 @@ public:
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name); fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
perror("Error is"); perror("Error is");
} }
int norm = PerformanceCounterConfigs[PCT].normalisation;
pe.type = PerformanceCounterConfigs[norm].type;
pe.config= PerformanceCounterConfigs[norm].config;
name = PerformanceCounterConfigs[norm].name;
cyclefd = perf_event_open(&pe, 0, -1, -1, 0); // pid 0, cpu -1 current process any cpu. group -1
if (cyclefd == -1) {
fprintf(stderr, "Error opening leader %llx for event %s\n", pe.config,name);
perror("Error is");
}
#endif #endif
} }
@ -168,6 +191,8 @@ public:
if ( fd!= -1) { if ( fd!= -1) {
::ioctl(fd, PERF_EVENT_IOC_RESET, 0); ::ioctl(fd, PERF_EVENT_IOC_RESET, 0);
::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); ::ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_RESET, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_ENABLE, 0);
} }
begin =cyclecount(); begin =cyclecount();
#else #else
@ -177,10 +202,13 @@ public:
void Stop(void) { void Stop(void) {
count=0; count=0;
cycles=0;
#ifdef __linux__ #ifdef __linux__
if ( fd!= -1) { if ( fd!= -1) {
::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ::ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
::ioctl(cyclefd, PERF_EVENT_IOC_DISABLE, 0);
::read(fd, &count, sizeof(long long)); ::read(fd, &count, sizeof(long long));
::read(cyclefd, &cycles, sizeof(long long));
} }
elapsed = cyclecount() - begin; elapsed = cyclecount() - begin;
#else #else
@ -190,7 +218,11 @@ public:
} }
void Report(void) { void Report(void) {
#ifdef __linux__ #ifdef __linux__
std::printf("%llu cycles %s = %20llu\n", elapsed , PerformanceCounterConfigs[PCT].name, count); int N = PerformanceCounterConfigs[PCT].normalisation;
const char * sn = PerformanceCounterConfigs[N].name ;
const char * sc = PerformanceCounterConfigs[PCT].name;
std::printf("tsc = %llu %s = %llu %s = %20llu\n (%s/%s) rate = %lf\n", elapsed,sn ,cycles,
sc, count, sc,sn, (double)count/(double)cycles);
#else #else
std::printf("%llu cycles \n", elapsed ); std::printf("%llu cycles \n", elapsed );
#endif #endif
@ -199,7 +231,7 @@ public:
~PerformanceCounter() ~PerformanceCounter()
{ {
#ifdef __linux__ #ifdef __linux__
::close(fd); ::close(fd); ::close(cyclefd);
#endif #endif
} }

View File

@ -24,7 +24,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_SIMD_H #ifndef GRID_SIMD_H
@ -118,6 +119,14 @@ namespace Grid {
inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));} inline ComplexD timesI(const ComplexD &r) { return(r*ComplexD(0.0,1.0));}
inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));} inline ComplexF timesMinusI(const ComplexF &r){ return(r*ComplexF(0.0,-1.0));}
inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));} inline ComplexD timesMinusI(const ComplexD &r){ return(r*ComplexD(0.0,-1.0));}
// define projections to real and imaginary parts
// projReal: keeps the real part of r, imaginary part of the result is 0.0
inline ComplexF projReal(const ComplexF &r){return( ComplexF(std::real(r), 0.0));}
inline ComplexD projReal(const ComplexD &r){return( ComplexD(std::real(r), 0.0));}
// projImag: NOTE the imaginary part of r is returned in the REAL slot of the
// result (imaginary slot is 0.0), i.e. Im(r) as a real number, not i*Im(r).
inline ComplexF projImag(const ComplexF &r){return (ComplexF(std::imag(r), 0.0 ));}
inline ComplexD projImag(const ComplexD &r){return (ComplexD(std::imag(r), 0.0));}
// define auxiliary functions for complex computations
inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);} inline void timesI(ComplexF &ret,const ComplexF &r) { ret = timesI(r);}
inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);} inline void timesI(ComplexD &ret,const ComplexD &r) { ret = timesI(r);}
inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);} inline void timesMinusI(ComplexF &ret,const ComplexF &r){ ret = timesMinusI(r);}
@ -163,8 +172,8 @@ namespace Grid {
}; };
#include <simd/Grid_vector_types.h> #include "simd/Grid_vector_types.h"
#include <simd/Grid_vector_unops.h> #include "simd/Grid_vector_unops.h"
namespace Grid { namespace Grid {
// Default precision // Default precision

247
lib/Stat.cc Normal file
View File

@ -0,0 +1,247 @@
#include <Grid.h>
#include <PerfCount.h>
#include <Stat.h>
namespace Grid {
// Set once the shared PMU setup (pmu_init) has been run.
bool PmuStat::pmu_initialized = false;

// Attach the label `regname` to this statistics object, run the one-off PMU
// setup if nobody has done so yet, and zero the accumulated statistics.
// Compiles to a no-op on anything other than x86_64.
void PmuStat::init(const char *regname)
{
#ifdef __x86_64__
  name = regname;

  const bool first_use = !pmu_initialized;
  if (first_use) {
    std::cout << "initialising pmu" << std::endl;
    pmu_initialized = true;   // flag is raised before pmu_init() is called
    pmu_init();
  }

  clear();
#endif
}
// Reset every cumulative statistic to zero (no-op off x86_64).
// The start/end scratch values (tstart, tend, mr*/mw*) are not touched.
void PmuStat::clear(void)
{
#ifdef __x86_64__
  // invocation count and time-stamp totals
  count = tregion = tcycles = 0;
  // programmable and fixed counter totals
  pmc0 = pmc1 = 0;
  inst = cyc = ref = 0;
  // memory traffic totals
  reads = writes = 0;
#endif
}
// Dump the region label followed by every cumulative statistic, one per
// line, to std::cout (no-op off x86_64).
void PmuStat::print(void)
{
#ifdef __x86_64__
  // one "<label><value>" row, flushed exactly like the header-less rows before
  auto row = [](const char *label, uint64_t value) {
    std::cout << label << value << std::endl;
  };

  std::cout << "Reg " << std::string(name) << ":\n";
  row(" region ", tregion);
  row(" cycles ", tcycles);
  row(" inst ",   inst);
  row(" cyc ",    cyc);
  row(" ref ",    ref);
  row(" pmc0 ",   pmc0);
  row(" pmc1 ",   pmc1);
  row(" count ",  count);
  row(" reads ",  reads);
  row(" writes ", writes);
#endif
}
// Open a measured region: call pmu_start(), bump the invocation count,
// snapshot the memory read/write counters into mrstart/mwstart and record
// the time-stamp counter in tstart. accum() closes the region.
// No-op off x86_64.
void PmuStat::start(void)
{
#ifdef __x86_64__
pmu_start();
++count;
xmemctrs(&mrstart, &mwstart);
// tsc is sampled last so the counter reads above are outside the timed span
tstart = __rdtsc();
#endif
}
// Record the starting counter values for slot t (accum() sums slots
// 0..nthreads-1, so t is presumably the calling thread's index).
// Rows: 0,1 = __rdpmc(0), __rdpmc(1); 2,3,4 = __rdpmc((1<<30)|0..2), which
// accum() adds to inst, cyc and ref; 5 = time-stamp counter.
// NOTE(review): t is not range-checked against the second dimension of
// `counters` — callers must keep it in bounds.
void PmuStat::enter(int t)
{
#ifdef __x86_64__
counters[0][t] = __rdpmc(0);
counters[1][t] = __rdpmc(1);
counters[2][t] = __rdpmc((1<<30)|0);
counters[3][t] = __rdpmc((1<<30)|1);
counters[4][t] = __rdpmc((1<<30)|2);
counters[5][t] = __rdtsc();
#endif
}
// Close slot t: re-read the same six counters as enter(t) and overwrite each
// stored start value with (current - start), i.e. the delta for this slot.
// Must be paired with a preceding enter(t); t is not range-checked.
// Note: because this member is named `exit`, code inside PmuStat has to write
// ::exit(...) to reach the C library function.
void PmuStat::exit(int t)
{
#ifdef __x86_64__
counters[0][t] = __rdpmc(0) - counters[0][t];
counters[1][t] = __rdpmc(1) - counters[1][t];
counters[2][t] = __rdpmc((1<<30)|0) - counters[2][t];
counters[3][t] = __rdpmc((1<<30)|1) - counters[3][t];
counters[4][t] = __rdpmc((1<<30)|2) - counters[4][t];
counters[5][t] = __rdtsc() - counters[5][t];
#endif
}
// Close the region opened by start() and fold the results into the
// cumulative totals:
//  - per-slot deltas left by exit(t) for t in [0, nthreads) are added to
//    pmc0, pmc1, inst, cyc, ref and tcycles;
//  - (tend - tstart) is added to tregion;
//  - the memory read/write counter differences are added to reads/writes.
// NOTE(review): nthreads is not checked against the second dimension of
// `counters`; a larger value reads past the array.
void PmuStat::accum(int nthreads)
{
#ifdef __x86_64__
// tsc is sampled first so the counter reads below are outside the timed span
tend = __rdtsc();
xmemctrs(&mrend, &mwend);
pmu_stop();
for (int t = 0; t < nthreads; ++t) {
pmc0 += counters[0][t];
pmc1 += counters[1][t];
inst += counters[2][t];
cyc += counters[3][t];
ref += counters[4][t];
tcycles += counters[5][t];
}
uint64_t region = tend - tstart;
tregion += region;
uint64_t mreads = mrend - mrstart;
reads += mreads;
uint64_t mwrites = mwend - mwstart;
writes += mwrites;
#endif
}
// PMU lifecycle hooks. All three are intentionally empty here; start() and
// accum() call pmu_start()/pmu_stop() around each measured region.
void PmuStat::pmu_fini(void)  {}
void PmuStat::pmu_start(void) {}
void PmuStat::pmu_stop(void)  {}
// One-off PMU setup, invoked from init() the first time any PmuStat is
// initialised. Only does work when _KNIGHTS_LANDING_ is defined, in which
// case it opens the uncore memory counters via KNLsetup().
void PmuStat::pmu_init(void)
{
#ifdef _KNIGHTS_LANDING_
  KNLsetup();
#endif
}
// Report the current memory read and write counter totals through *mr / *mw.
// With _KNIGHTS_LANDING_ defined these are the sums of the EDC read and EDC
// write counters over all NEDC channels; otherwise both are set to zero.
void PmuStat::xmemctrs(uint64_t *mr, uint64_t *mw)
{
#ifdef _KNIGHTS_LANDING_
  ctrs snapshot;
  KNLreadctrs(snapshot);

  uint64_t read_total  = 0;
  uint64_t write_total = 0;
  for (int edc = 0; edc < NEDC; ++edc) {
    read_total  += snapshot.edcrd[edc];
    write_total += snapshot.edcwr[edc];
  }

  *mr = read_total;
  *mw = write_total;
#else
  *mw = 0;
  *mr = 0;
#endif
}
#ifdef _KNIGHTS_LANDING_
struct knl_gbl_ PmuStat::gbl;
#define PMU_MEM
// Open one uncore performance-counter event and hand back its descriptor.
//
//   ename : sysfs directory of the PMU box; "<ename>/type" is read to obtain
//           the perf event type number
//   fd    : receives the descriptor returned by perf_event_open
//   event : event code, ORed into the low bits of the config word
//   umask : unit mask, shifted left by 8 and ORed into the config word
//
// Every failure is fatal: a diagnostic is written to stderr and the process
// terminates with a non-zero exit status.
void PmuStat::KNLevsetup(const char *ename, int &fd, int event, int umask)
{
  char fname[1024];
  snprintf(fname, sizeof(fname), "%s/type", ename);
  FILE *fp = fopen(fname, "r");
  if (fp == 0) {
    ::perror("PmuStat::KNLevsetup: fopen");   // errno is still fopen's here
    ::fprintf(stderr, "open %s\n", fname);
    // ::exit, not exit: inside PmuStat the bare name finds PmuStat::exit(int)
    ::exit(EXIT_FAILURE);
  }

  // Checked explicitly rather than with assert so the test survives NDEBUG
  // builds and `type` is never used uninitialised.
  int type = 0;
  const int ret = fscanf(fp, "%d", &type);
  fclose(fp);
  if (ret != 1) {
    ::fprintf(stderr, "cannot parse PMU type from %s\n", fname);
    ::exit(EXIT_FAILURE);
  }
  // std::cout << "Using PMU type "<<type<<" from " << std::string(ename) <<std::endl;

  struct perf_event_attr hw = {};
  hw.size = sizeof(hw);
  hw.type = type;
  // see /sys/devices/uncore_*/format/*
  // All of the events we are interested in are configured the same way, but
  // that isn't always true. Proper code would parse the format files
  hw.config = event | (umask << 8);
  //hw.read_format = PERF_FORMAT_GROUP;
  // unfortunately the above only works within a single PMU; might
  // as well just read them one at a time

  int cpu = 0;
  fd = perf_event_open(&hw, -1, cpu, -1, 0);
  if (fd == -1) {
    ::perror("PmuStat::KNLevsetup: perf_event_open");
    // config is a 64-bit field: print it as unsigned long long
    ::fprintf(stderr, "CPU %d, box %s, event 0x%llx\n", cpu, ename,
              static_cast<unsigned long long>(hw.config));
    ::exit(EXIT_FAILURE);
  }
  // std::cout << "event "<<std::string(ename)<<" set up for fd "<<fd<<" hw.config "<<hw.config <<std::endl;
}
// Open every uncore event used for memory-traffic accounting and store the
// resulting descriptors in the shared `gbl` table. Each descriptor is opened
// through KNLevsetup(path, fd, event, umask).
void PmuStat::KNLsetup(void){
  char path[1024];

  // MC RPQ inserts and WPQ inserts (reads & writes)
  for (int mc = 0; mc < NMC; ++mc) {
    ::snprintf(path, sizeof(path), "/sys/devices/uncore_imc_%d",mc);
    KNLevsetup(path, gbl.mc_rd[mc], 0x1, 0x1);   // RPQ Inserts
    KNLevsetup(path, gbl.mc_wr[mc], 0x2, 0x1);   // WPQ Inserts
  }

  // EDC RPQ inserts and WPQ inserts
  for (int ch = 0; ch < NEDC; ++ch) {
    ::snprintf(path, sizeof(path), "/sys/devices/uncore_edc_eclk_%d",ch);
    KNLevsetup(path, gbl.edc_rd[ch], 0x1, 0x1);  // RPQ inserts
    KNLevsetup(path, gbl.edc_wr[ch], 0x2, 0x1);  // WPQ inserts
  }

  // EDC HitE, HitM, MissE, MissM
  for (int ch = 0; ch < NEDC; ++ch) {
    ::snprintf(path, sizeof(path), "/sys/devices/uncore_edc_uclk_%d", ch);
    KNLevsetup(path, gbl.edc_hite[ch],  0x2, 0x1);
    KNLevsetup(path, gbl.edc_hitm[ch],  0x2, 0x2);
    KNLevsetup(path, gbl.edc_misse[ch], 0x2, 0x4);
    KNLevsetup(path, gbl.edc_missm[ch], 0x2, 0x8);
  }
}
// Read one 64-bit counter value from the perf event descriptor `fd`.
// A failed or short read is fatal: the byte count returned by read() is
// reported on stderr and the process terminates with a non-zero exit status.
uint64_t PmuStat::KNLreadctr(int fd)
{
  uint64_t data = 0;
  // read() returns ssize_t (-1 on error); keep it signed so the diagnostic
  // shows the real return value
  const ssize_t got = ::read(fd, &data, sizeof(data));
  if (got != static_cast<ssize_t>(sizeof(data))) {
    ::fprintf(stderr, "read counter %zd\n", got);
    // ::exit, not exit: inside PmuStat the bare name finds PmuStat::exit(int)
    ::exit(EXIT_FAILURE);
  }
  return data;
}
// Fill `c` with the current value of every counter opened by KNLsetup():
// first the memory-controller read/write pairs, then the EDC read/write
// pairs, then the EDC hit/miss counters.
void PmuStat::KNLreadctrs(ctrs &c)
{
  for (int mc = 0; mc < NMC; ++mc) {
    c.mcrd[mc] = KNLreadctr(gbl.mc_rd[mc]);
    c.mcwr[mc] = KNLreadctr(gbl.mc_wr[mc]);
  }

  for (int ch = 0; ch < NEDC; ++ch) {
    c.edcrd[ch] = KNLreadctr(gbl.edc_rd[ch]);
    c.edcwr[ch] = KNLreadctr(gbl.edc_wr[ch]);
  }

  for (int ch = 0; ch < NEDC; ++ch) {
    c.edchite[ch]  = KNLreadctr(gbl.edc_hite[ch]);
    c.edchitm[ch]  = KNLreadctr(gbl.edc_hitm[ch]);
    c.edcmisse[ch] = KNLreadctr(gbl.edc_misse[ch]);
    c.edcmissm[ch] = KNLreadctr(gbl.edc_missm[ch]);
  }
}
#endif
}

104
lib/Stat.h Normal file
View File

@ -0,0 +1,104 @@
#ifndef _GRID_STAT_H
#define _GRID_STAT_H
#ifdef AVX512
#define _KNIGHTS_LANDING_ROOTONLY
#endif
namespace Grid {
///////////////////////////////////////////////////////////////////////////////
// Extra KNL counters from MCDRAM
///////////////////////////////////////////////////////////////////////////////
#ifdef _KNIGHTS_LANDING_
#define NMC 6
#define NEDC 8
// One snapshot of all uncore counter values, filled by PmuStat::KNLreadctrs.
// Arrays are indexed by memory controller (NMC entries) or EDC channel
// (NEDC entries); each field mirrors the descriptor of the same name in
// knl_gbl_.
struct ctrs
{
uint64_t mcrd[NMC]; // memory controller reads
uint64_t mcwr[NMC]; // memory controller writes
uint64_t edcrd[NEDC]; // EDC reads
uint64_t edcwr[NEDC]; // EDC writes
uint64_t edchite[NEDC]; // EDC HitE
uint64_t edchitm[NEDC]; // EDC HitM
uint64_t edcmisse[NEDC]; // EDC MissE
uint64_t edcmissm[NEDC]; // EDC MissM
};
// Peter/Azusa:
// Our modification of a code provided by Larry Meadows from Intel
// Verified by email exchange non-NDA, ok for github. Should be as uses /sys/devices/ FS
// so is already public and in the linux kernel for KNL.
// File descriptors of the perf events opened by PmuStat::KNLsetup, one per
// memory controller (NMC) or EDC channel (NEDC). PmuStat keeps a single
// static instance (PmuStat::gbl) that KNLreadctrs reads from.
struct knl_gbl_
{
int mc_rd[NMC]; // memory controller RPQ inserts (reads)
int mc_wr[NMC]; // memory controller WPQ inserts (writes)
int edc_rd[NEDC]; // EDC RPQ inserts
int edc_wr[NEDC]; // EDC WPQ inserts
int edc_hite[NEDC]; // EDC HitE
int edc_hitm[NEDC]; // EDC HitM
int edc_misse[NEDC]; // EDC MissE
int edc_missm[NEDC]; // EDC MissM
};
#endif
///////////////////////////////////////////////////////////////////////////////
// Accumulates hardware counter statistics for one named code region.
// Usage implied by the member functions: init(name) once, then per
// invocation start(); enter(t)/exit(t) in each slot t; accum(nthreads);
// and finally print(). There is no constructor: members hold indeterminate
// values until init()/clear() has run.
class PmuStat
{
// per-slot scratch: counters[row][t], written by enter(t)/exit(t)
uint64_t counters[8][256];
#ifdef _KNIGHTS_LANDING_
static struct knl_gbl_ gbl;
#endif
const char *name;
uint64_t reads; // memory reads
uint64_t writes; // memory writes
uint64_t mrstart; // memory read counter at start of parallel region
uint64_t mrend; // memory read counter at end of parallel region
uint64_t mwstart; // memory write counter at start of parallel region
uint64_t mwend; // memory write counter at end of parallel region
// cumulative counters
uint64_t count; // number of invocations
uint64_t tregion; // total time in parallel region (from thread 0)
uint64_t tcycles; // total cycles inside parallel region
uint64_t inst, ref, cyc; // fixed counters
uint64_t pmc0, pmc1;// pmu
// add memory counters here
// temp variables
uint64_t tstart; // tsc at start of parallel region
uint64_t tend; // tsc at end of parallel region
// map for the rows of `counters` as used by enter()/exit()/accum():
// 0 pmc0   (start value after enter, delta after exit)
// 1 pmc1
// 2 inst
// 3 cyc
// 4 ref
// 5 tsc    (summed into tcycles)
// rows 6 and 7 are not used by enter()/exit()/accum()
static bool pmu_initialized;
public:
// true once init() has triggered the shared pmu_init()
static bool is_init(void){ return pmu_initialized;}
static void pmu_init(void);
static void pmu_fini(void);
static void pmu_start(void);
static void pmu_stop(void);
// close the region opened by start() and add slot deltas 0..nthreads-1 to the totals
void accum(int nthreads);
// current memory read/write counter totals (zero without _KNIGHTS_LANDING_)
static void xmemctrs(uint64_t *mr, uint64_t *mw);
void start(void);
void enter(int t);
void exit(int t);
void print(void);
void init(const char *regname);
void clear(void);
#ifdef _KNIGHTS_LANDING_
static void KNLsetup(void);
static uint64_t KNLreadctr(int fd);
static void KNLreadctrs(ctrs &c);
static void KNLevsetup(const char *ename, int &fd, int event, int umask);
#endif
};
}
#endif

View File

@ -30,7 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <thread> #include <thread>
#include <stencil/Lebesgue.h> // subdir aggregate #include <Grid/stencil/Lebesgue.h> // subdir aggregate
////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////
// Must not lose sight that goal is to be able to construct really efficient // Must not lose sight that goal is to be able to construct really efficient
@ -70,17 +70,66 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
// Build the gather table for one plane of `dimension`.
//
// Each entry is a pair (buffer index, lattice offset relative to the start
// of the plane). The table holds only plane-relative values: `plane` and
// `off` do not influence it and are kept solely for interface
// compatibility.
//
//   cbmask : checkerboard selection mask; forced to 0x3 (take every site)
//            when `dimension` is not checkerboarded
//   table  : cleared and refilled; sized e1*e2 for the full plane or
//            e1*e2/2 when a single checkerboard is selected
inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                               int off,std::vector<std::pair<int,int> > & table)
{
  table.resize(0);

  if ( !grid->CheckerBoarded(dimension) ) {
    cbmask = 0x3;
  }

  const int e1     = grid->_slice_nblock[dimension];
  const int e2     = grid->_slice_block[dimension];
  const int stride = grid->_slice_stride[dimension];

  if ( cbmask == 0x3 ) {
    // every site of the plane, in block order
    table.resize(e1*e2);
    for(int n=0;n<e1;n++){
      const int o  = n*stride;   // lattice offset of block n
      const int bo = n*e2;       // buffer offset of block n
      for(int b=0;b<e2;b++){
        table[bo+b]=std::pair<int,int>(bo+b,o+b);
      }
    }
  } else {
    // only the sites whose checkerboard bit is set in cbmask, packed densely
    int bo=0;
    table.resize(e1*e2/2);
    for(int n=0;n<e1;n++){
      const int o = n*stride;
      for(int b=0;b<e2;b++){
        const int ocb=1<<grid->CheckerBoardFromOindexTable(o+b);
        if ( ocb &cbmask ) {
          table[bo]=std::pair<int,int>(bo,o+b); bo++;
        }
      }
    }
  }
}
// Gather lattice sites into a contiguous buffer using a precomputed table.
// For every table entry (first, second):
//   buffer[off + first] = compress(rhs._odata[so + second])
//
//   table    : (buffer index, lattice offset) pairs
//   rhs      : source lattice
//   buffer   : destination; must be large enough for off + max(first)
//   compress : callable applied to each gathered site
//   off      : base index added to every buffer index
//   so       : base offset added to every lattice offset
//
// The loop must stay directly under PARALLEL_FOR_LOOP.
template<class vobj,class cobj,class compressor> void
Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
{
PARALLEL_FOR_LOOP
for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
}
}
struct StencilEntry { struct StencilEntry {
int _offset; uint64_t _offset;
int _is_local; uint64_t _byte_offset;
int _permute; uint16_t _is_local;
int _around_the_world; uint16_t _permute;
uint32_t _around_the_world; //256 bits, 32 bytes, 1/2 cacheline
}; };
template<class vobj,class cobj> template<class vobj,class cobj>
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in. class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
public: public:
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
typedef uint32_t StencilInteger; typedef uint32_t StencilInteger;
typedef typename cobj::vector_type vector_type; typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_type scalar_type;
@ -96,87 +145,60 @@ namespace Grid {
Integer to_rank; Integer to_rank;
Integer from_rank; Integer from_rank;
Integer bytes; Integer bytes;
volatile Integer done;
}; };
std::vector<Packet> Packets; std::vector<Packet> Packets;
#define SEND_IMMEDIATE int face_table_computed;
#define SERIAL_SENDS std::vector<std::vector<std::pair<int,int> > > face_table ;
void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
comms_bytes+=2.0*bytes;
#ifdef SEND_IMMEDIATE
commtime-=usecond();
_grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
commtime+=usecond();
#endif
Packet p; Packet p;
p.send_buf = xmit; p.send_buf = xmit;
p.recv_buf = rcv; p.recv_buf = rcv;
p.to_rank = to; p.to_rank = to;
p.from_rank= from; p.from_rank= from;
p.bytes = bytes; p.bytes = bytes;
p.done = 0;
comms_bytes+=2.0*bytes; comms_bytes+=2.0*bytes;
Packets.push_back(p); Packets.push_back(p);
} }
#ifdef SERIAL_SENDS void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
void Communicate(void ) { {
reqs.resize(Packets.size());
commtime-=usecond(); commtime-=usecond();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
#ifndef SEND_IMMEDIATE _grid->StencilSendToRecvFromBegin(reqs[i],
_grid->SendToRecvFrom(
Packets[i].send_buf, Packets[i].send_buf,
Packets[i].to_rank, Packets[i].to_rank,
Packets[i].recv_buf, Packets[i].recv_buf,
Packets[i].from_rank, Packets[i].from_rank,
Packets[i].bytes); Packets[i].bytes);
#endif /*
Packets[i].done = 1; }else{
_grid->SendToRecvFromBegin(reqs[i],
Packets[i].send_buf,
Packets[i].to_rank,
Packets[i].recv_buf,
Packets[i].from_rank,
Packets[i].bytes);
}
*/
} }
commtime+=usecond(); commtime+=usecond();
} }
#else void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
void Communicate(void ) { {
typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
std::vector<std::vector<CommsRequest_t> > reqs(Packets.size());
commtime-=usecond(); commtime-=usecond();
const int concurrency=2;
for(int i=0;i<Packets.size();i+=concurrency){ for(int i=0;i<Packets.size();i++){
for(int ii=0;ii<concurrency;ii++){ // if( ShmDirectCopy )
int j = i+ii; _grid->StencilSendToRecvFromComplete(reqs[i]);
if ( j<Packets.size() ) { // else
#ifndef SEND_IMMEDIATE // _grid->SendToRecvFromComplete(reqs[i]);
_grid->SendToRecvFromBegin(reqs[j],
Packets[j].send_buf,
Packets[j].to_rank,
Packets[j].recv_buf,
Packets[j].from_rank,
Packets[j].bytes);
#endif
}
}
for(int ii=0;ii<concurrency;ii++){
int j = i+ii;
if ( j<Packets.size() ) {
#ifndef SEND_IMMEDIATE
_grid->SendToRecvFromComplete(reqs[i]);
#endif
}
}
for(int ii=0;ii<concurrency;ii++){
int j = i+ii;
if ( j<Packets.size() ) {
Packets[j].done = 1;
}
}
} }
commtime+=usecond(); commtime+=usecond();
} }
#endif
/////////////////////////////////////////// ///////////////////////////////////////////
// Simd merge queue for asynch comms // Simd merge queue for asynch comms
@ -196,36 +218,19 @@ namespace Grid {
m.rpointers= rpointers; m.rpointers= rpointers;
m.buffer_size = buffer_size; m.buffer_size = buffer_size;
m.packet_id = packet_id; m.packet_id = packet_id;
#ifdef SEND_IMMEDIATE
mergetime-=usecond();
PARALLEL_FOR_LOOP
for(int o=0;o<m.buffer_size;o++){
merge1(m.mpointer[o],m.rpointers,o);
}
mergetime+=usecond();
#else
Mergers.push_back(m); Mergers.push_back(m);
#endif
} }
void CommsMerge(void ) { void CommsMerge(void ) {
//PARALLEL_NESTED_LOOP2
for(int i=0;i<Mergers.size();i++){ for(int i=0;i<Mergers.size();i++){
spintime-=usecond();
int packet_id = Mergers[i].packet_id;
while(! Packets[packet_id].done ); // spin for completion
spintime+=usecond();
#ifndef SEND_IMMEDIATE
mergetime-=usecond(); mergetime-=usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int o=0;o<Mergers[i].buffer_size;o++){ for(int o=0;o<Mergers[i].buffer_size;o++){
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o); merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
} }
mergetime+=usecond(); mergetime+=usecond();
#endif
} }
} }
@ -245,17 +250,59 @@ PARALLEL_FOR_LOOP
std::vector<int> _permute_type; std::vector<int> _permute_type;
// npoints x Osites() of these // npoints x Osites() of these
std::vector<std::vector<StencilEntry> > _entries; // Flat vector, change layout for cache friendly.
inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; } Vector<StencilEntry> _entries;
void PrecomputeByteOffsets(void){
for(int i=0;i<_entries.size();i++){
if( _entries[i]._is_local ) {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
} else {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
}
}
};
// Look up the stencil entry for (point, osite) in the flat _entries table,
// which is indexed as point + _npoints*osite. Also reports the permute type
// of this stencil point through ptype.
inline StencilEntry * GetEntry(int &ptype,int point,int osite) {
  ptype = _permute_type[point];
  StencilEntry *entry = & _entries[point+_npoints*osite];
  return entry;
}
// Decode stencil entry `ent` and return the address it refers to.
//   local : set to the entry's _is_local flag
//   perm  : set to the entry's _permute flag
//   ptype : written with _permute_type[point] only when perm is non-zero;
//           left untouched otherwise
// The returned address is the entry's _byte_offset added to `base` for a
// local entry, or to the start of u_recv_buf_p for a non-local one.
inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
  local = _entries[ent]._is_local;
  perm  = _entries[ent]._permute;
  if (perm) {
    ptype = _permute_type[point];
  }
  const uint64_t origin = local ? base : (uint64_t)&u_recv_buf_p[0];
  return origin + _entries[ent]._byte_offset;
}
// Address-only variant of GetInfo: returns the entry's _byte_offset added to
// `base` when the entry is local, or to the start of u_recv_buf_p when it is
// not. No permute information is reported.
inline uint64_t GetPFInfo(int ent,uint64_t base) {
  const int is_local = _entries[ent]._is_local;
  const uint64_t origin = is_local ? base : (uint64_t)&u_recv_buf_p[0];
  return origin + _entries[ent]._byte_offset;
}
///////////////////////////////////////////////////////////
// Unified Comms buffers for all directions
///////////////////////////////////////////////////////////
// Vectors that live on the symmetric heap in case of SHMEM
// std::vector<commVector<scalar_object> > u_simd_send_buf_hide;
// std::vector<commVector<scalar_object> > u_simd_recv_buf_hide;
// commVector<cobj> u_send_buf_hide;
// commVector<cobj> u_recv_buf_hide;
// These are used; either SHM objects or refs to the above symmetric heap vectors
// depending on comms target
cobj* u_recv_buf_p;
cobj* u_send_buf_p;
std::vector<scalar_object *> u_simd_send_buf;
std::vector<scalar_object *> u_simd_recv_buf;
// Comms buffers
std::vector<Vector<scalar_object> > u_simd_send_buf;
std::vector<Vector<scalar_object> > u_simd_recv_buf;
Vector<cobj> u_send_buf;
Vector<cobj> comm_buf;
int u_comm_offset; int u_comm_offset;
int _unified_buffer_size; int _unified_buffer_size;
// Accessor returning the unified receive buffer pointer (u_recv_buf_p).
cobj *CommBuf(void) { return u_recv_buf_p; }
///////////////////////////////////////// /////////////////////////////////////////
// Timing info; ugly; possibly temporary // Timing info; ugly; possibly temporary
///////////////////////////////////////// /////////////////////////////////////////
@ -271,6 +318,48 @@ PARALLEL_FOR_LOOP
double gathermtime; double gathermtime;
double splicetime; double splicetime;
double nosplicetime; double nosplicetime;
double t_data;
double t_table;
double calls;
// Reset every accumulated stencil timer and counter to zero so that a
// subsequent Report() only reflects work done after this call.
void ZeroCounters(void) {
  // Call and traffic counters.
  calls        = 0.;
  comms_bytes  = 0.;

  // Gather-side timers.
  halogtime    = 0.;
  gathertime   = 0.;
  gathermtime  = 0.;
  splicetime   = 0.;
  nosplicetime = 0.;
  t_table      = 0.0;
  t_data       = 0.0;

  // Communication and merge timers.
  commtime     = 0.;
  jointime     = 0.;
  spintime     = 0.;
  mergetime    = 0.;
};
// Print the accumulated stencil counters, each divided by the number of
// calls, to std::cout with the GridLogMessage prefix. Nothing is printed
// when calls is zero. The communication figures are printed only when
// more than one byte of traffic was recorded, and the bandwidth line is
// additionally skipped when commtime is not positive so that no division by
// zero is reported as a bandwidth.
void Report(void) {
// Helper: prints "<name> <value/calls>" for one counter. It is #undef'd at
// the end of the function so the name does not leak out of this header.
#define PRINTIT(A) \
std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
  if ( calls > 0. ) {
    std::cout << GridLogMessage << " Stencil calls "<<calls<<std::endl;
    PRINTIT(halogtime);
    PRINTIT(gathertime);
    PRINTIT(gathermtime);
    PRINTIT(mergetime);
    if(comms_bytes>1.0){
      PRINTIT(comms_bytes);
      PRINTIT(commtime);
      if ( commtime > 0. ) {
        // bytes per microsecond divided by 1000, labelled GB/s
        std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s "<<std::endl;
      }
    }
    PRINTIT(jointime);
    PRINTIT(spintime);
    PRINTIT(splicetime);
    PRINTIT(nosplicetime);
    PRINTIT(t_table);
    PRINTIT(t_data);
  }
#undef PRINTIT
};
#endif #endif
CartesianStencil(GridBase *grid, CartesianStencil(GridBase *grid,
@ -278,20 +367,9 @@ PARALLEL_FOR_LOOP
int checkerboard, int checkerboard,
const std::vector<int> &directions, const std::vector<int> &directions,
const std::vector<int> &distances) const std::vector<int> &distances)
: _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints) : _permute_type(npoints), _comm_buf_size(npoints)
{ {
#ifdef TIMING_HACK face_table_computed=0;
gathertime=0;
jointime=0;
commtime=0;
halogtime=0;
mergetime=0;
spintime=0;
gathermtime=0;
splicetime=0;
nosplicetime=0;
comms_bytes=0;
#endif
_npoints = npoints; _npoints = npoints;
_grid = grid; _grid = grid;
_directions = directions; _directions = directions;
@ -300,13 +378,12 @@ PARALLEL_FOR_LOOP
int osites = _grid->oSites(); int osites = _grid->oSites();
_entries.resize(_npoints* osites);
for(int ii=0;ii<npoints;ii++){ for(int ii=0;ii<npoints;ii++){
int i = ii; // reverse direction to get SIMD comms done first int i = ii; // reverse direction to get SIMD comms done first
int point = i; int point = i;
_entries[i].resize( osites);
int dimension = directions[i]; int dimension = directions[i];
int displacement = distances[i]; int displacement = distances[i];
int shift = displacement; int shift = displacement;
@ -353,16 +430,25 @@ PARALLEL_FOR_LOOP
} }
} }
} }
u_send_buf.resize(_unified_buffer_size);
comm_buf.resize(_unified_buffer_size);
/////////////////////////////////////////////////////////////////////////////////
// Try to allocate for receiving in a shared memory region, fall back to buffer
/////////////////////////////////////////////////////////////////////////////////
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
_grid->ShmBufferFreeAll();
u_simd_send_buf.resize(Nsimd); u_simd_send_buf.resize(Nsimd);
u_simd_recv_buf.resize(Nsimd); u_simd_recv_buf.resize(Nsimd);
u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
for(int l=0;l<Nsimd;l++){ for(int l=0;l<Nsimd;l++){
u_simd_send_buf[l].resize(_unified_buffer_size); u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
u_simd_recv_buf[l].resize(_unified_buffer_size); u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
} }
PrecomputeByteOffsets();
} }
void Local (int point, int dimension,int shiftpm,int cbmask) void Local (int point, int dimension,int shiftpm,int cbmask)
@ -506,10 +592,11 @@ PARALLEL_FOR_LOOP
// Simple block stride gather of SIMD objects // Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){ for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][lo+o+b]._offset =ro+o+b; int idx=point+(lo+o+b)*_npoints;
_entries[point][lo+o+b]._is_local=1; _entries[idx]._offset =ro+o+b;
_entries[point][lo+o+b]._permute=permute; _entries[idx]._permute=permute;
_entries[point][lo+o+b]._around_the_world=wrap; _entries[idx]._is_local=1;
_entries[idx]._around_the_world=wrap;
} }
o +=_grid->_slice_stride[dimension]; o +=_grid->_slice_stride[dimension];
} }
@ -526,10 +613,11 @@ PARALLEL_FOR_LOOP
int ocb=1<<_grid->CheckerBoardFromOindex(o+b); int ocb=1<<_grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) { if ( ocb&cbmask ) {
_entries[point][lo+o+b]._offset =ro+o+b; int idx = point+(lo+o+b)*_npoints;
_entries[point][lo+o+b]._is_local=1; _entries[idx]._offset =ro+o+b;
_entries[point][lo+o+b]._permute=permute; _entries[idx]._is_local=1;
_entries[point][lo+o+b]._around_the_world=wrap; _entries[idx]._permute=permute;
_entries[idx]._around_the_world=wrap;
} }
} }
@ -552,10 +640,11 @@ PARALLEL_FOR_LOOP
// Simple block stride gather of SIMD objects // Simple block stride gather of SIMD objects
for(int n=0;n<_grid->_slice_nblock[dimension];n++){ for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){ for(int b=0;b<_grid->_slice_block[dimension];b++){
_entries[point][so+o+b]._offset =offset+(bo++); int idx=point+(so+o+b)*_npoints;
_entries[point][so+o+b]._is_local=0; _entries[idx]._offset =offset+(bo++);
_entries[point][so+o+b]._permute=0; _entries[idx]._is_local=0;
_entries[point][so+o+b]._around_the_world=wrap; _entries[idx]._permute=0;
_entries[idx]._around_the_world=wrap;
} }
o +=_grid->_slice_stride[dimension]; o +=_grid->_slice_stride[dimension];
} }
@ -571,10 +660,11 @@ PARALLEL_FOR_LOOP
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) { if ( ocb & cbmask ) {
_entries[point][so+o+b]._offset =offset+(bo++); int idx = point+(so+o+b)*_npoints;
_entries[point][so+o+b]._is_local=0; _entries[idx]._offset =offset+(bo++);
_entries[point][so+o+b]._permute =0; _entries[idx]._is_local=0;
_entries[point][so+o+b]._around_the_world=wrap; _entries[idx]._permute =0;
_entries[idx]._around_the_world=wrap;
} }
} }
o +=_grid->_slice_stride[dimension]; o +=_grid->_slice_stride[dimension];
@ -582,35 +672,22 @@ PARALLEL_FOR_LOOP
} }
} }
template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
template<class compressor> {
std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) { std::vector<std::vector<CommsRequest_t> > reqs;
calls++;
Mergers.resize(0); Mergers.resize(0);
Packets.resize(0); Packets.resize(0);
_grid->StencilBarrier();
HaloGather(source,compress); HaloGather(source,compress);
return std::thread([&] { this->Communicate(); }); this->CommunicateBegin(reqs);
} _grid->StencilBarrier();
this->CommunicateComplete(reqs);
template<class compressor> _grid->StencilBarrier();
void HaloExchange(const Lattice<vobj> &source,compressor &compress)
{
Mergers.resize(0);
Packets.resize(0);
HaloGather(source,compress);
Communicate();
CommsMerge();
}
void HaloExchangeComplete(std::thread &thr)
{
CommsMerge(); // spins CommsMerge(); // spins
jointime-=usecond();
thr.join();
jointime+=usecond();
} }
template<class compressor> template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point)
{ {
int dimension = _directions[point]; int dimension = _directions[point];
int displacement = _distances[point]; int displacement = _distances[point];
@ -638,23 +715,23 @@ PARALLEL_FOR_LOOP
if ( sshift[0] == sshift[1] ) { if ( sshift[0] == sshift[1] ) {
if (splice_dim) { if (splice_dim) {
splicetime-=usecond(); splicetime-=usecond();
GatherSimd(source,dimension,shift,0x3,compress); GatherSimd(source,dimension,shift,0x3,compress,face_idx);
splicetime+=usecond(); splicetime+=usecond();
} else { } else {
nosplicetime-=usecond(); nosplicetime-=usecond();
Gather(source,dimension,shift,0x3,compress); Gather(source,dimension,shift,0x3,compress,face_idx);
nosplicetime+=usecond(); nosplicetime+=usecond();
} }
} else { } else {
if(splice_dim){ if(splice_dim){
splicetime-=usecond(); splicetime-=usecond();
GatherSimd(source,dimension,shift,0x1,compress);// if checkerboard is unfavourable take two passes GatherSimd(source,dimension,shift,0x1,compress,face_idx);// if checkerboard is unfavourable take two passes
GatherSimd(source,dimension,shift,0x2,compress);// both with block stride loop iteration GatherSimd(source,dimension,shift,0x2,compress,face_idx);// both with block stride loop iteration
splicetime+=usecond(); splicetime+=usecond();
} else { } else {
nosplicetime-=usecond(); nosplicetime-=usecond();
Gather(source,dimension,shift,0x1,compress); Gather(source,dimension,shift,0x1,compress,face_idx);
Gather(source,dimension,shift,0x2,compress); Gather(source,dimension,shift,0x2,compress,face_idx);
nosplicetime+=usecond(); nosplicetime+=usecond();
} }
} }
@ -668,21 +745,22 @@ PARALLEL_FOR_LOOP
assert(source._grid==_grid); assert(source._grid==_grid);
halogtime-=usecond(); halogtime-=usecond();
assert (comm_buf.size() == _unified_buffer_size );
u_comm_offset=0; u_comm_offset=0;
// Gather all comms buffers // Gather all comms buffers
int face_idx=0;
for(int point = 0 ; point < _npoints; point++) { for(int point = 0 ; point < _npoints; point++) {
compress.Point(point); compress.Point(point);
HaloGatherDir(source,compress,point); HaloGatherDir(source,compress,point,face_idx);
} }
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size); assert(u_comm_offset==_unified_buffer_size);
halogtime+=usecond(); halogtime+=usecond();
} }
template<class compressor> template<class compressor>
void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress) void Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx)
{ {
typedef typename cobj::vector_type vector_type; typedef typename cobj::vector_type vector_type;
typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_type scalar_type;
@ -719,31 +797,54 @@ PARALLEL_FOR_LOOP
int bytes = words * sizeof(cobj); int bytes = words * sizeof(cobj);
gathertime-=usecond(); gathertime-=usecond();
Gather_plane_simple (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset); int so = sx*rhs._grid->_ostride[dimension]; // base offset for start of plane
gathertime+=usecond(); if ( !face_table_computed ) {
t_table-=usecond();
face_table.resize(face_idx+1);
Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset,
face_table[face_idx]);
t_table+=usecond();
}
int rank = _grid->_processor; int rank = _grid->_processor;
int recv_from_rank; int recv_from_rank;
int xmit_to_rank; int xmit_to_rank;
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); _grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
assert (xmit_to_rank != _grid->ThisRank()); assert (xmit_to_rank != _grid->ThisRank());
assert (recv_from_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy /////////////////////////////////////////////////////////
AddPacket((void *)&u_send_buf[u_comm_offset], // try the direct copy if possible
(void *) &comm_buf[u_comm_offset], /////////////////////////////////////////////////////////
cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p);
if ( send_buf==NULL ) {
send_buf = u_send_buf_p;
}
// std::cout << " send_bufs "<<std::hex<< send_buf <<" ubp "<<u_send_buf_p <<std::dec<<std::endl;
t_data-=usecond();
assert(u_send_buf_p!=NULL);
assert(send_buf!=NULL);
Gather_plane_simple_table (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so); face_idx++;
t_data+=usecond();
AddPacket((void *)&send_buf[u_comm_offset],
(void *)&u_recv_buf_p[u_comm_offset],
xmit_to_rank, xmit_to_rank,
recv_from_rank, recv_from_rank,
bytes); bytes);
gathertime+=usecond();
u_comm_offset+=words; u_comm_offset+=words;
} }
} }
} }
template<class compressor> template<class compressor>
void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress) void GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx)
{ {
const int Nsimd = _grid->Nsimd(); const int Nsimd = _grid->Nsimd();
@ -822,10 +923,6 @@ PARALLEL_FOR_LOOP
auto rp = &u_simd_recv_buf[i ][u_comm_offset]; auto rp = &u_simd_recv_buf[i ][u_comm_offset];
auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset];
void *vrp = (void *)rp;
void *vsp = (void *)sp;
if(nbr_proc){ if(nbr_proc){
int recv_from_rank; int recv_from_rank;
@ -833,9 +930,17 @@ PARALLEL_FOR_LOOP
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes); scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp);
// if ((ShmDirectCopy==0)||(shm==NULL)) {
if (shm==NULL) {
shm = rp;
}
rpointers[i] = rp; // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node
// assuming above pointer flip
AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes);
rpointers[i] = shm;
} else { } else {
@ -844,7 +949,7 @@ PARALLEL_FOR_LOOP
} }
} }
AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1); AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
u_comm_offset +=buffer_size; u_comm_offset +=buffer_size;
} }

View File

@ -30,22 +30,22 @@ Author: neo <cossu@post.kek.jp>
#ifndef GRID_MATH_H #ifndef GRID_MATH_H
#define GRID_MATH_H #define GRID_MATH_H
#include <tensors/Tensor_traits.h> #include <Grid/tensors/Tensor_traits.h>
#include <tensors/Tensor_class.h> #include <Grid/tensors/Tensor_class.h>
#include <tensors/Tensor_arith.h> #include <Grid/tensors/Tensor_arith.h>
#include <tensors/Tensor_inner.h> #include <Grid/tensors/Tensor_inner.h>
#include <tensors/Tensor_outer.h> #include <Grid/tensors/Tensor_outer.h>
#include <tensors/Tensor_transpose.h> #include <Grid/tensors/Tensor_transpose.h>
#include <tensors/Tensor_trace.h> #include <Grid/tensors/Tensor_trace.h>
#include <tensors/Tensor_index.h> #include <Grid/tensors/Tensor_index.h>
#include <tensors/Tensor_Ta.h> #include <Grid/tensors/Tensor_Ta.h>
#include <tensors/Tensor_determinant.h> #include <Grid/tensors/Tensor_determinant.h>
#include <tensors/Tensor_exp.h> #include <Grid/tensors/Tensor_exp.h>
//#include <tensors/Tensor_peek.h> //#include <Grid/tensors/Tensor_peek.h>
//#include <tensors/Tensor_poke.h> //#include <Grid/tensors/Tensor_poke.h>
#include <tensors/Tensor_reality.h> #include <Grid/tensors/Tensor_reality.h>
#include <tensors/Tensor_unary.h> #include <Grid/tensors/Tensor_unary.h>
#include <tensors/Tensor_extract_merge.h> #include <Grid/tensors/Tensor_extract_merge.h>
#include <tensors/Tensor_logical.h> #include <Grid/tensors/Tensor_logical.h>
#endif #endif

View File

@ -37,7 +37,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_OMP #ifdef GRID_OMP
#include <omp.h> #include <omp.h>
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for ") #ifdef GRID_NUMA
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
#else
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(runtime)")
#endif
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)") #define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
#else #else
#define PARALLEL_FOR_LOOP #define PARALLEL_FOR_LOOP
@ -123,6 +127,22 @@ class GridThread {
ThreadBarrier(); ThreadBarrier();
}; };
// Copy len bytes from src to dst. Note the bcopy-style argument order:
// source first, destination second.
//
// With GRID_OMP the copy is split across the threads of a parallel region;
// GridThread::GetWorkBarrier supplies each thread's (mywork, myoff) share
// (presumably a contiguous slice of [0,len) -- confirm against its
// definition). Without GRID_OMP a single copy is performed.
//
// The copy itself uses the compiler builtin __builtin_memmove. An
// unqualified call to bcopy() from inside this class resolves to this very
// member function (class scope hides the C library routine), which would
// recurse without end; the builtin needs no header and cannot be hidden.
static void bcopy(const void *src, void *dst, size_t len) {
#ifdef GRID_OMP
#pragma omp parallel
  {
    const char *c_src = static_cast<const char *>(src);
    char *c_dest = static_cast<char *>(dst);
    int me,mywork,myoff;
    GridThread::GetWorkBarrier(len,me, mywork,myoff);
    // memmove takes (destination, source, count)
    __builtin_memmove(&c_dest[myoff],&c_src[myoff],mywork);
  }
#else
  // memmove takes (destination, source, count)
  __builtin_memmove(dst,src,len);
#endif
}
}; };
} }

View File

@ -39,7 +39,13 @@ namespace Grid {
// Dress the output; use std::chrono // Dress the output; use std::chrono
// C++11 time facilities better? // C++11 time facilities better?
double usecond(void);
// Time stamp in microseconds, built from the seconds and microseconds fields
// of a struct timeval filled by gettimeofday().
// When TIMERS_ON is not defined gettimeofday() is not called; tv is
// zero-initialised so the function then returns exactly 0.0 instead of
// reading an uninitialised structure.
inline double usecond(void) {
  struct timeval tv = {};
#ifdef TIMERS_ON
  gettimeofday(&tv,NULL);
#endif
  return 1.0*tv.tv_usec + 1.0e6*tv.tv_sec;
}
typedef std::chrono::system_clock GridClock; typedef std::chrono::system_clock GridClock;
typedef std::chrono::time_point<GridClock> GridTimePoint; typedef std::chrono::time_point<GridClock> GridTimePoint;
@ -63,17 +69,23 @@ public:
} }
void Start(void) { void Start(void) {
assert(running == false); assert(running == false);
#ifdef TIMERS_ON
start = GridClock::now(); start = GridClock::now();
#endif
running = true; running = true;
} }
void Stop(void) { void Stop(void) {
assert(running == true); assert(running == true);
#ifdef TIMERS_ON
accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start); accumulator+= std::chrono::duration_cast<GridUsecs>(GridClock::now()-start);
#endif
running = false; running = false;
}; };
void Reset(void){ void Reset(void){
running = false; running = false;
#ifdef TIMERS_ON
start = GridClock::now(); start = GridClock::now();
#endif
accumulator = std::chrono::duration_cast<GridUsecs>(start-start); accumulator = std::chrono::duration_cast<GridUsecs>(start-start);
} }
GridTime Elapsed(void) { GridTime Elapsed(void) {

View File

@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H #ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
#define GRID_ALGORITHM_COARSENED_MATRIX_H #define GRID_ALGORITHM_COARSENED_MATRIX_H
#include <Grid.h>
namespace Grid { namespace Grid {

View File

@ -28,7 +28,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHM_SPARSE_MATRIX_H #ifndef GRID_ALGORITHM_SPARSE_MATRIX_H
#define GRID_ALGORITHM_SPARSE_MATRIX_H #define GRID_ALGORITHM_SPARSE_MATRIX_H
#include <Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,8 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CHEBYSHEV_H #ifndef GRID_CHEBYSHEV_H
#define GRID_CHEBYSHEV_H #define GRID_CHEBYSHEV_H
#include<Grid.h> #include <Grid/algorithms/LinearOperator.h>
#include<algorithms/LinearOperator.h>
namespace Grid { namespace Grid {

View File

@ -18,10 +18,10 @@
#include <stddef.h> #include <stddef.h>
#include <Config.h> #include <Config.h>
#ifdef HAVE_GMP_H #ifdef HAVE_LIBGMP
#include <algorithms/approx/bigfloat.h> #include "bigfloat.h"
#else #else
#include <algorithms/approx/bigfloat_double.h> #include "bigfloat_double.h"
#endif #endif
#define JMAX 10000 //Maximum number of iterations of Newton's approximation #define JMAX 10000 //Maximum number of iterations of Newton's approximation

View File

@ -24,7 +24,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_H #ifndef GRID_CONJUGATE_GRADIENT_H
@ -40,14 +41,17 @@ namespace Grid {
template <class Field> template <class Field>
class ConjugateGradient : public OperatorFunction<Field> { class ConjugateGradient : public OperatorFunction<Field> {
public: public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
ConjugateGradient(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) { ConjugateGradient(RealD tol, Integer maxit, bool err_on_no_conv = true)
}; : Tolerance(tol),
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
Field &psi) {
psi.checkerboard = src.checkerboard; psi.checkerboard = src.checkerboard;
conformable(psi, src); conformable(psi, src);
@ -61,8 +65,10 @@ public:
RealD guess = norm2(psi); RealD guess = norm2(psi);
assert(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
Linop.HermOpAndNorm(psi, mmp, d, b); Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp; r = src - mmp;
p = r; p = r;
@ -70,12 +76,18 @@ public:
cp = a; cp = a;
ssq = norm2(src); ssq = norm2(src);
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: guess "<<guess<<std::endl; std::cout << GridLogIterative << std::setprecision(4)
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: src "<<ssq <<std::endl; << "ConjugateGradient: guess " << guess << std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mp "<<d <<std::endl; std::cout << GridLogIterative << std::setprecision(4)
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: mmp "<<b <<std::endl; << "ConjugateGradient: src " << ssq << std::endl;
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: cp,r "<<cp <<std::endl; std::cout << GridLogIterative << std::setprecision(4)
std::cout<<GridLogIterative <<std::setprecision(4)<< "ConjugateGradient: p "<<a <<std::endl; << "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
@ -84,7 +96,9 @@ public:
return; return;
} }
std::cout<<GridLogIterative << std::setprecision(4)<< "ConjugateGradient: k=0 residual "<<cp<<" target "<<rsq<<std::endl; std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq
<< std::endl;
GridStopWatch LinalgTimer; GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer; GridStopWatch MatrixTimer;
@ -93,7 +107,6 @@ public:
SolverTimer.Start(); SolverTimer.Start();
int k; int k;
for (k = 1; k <= MaxIterations; k++) { for (k = 1; k <= MaxIterations; k++) {
c = cp; c = cp;
MatrixTimer.Start(); MatrixTimer.Start();
@ -115,11 +128,11 @@ public:
p = p * b + r; p = p * b + r;
LinalgTimer.Stop(); LinalgTimer.Stop();
std::cout<<GridLogIterative<<"ConjugateGradient: Iteration " <<k<<" residual "<<cp<< " target "<< rsq<<std::endl; std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
// Stopping condition // Stopping condition
if (cp <= rsq) { if (cp <= rsq) {
SolverTimer.Stop(); SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq); Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src; p = mmp - src;
@ -130,20 +143,25 @@ public:
RealD resnorm = sqrt(norm2(p)); RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm; RealD true_residual = resnorm / srcnorm;
std::cout<<GridLogMessage<<"ConjugateGradient: Converged on iteration " <<k std::cout << GridLogMessage
<<" computed residual "<<sqrt(cp/ssq) << "ConjugateGradient: Converged on iteration " << k << std::endl;
<<" true residual " <<true_residual std::cout << GridLogMessage << "Computed residual " << sqrt(cp / ssq)
<<" target "<<Tolerance<<std::endl; << " true residual " << true_residual << " target "
std::cout<<GridLogMessage<<"Time elapsed: Total "<< SolverTimer.Elapsed() << " Matrix "<<MatrixTimer.Elapsed() << " Linalg "<<LinalgTimer.Elapsed(); << Tolerance << std::endl;
std::cout << GridLogMessage << "Time elapsed: Iterations "
<< SolverTimer.Elapsed() << " Matrix "
<< MatrixTimer.Elapsed() << " Linalg "
<< LinalgTimer.Elapsed();
std::cout << std::endl; std::cout << std::endl;
assert(true_residual/Tolerance < 1000.0); if (ErrorOnNoConverge) assert(true_residual / Tolerance < 1000.0);
return; return;
} }
} }
std::cout<<GridLogMessage<<"ConjugateGradient did NOT converge"<<std::endl; std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
assert(0); << std::endl;
if (ErrorOnNoConverge) assert(0);
} }
}; };
} }

View File

@ -0,0 +1,142 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
namespace Grid {
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public:
RealD Tolerance;
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
// Construct the mixed-precision solver.
//   tol        : stored as Tolerance
//   maxinnerit : stored as MaxInnerIterations
//   maxouterit : stored as MaxOuterIterations
//   _sp_grid   : grid used for the single-precision fields
//   _Linop_f   : single-precision linear operator (held by reference)
//   _Linop_d   : double-precision linear operator (held by reference)
// OuterLoopNormMult defaults to 100 and no guesser is installed.
// The initialisers are listed in member declaration order, which is the
// order in which they are actually executed.
MixedPrecisionConjugateGradient(RealD tol, Integer maxinnerit, Integer maxouterit, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d) :
  Tolerance(tol),
  MaxInnerIterations(maxinnerit),
  MaxOuterIterations(maxouterit),
  SinglePrecGrid(_sp_grid),
  OuterLoopNormMult(100.),
  Linop_f(_Linop_f),
  Linop_d(_Linop_d),
  guesser(NULL)
{ }
// Install g as the guesser. Only its address is stored, so g must outlive
// every later use of this solver.
void useGuesser(LinearFunction<FieldF> &g){
  LinearFunction<FieldF> *chosen = &g;
  this->guesser = chosen;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.checkerboard;
sol_d.checkerboard = cb;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in._grid;
FieldD tmp_d(DoublePrecGrid);
tmp_d.checkerboard = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.checkerboard = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = Tolerance;
FieldF src_f(SinglePrecGrid);
src_f.checkerboard = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.checkerboard = cb;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
for(Integer outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
zeroit(sol_f);
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
}
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
}
#endif

View File

@ -243,8 +243,6 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
if ( (!converged[s]) ){ if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz]; RealD css = c * z[s][iz]* z[s][iz];
if((k%100)==0 && (s==0) )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" css " <<css <<std::endl;
if(css<rsq[s]){ if(css<rsq[s]){
if ( ! converged[s] ) if ( ! converged[s] )

View File

@ -130,8 +130,8 @@ DenseMatrix<T> GetSubMtx(DenseMatrix<T> &A,int row_st, int row_end, int col_st,
} }
#include <algorithms/iterative/Householder.h> #include "Householder.h"
#include <algorithms/iterative/Francis.h> #include "Francis.h"
#endif #endif

View File

@ -33,8 +33,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifdef USE_LAPACK #ifdef USE_LAPACK
#include <lapacke.h> #include <lapacke.h>
#endif #endif
#include <algorithms/iterative/DenseMatrix.h> #include "DenseMatrix.h"
#include <algorithms/iterative/EigenSort.h> #include "EigenSort.h"
namespace Grid { namespace Grid {

View File

@ -29,7 +29,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_CARTESIAN_BASE_H #ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H #define GRID_CARTESIAN_BASE_H
#include <Grid.h>
namespace Grid{ namespace Grid{
@ -78,15 +77,12 @@ public:
// GridCartesian / GridRedBlackCartesian // GridCartesian / GridRedBlackCartesian
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0; virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(std::vector<int> site)=0; virtual int CheckerBoard(std::vector<int> &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
int CheckerBoardFromOindex (int Oindex){ virtual int CheckerBoardFromOindex (int Oindex)=0;
std::vector<int> ocoor; virtual int CheckerBoardFromOindexTable (int Oindex)=0;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations // Local layout calculations
@ -107,6 +103,12 @@ public:
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]); for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx; return idx;
} }
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline int oIndexReduced(std::vector<int> &ocoor) inline int oIndexReduced(std::vector<int> &ocoor)
{ {
int idx=0; int idx=0;
@ -123,12 +125,6 @@ public:
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// SIMD lane addressing // SIMD lane addressing
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
inline int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline void iCoorFromIindex(std::vector<int> &coor,int lane) inline void iCoorFromIindex(std::vector<int> &coor,int lane)
{ {
Lexicographic::CoorFromIndex(coor,lane,_simd_layout); Lexicographic::CoorFromIndex(coor,lane,_simd_layout);

View File

@ -39,10 +39,17 @@ class GridCartesian: public GridBase {
public: public:
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
virtual int CheckerBoardFromOindex (int Oindex)
{
return 0;
}
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
return 0; return 0;
} }
virtual int CheckerBoard(std::vector<int> site){ virtual int CheckerBoard(std::vector<int> &site){
return 0; return 0;
} }
virtual int CheckerBoardDestination(int cb,int shift,int dim){ virtual int CheckerBoardDestination(int cb,int shift,int dim){

View File

@ -37,24 +37,19 @@ namespace Grid {
static const int Even =CbRed; static const int Even =CbRed;
static const int Odd =CbBlack; static const int Odd =CbBlack;
// Perhaps these are misplaced and
// should be in sparse matrix.
// Also should make these a named enum type
static const int DaggerNo=0;
static const int DaggerYes=1;
// Specialise this for red black grids storing half the data like a chess board. // Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase class GridRedBlackCartesian : public GridBase
{ {
public: public:
std::vector<int> _checker_dim_mask; std::vector<int> _checker_dim_mask;
int _checker_dim; int _checker_dim;
std::vector<int> _checker_board;
virtual int CheckerBoarded(int dim){ virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1; if( dim==_checker_dim) return 1;
else return 0; else return 0;
} }
virtual int CheckerBoard(std::vector<int> site){ virtual int CheckerBoard(std::vector<int> &site){
int linear=0; int linear=0;
assert(site.size()==_ndimension); assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){ for(int d=0;d<_ndimension;d++){
@ -78,12 +73,20 @@ public:
// or by looping over x,y,z and multiply rather than computing checkerboard. // or by looping over x,y,z and multiply rather than computing checkerboard.
if ( (source_cb+ocb)&1 ) { if ( (source_cb+ocb)&1 ) {
return (shift)/2; return (shift)/2;
} else { } else {
return (shift+1)/2; return (shift+1)/2;
} }
} }
virtual int CheckerBoardFromOindexTable (int Oindex) {
return _checker_board[Oindex];
}
virtual int CheckerBoardFromOindex (int Oindex)
{
std::vector<int> ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){ virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != _checker_dim) return shift; if(dim != _checker_dim) return shift;
@ -170,9 +173,15 @@ public:
// Use a reduced simd grid // Use a reduced simd grid
_simd_layout[d] = simd_layout[d]; _simd_layout[d] = simd_layout[d];
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; _rdimensions[d]= _ldimensions[d]/_simd_layout[d];
assert(_rdimensions[d]>0);
// all elements of a simd vector must have same checkerboard. // all elements of a simd vector must have same checkerboard.
if ( simd_layout[d]>1 ) assert((_rdimensions[d]&0x1)==0); // If Ls vectorised, this must still be the case; e.g. dwf rb5d
if ( _simd_layout[d]>1 ) {
if ( checker_dim_mask[d] ) {
assert( (_rdimensions[d]&0x1) == 0 );
}
}
_osites *= _rdimensions[d]; _osites *= _rdimensions[d];
_isites *= _simd_layout[d]; _isites *= _simd_layout[d];
@ -185,6 +194,8 @@ public:
_ostride[d] = _ostride[d-1]*_rdimensions[d-1]; _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1]; _istride[d] = _istride[d-1]*_simd_layout[d-1];
} }
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
@ -206,6 +217,18 @@ public:
block = block*_rdimensions[d]; block = block*_rdimensions[d];
} }
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for(int d=0;d<_ndimension;d++){
rvol=rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for(int osite=0;osite<_osites;osite++){
_checker_board[osite] = CheckerBoardFromOindex (osite);
}
}; };
protected: protected:
virtual int oIndex(std::vector<int> &coor) virtual int oIndex(std::vector<int> &coor)
@ -221,6 +244,18 @@ protected:
return idx; return idx;
}; };
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) {
if( d==_checker_dim ) {
idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
} else {
idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
}
}
return idx;
}
}; };
} }

View File

View File

@ -0,0 +1,132 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_none.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
namespace Grid {
///////////////////////////////////////////////////////////////
// Info that is set up once and is independent of the cartesian layout
//
// Out-of-class definitions of CartesianCommunicator's static data members.
///////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;     // returned by CoreRank()
int CartesianCommunicator::ShmSize;     // returned by Cores()
int CartesianCommunicator::GroupRank;   // returned by NodeRank()
int CartesianCommunicator::GroupSize;   // returned by Nodes()
int CartesianCommunicator::WorldRank;   // returned by RankWorld()
int CartesianCommunicator::WorldSize;   // returned by Ranks()
int CartesianCommunicator::Slave;       // not read anywhere in this file
void * CartesianCommunicator::ShmCommBuf; // base of this rank's comms buffer; returned by ShmBufferSelf() when GRID_COMMS_MPI3 is not defined
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
  // Hand out the region starting at the current heap_top and advance heap_top
  // past it. No alignment rounding is applied to `bytes`:
  //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
  const auto base = heap_top;
  void *ptr = (void *)base;

  heap_bytes += bytes;          // running total handed out since the last ShmBufferFreeAll
  heap_top    = base + bytes;   // next allocation starts here

  std::cout <<"Shm alloc "<<ptr<<std::endl;

  // Checked after the counters are updated; the total must stay strictly below the cap
  assert(heap_bytes < MAX_MPI_SHM_BYTES);
  return ptr;
}
void CartesianCommunicator::ShmBufferFreeAll(void) {
  // Release everything at once: zero the byte count and rewind the
  // allocation cursor to the address returned by ShmBufferSelf().
  heap_bytes = 0;
  heap_top   = (size_t)ShmBufferSelf();
}
/////////////////////////////////
// Grid information queries
/////////////////////////////////
// Returns 1 when this rank's linear processor index is 0, otherwise 0.
int CartesianCommunicator::IsBoss(void)
{
  return (_processor == 0) ? 1 : 0;
}

// The boss is always linear rank 0.
int CartesianCommunicator::BossRank(void)
{
  return 0;
}

// Linear processor rank of the caller.
int CartesianCommunicator::ThisRank(void)
{
  return _processor;
}

// Coordinate of the caller within the processor grid.
const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void)
{
  return _processor_coor;
}

// Extent of the processor grid in each dimension.
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)
{
  return _processors;
}

// Total number of processors in the grid.
int CartesianCommunicator::ProcessorCount(void)
{
  return _Nprocessors;
}
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
// Static queries: each simply reports one of the class-wide static counters.
int CartesianCommunicator::RankWorld(void)
{
  return WorldRank;
}
int CartesianCommunicator::Ranks(void)
{
  return WorldSize;
}
int CartesianCommunicator::Nodes(void)
{
  return GroupSize;
}
int CartesianCommunicator::Cores(void)
{
  return ShmSize;
}
int CartesianCommunicator::NodeRank(void)
{
  return GroupRank;
}
int CartesianCommunicator::CoreRank(void)
{
  return ShmRank;
}
// Complex reductions are forwarded to the real-valued GlobalSumVector by
// viewing each complex number as two consecutive reals (hence the factor 2).
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
  float *reals = reinterpret_cast<float *>(&c);
  GlobalSumVector(reals,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
  float *reals = reinterpret_cast<float *>(c);
  GlobalSumVector(reals,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
  double *reals = reinterpret_cast<double *>(&c);
  GlobalSumVector(reals,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
  double *reals = reinterpret_cast<double *>(c);
  GlobalSumVector(reals,2*N);
}
#ifndef GRID_COMMS_MPI3
// Stencil comms when GRID_COMMS_MPI3 is not defined: the stencil entry points
// forward straight to the ordinary SendToRecvFrom routines.
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                       void *xmit, int xmit_to_rank,
                                                       void *recv, int recv_from_rank,
                                                       int bytes)
{
  SendToRecvFromBegin(list, xmit, xmit_to_rank, recv, recv_from_rank, bytes);
}

void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
{
  SendToRecvFromComplete(waitall);
}

// Nothing to synchronise in this build.
void CartesianCommunicator::StencilBarrier(void)
{
}
// Backing storage for the comms buffer when GRID_COMMS_MPI3 is not defined;
// sized and published through ShmCommBuf by ShmInitGeneric().
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;

// This rank's own buffer.
void *CartesianCommunicator::ShmBufferSelf(void)
{
  return ShmCommBuf;
}

// In this build no other rank's buffer is reachable: always NULL.
void *CartesianCommunicator::ShmBuffer(int rank)
{
  return NULL;
}

// Likewise no local pointer can be translated to a peer's address: always NULL.
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
  return NULL;
}
void CartesianCommunicator::ShmInitGeneric(void)
{
  // Size the backing vector to the full MAX_MPI_SHM_BYTES, then publish
  // the address of its first element as the comms buffer base.
  ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
  auto *base = &ShmBufStorageVector[0];
  ShmCommBuf = (void *)base;
}
#endif
}

View File

@ -34,77 +34,137 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifdef GRID_COMMS_MPI #ifdef GRID_COMMS_MPI
#include <mpi.h> #include <mpi.h>
#endif #endif
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM #ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h> #include <mpp/shmem.h>
#endif #endif
namespace Grid { namespace Grid {
class CartesianCommunicator { class CartesianCommunicator {
public: public:
// Communicator should know nothing of the physics grid, only processor grid. // 65536 ranks per node adequate for now
// 128MB shared memory for comms enought for 48^4 local vol comms
// Give external control (command line override?) of this
static const int MAXLOG2RANKSPERNODE = 16;
static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024;
// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes. std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension; unsigned long _ndimension;
#ifdef GRID_COMMS_MPI #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
MPI_Comm communicator; MPI_Comm communicator;
static MPI_Comm communicator_world;
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
typedef int CommsRequest_t; typedef int CommsRequest_t;
#endif #endif
////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls
////////////////////////////////////////////////////////////////////
// Longer term; drop this in favour of a master / slave model with
// cartesian communicator on a subset of ranks, slave ranks controlled
// by group leader with data xfer via shared memory
////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;
std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
static MPI_Win ShmWindow;
static MPI_Comm ShmComm;
std::vector<int> LexicographicToWorldRank;
static std::vector<void *> ShmCommBufs;
#else
static void ShmInitGeneric(void);
static commVector<uint8_t> ShmBufStorageVector;
#endif
static void * ShmCommBuf;
size_t heap_top;
size_t heap_bytes;
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
////////////////////////////////////////////////
// Must call in Grid startup
////////////////////////////////////////////////
static void Init(int *argc, char ***argv); static void Init(int *argc, char ***argv);
// Constructor ////////////////////////////////////////////////
// Constructor of any given grid
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &pdimensions_in); CartesianCommunicator(const std::vector<int> &pdimensions_in);
// Wraps MPI_Cart routines ////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest); void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor); int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor); void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
///////////////////////////////// /////////////////////////////////
// Grid information queries // Grid information and queries
///////////////////////////////// /////////////////////////////////
int IsBoss(void) { return _processor==0; }; static int ShmRank;
int BossRank(void) { return 0; }; static int ShmSize;
int ThisRank(void) { return _processor; }; static int GroupSize;
const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; }; static int GroupRank;
const std::vector<int> & ProcessorGrid(void) { return _processors; }; static int WorldRank;
int ProcessorCount(void) { return _Nprocessors; }; static int WorldSize;
static int Slave;
int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
static int Ranks (void);
static int Nodes (void);
static int Cores (void);
static int NodeRank (void);
static int CoreRank (void);
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Reduction // Reduction
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void GlobalSum(RealF &); void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N); void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &); void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N); void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &); void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &); void GlobalSum(uint64_t &);
void GlobalSum(ComplexF &c);
void GlobalSum(ComplexF &c) void GlobalSumVector(ComplexF *c,int N);
{ void GlobalSum(ComplexD &c);
GlobalSumVector((float *)&c,2); void GlobalSumVector(ComplexD *c,int N);
}
void GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
template<class obj> void GlobalSum(obj &o){ template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type; typedef typename obj::scalar_type scalar_type;
@ -112,6 +172,7 @@ class CartesianCommunicator {
scalar_type * ptr = (scalar_type *)& o; scalar_type * ptr = (scalar_type *)& o;
GlobalSumVector(ptr,words); GlobalSumVector(ptr,words);
} }
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way // Face exchange, buffer swap in translational invariant way
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
@ -133,8 +194,19 @@ class CartesianCommunicator {
void *recv, void *recv,
int recv_from_rank, int recv_from_rank,
int bytes); int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilBarrier(void);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Barrier // Barrier
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
@ -144,13 +216,12 @@ class CartesianCommunicator {
// Broadcast a buffer and composite larger // Broadcast a buffer and composite larger
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes); void Broadcast(int root,void* data, int bytes);
template<class obj> void Broadcast(int root,obj &data) template<class obj> void Broadcast(int root,obj &data)
{ {
Broadcast(root,(void *)&data,sizeof(data)); Broadcast(root,(void *)&data,sizeof(data));
}; };
static void BroadcastWorld(int root,void* data, int bytes);
}; };
} }

View File

@ -30,6 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Comm CartesianCommunicator::communicator_world;
// Should error check all MPI calls. // Should error check all MPI calls.
void CartesianCommunicator::Init(int *argc, char ***argv) { void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag; int flag;
@ -37,12 +43,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
if ( !flag ) { if ( !flag ) {
MPI_Init(argc,argv); MPI_Init(argc,argv);
} }
} MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
int Rank(void) { MPI_Comm_size(communicator_world,&WorldSize);
int pe; ShmRank=0;
MPI_Comm_rank(MPI_COMM_WORLD,&pe); ShmSize=1;
return pe; GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
} }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
@ -53,9 +62,8 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
_Nprocessors=1; _Nprocessors=1;
_processors = processors; _processors = processors;
_processor_coor.resize(_ndimension); _processor_coor.resize(_ndimension);
std::cout << processors << std::endl;
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator); MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor); MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
@ -68,7 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
assert(Size==_Nprocessors); assert(Size==_Nprocessors);
} }
void CartesianCommunicator::GlobalSum(uint32_t &u){ void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
@ -169,7 +176,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
int nreq=list.size(); int nreq=list.size();
std::vector<MPI_Status> status(nreq); std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0); assert(ierr==0);
} }
@ -188,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
communicator); communicator);
assert(ierr==0); assert(ierr==0);
} }
///////////////////////////////////////////////////////
// Should only be used prior to Grid Init finished.
// Check for this?
///////////////////////////////////////////////////////
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{ {
int ierr= MPI_Bcast(data, int ierr= MPI_Bcast(data,
bytes, bytes,
MPI_BYTE, MPI_BYTE,
root, root,
MPI_COMM_WORLD); communicator_world);
assert(ierr==0); assert(ierr==0);
} }

View File

@ -0,0 +1,574 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include "Grid.h"
#include <mpi.h>
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is set up once and is independent of the cartesian layout
//
// Out-of-class definitions of the static members used by the MPI-3 implementation.
// All except ShmSetup are filled in by CartesianCommunicator::Init().
///////////////////////////////////////////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmSetup = 0;                 // starts at 0; where it is updated is not visible in this part of the file
MPI_Comm CartesianCommunicator::communicator_world;      // duplicate of MPI_COMM_WORLD
MPI_Comm CartesianCommunicator::ShmComm;                 // result of splitting communicator_world with MPI_COMM_TYPE_SHARED
MPI_Win CartesianCommunicator::ShmWindow;                // window created by MPI_Win_allocate_shared on ShmComm
std::vector<int> CartesianCommunicator::GroupRanks;      // indexed by world rank: that rank's rank in ShmComm's group, or MPI_UNDEFINED
std::vector<int> CartesianCommunicator::MyGroup;         // sorted world ranks that belong to this rank's ShmComm group
std::vector<void *> CartesianCommunicator::ShmCommBufs;  // indexed by ShmComm rank: base address from MPI_Win_shared_query
// Base address of the calling rank's own entry in the ShmCommBufs table.
void *CartesianCommunicator::ShmBufferSelf(void)
{
  void *self = ShmCommBufs[ShmRank];
  return self;
}
// Base address of the buffer belonging to `rank` (an index into GroupRanks),
// or NULL when GroupRanks marks that rank as MPI_UNDEFINED.
void *CartesianCommunicator::ShmBuffer(int rank)
{
  const int gpeer = GroupRanks[rank];
  if (gpeer != MPI_UNDEFINED) {
    return ShmCommBufs[gpeer];
  }
  return NULL;
}
// Map a pointer inside this rank's buffer to the same byte offset inside the
// buffer of `rank`. Returns NULL when GroupRanks marks that rank as MPI_UNDEFINED.
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p)
{
  const int gpeer = GroupRanks[rank];
  if (gpeer == MPI_UNDEFINED) {
    return NULL;
  }

  const uint64_t self_base = (uint64_t)ShmCommBufs[ShmRank];
  const uint64_t peer_base = (uint64_t)ShmCommBufs[gpeer];

  // Offset of local_p from the start of our own buffer, re-applied to the peer's base
  const uint64_t offset = (uint64_t)local_p - self_base;
  return (void *)(peer_base + offset);
}
void CartesianCommunicator::Init(int *argc, char ***argv) {
int flag;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init(argc,argv);
}
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
MPI_Comm_rank(communicator_world,&WorldRank);
MPI_Comm_size(communicator_world,&WorldSize);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
GroupSize = WorldSize/ShmSize;
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (communicator_world, &WorldGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize);
GroupRanks.resize(WorldSize);
MyGroup.resize(ShmSize);
for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
int g=0;
for(int rank=0;rank<WorldSize;rank++){
if(GroupRanks[rank]!=MPI_UNDEFINED){
assert(g<ShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(GroupSize,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the rank of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
GroupRank=-1;
for(int g=0;g<GroupSize;g++){
if (myleader == leaders_group[g]){
GroupRank=g;
}
}
assert(GroupRank!=-1);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared window for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = 0;
ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow);
assert(ierr==0);
// KNL hack -- force to numa-domain 1 in flat
#if 0
//#include <numaif.h>
for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){
void *pages = (void *) ( page + ShmCommBuf );
int status;
int flags=MPOL_MF_MOVE_ALL;
int nodes=1; // numa domain == MCDRAM
unsigned long count=1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Plan: allocate a fixed SHM region. Scratch that is just used via some scheme during stencil comms, with no allocate free.
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBufs.resize(ShmSize);
for(int r=0;r<ShmSize;r++){
MPI_Aint sz;
int dsp_unit;
MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Verbose for now
//////////////////////////////////////////////////////////////////////////////////////////////////////////
if (WorldRank == 0){
std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected ";
std::cout<< WorldSize << " Ranks " ;
std::cout<< GroupSize << " Nodes " ;
std::cout<< ShmSize << " with ranks-per-node "<<std::endl;
std::cout<<GridLogMessage <<"Grid MPI-3 configuration: allocated shared memory region of size ";
std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl;
for(int g=0;g<GroupSize;g++){
std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl;
}
std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {";
for(int g=0;g<ShmSize;g++){
std::cout<<std::hex<<ShmCommBufs[g]<<std::dec;
if(g!=ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
}
for(int g=0;g<GroupSize;g++){
if ( (ShmRank == 0) && (GroupRank==g) ) std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {";
for(int r=0;r<ShmSize;r++){
if ( (ShmRank == 0) && (GroupRank==g) ) {
std::cout<<MyGroup[r];
if(r<ShmSize-1) std::cout<<",";
else std::cout<<"}"<<std::endl;
}
MPI_Barrier(communicator_world);
}
}
assert(ShmSetup==0); ShmSetup=1;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Want to implement some magic ... Group sub-cubes into those on same node
////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Find the two neighbours of this process along processor-grid dimension
// `dim`, with periodic wrap-around.
//   source : rank of the process whose coordinate is (ours + shift)
//   dest   : rank of the process whose coordinate is (ours - shift)
// Both results are passed through LexicographicToWorldRank before being
// returned. |shift| must be smaller than the grid extent in `dim`.
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  assert(std::abs(shift) <_processors[dim]);

  const int extent = _processors[dim];
  const int here   = _processor_coor[dim];

  std::vector<int> neighbour = _processor_coor;

  // Adding `extent` keeps the left operand of % non-negative for negative shifts.
  neighbour[dim] = (here + shift + extent)%extent;
  Lexicographic::IndexFromCoor(neighbour,source,_processors);
  source = LexicographicToWorldRank[source];

  neighbour[dim] = (here - shift + extent)%extent;
  Lexicographic::IndexFromCoor(neighbour,dest,_processors);
  dest = LexicographicToWorldRank[dest];
}
// Translate a processor-grid coordinate into a rank: the coordinate is first
// flattened to a lexicographic index over _processors, and that index is then
// looked up in LexicographicToWorldRank.
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  int lexico;
  Lexicographic::IndexFromCoor(coor,lexico,_processors);
  return LexicographicToWorldRank[lexico];
}
// Inverse of RankFromProcessorCoor: given a rank as stored in
// LexicographicToWorldRank, recover the processor-grid coordinate.
//
// The table maps lexicographic index -> rank, so it has to be searched for
// the entry holding `rank`; the position found is the lexicographic index
// that is then decoded into `coor`. (Decoding `rank` directly is only right
// when the table happens to be the identity.)
//
// Asserts if `rank` does not appear in the table.
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
  int lexico = -1;
  const int nranks = (int)LexicographicToWorldRank.size();
  for(int l=0;l<nranks;l++){
    if ( LexicographicToWorldRank[l] == rank ) {
      lexico = l;
      break;
    }
  }
  assert(lexico != -1);
  Lexicographic::CoorFromIndex(coor,lexico,_processors);
}
// Build the processor-grid view of the world communicator.
//
// `processors` gives the extent of the processor grid in each dimension; the
// product of its entries must equal WorldSize. The ShmSize ranks that share a
// node are laid out as a sub-block (ShmDims) of that grid, and the nodes form
// the coarser grid GroupDims = processors / ShmDims. Every rank derives its
// own grid coordinate from (GroupRank, ShmRank), and a sum-reduction over all
// ranks assembles the LexicographicToWorldRank table.
//
// Asserts if ShmSize is not a power of two in [1, 2^MAXLOG2RANKSPERNODE], if
// the node sub-block cannot be fitted into `processors`, or if the processor
// count does not match WorldSize.
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  int ierr;

  communicator=communicator_world;
  _ndimension = processors.size();

  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
  int log2size = -1;
  for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
    if ( (0x1<<i) == ShmSize ) {
      log2size = i;
      break;
    }
  }
  assert(log2size != -1);

  ////////////////////////////////////////////////////////////////
  // Identify subblock of ranks on node spreading across dims
  // in a maximally symmetrical way
  ////////////////////////////////////////////////////////////////
  int dim = 0;
  std::vector<int> WorldDims = processors;

  // assign (not resize) so every entry starts from 1 whatever the vector held before
  ShmDims.assign(_ndimension,1);
  GroupDims.resize(_ndimension);
  ShmCoor.resize(_ndimension);
  GroupCoor.resize(_ndimension);
  WorldCoor.resize(_ndimension);

  for(int l2=0;l2<log2size;l2++){
    // Skip dimensions that cannot take another factor of two. If every
    // dimension has been rejected the requested grid cannot host ShmSize
    // ranks per node: fail here instead of cycling forever.
    int rejected = 0;
    while ( WorldDims[dim] / ShmDims[dim] <= 1 ) {
      rejected++;
      assert(rejected < _ndimension);
      dim=(dim+1)%_ndimension;
    }
    ShmDims[dim]*=2;
    dim=(dim+1)%_ndimension;
  }

  ////////////////////////////////////////////////////////////////
  // Establish torus of processes and nodes with sub-blockings
  ////////////////////////////////////////////////////////////////
  for(int d=0;d<_ndimension;d++){
    GroupDims[d] = WorldDims[d]/ShmDims[d];
  }

  ////////////////////////////////////////////////////////////////
  // Check processor counts match
  ////////////////////////////////////////////////////////////////
  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);
  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }
  assert(WorldSize==_Nprocessors);

  ////////////////////////////////////////////////////////////////
  // Establish mapping between lexico physics coord and WorldRank
  ////////////////////////////////////////////////////////////////
  // The table is filled by a SUM reduction, so every slot must start at zero.
  LexicographicToWorldRank.assign(WorldSize,0);

  Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims);
  Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims);
  for(int d=0;d<_ndimension;d++){
    WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d];
  }
  _processor_coor = WorldCoor;

  int lexico;
  Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims);
  LexicographicToWorldRank[lexico]=WorldRank;

  // _processor is this process's rank in `communicator` (== communicator_world).
  // The send/recv routines compare it with world ranks, index GroupRanks with
  // it and use it as the MPI message tag, so it must be the world rank rather
  // than the lexicographic index.
  _processor = WorldRank;

  ///////////////////////////////////////////////////////////////////
  // global sum Lexico to World mapping
  ///////////////////////////////////////////////////////////////////
  ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator);
  assert(ierr==0);
}
// In-place sum of one 32-bit unsigned integer across all ranks of
// `communicator`; on return `u` holds the total. Asserts on MPI failure.
void CartesianCommunicator::GlobalSum(uint32_t &u)
{
  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(status==0);
}
// In-place sum of one 64-bit unsigned integer across all ranks of
// `communicator`; on return `u` holds the total. Asserts on MPI failure.
void CartesianCommunicator::GlobalSum(uint64_t &u)
{
  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(status==0);
}
// In-place sum of one single-precision value across all ranks of
// `communicator`; on return `f` holds the total. Asserts on MPI failure.
void CartesianCommunicator::GlobalSum(float &f)
{
  const int status = MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(status==0);
}
// Element-wise in-place sum of the N floats starting at `f` across all ranks
// of `communicator`. Asserts on MPI failure.
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  const int status = MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(status==0);
}
// In-place sum of one double-precision value across all ranks of
// `communicator`; on return `d` holds the total. Asserts on MPI failure.
void CartesianCommunicator::GlobalSum(double &d)
{
  const int status = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(status==0);
}
// Element-wise in-place sum of the N doubles starting at `d` across all ranks
// of `communicator`. Asserts on MPI failure.
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  const int status = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(status==0);
}
// Basic Halo comms primitive
// Blocking exchange: send `bytes` bytes from `xmit` to rank `dest` while
// receiving `bytes` bytes into `recv` from rank `from`. Implemented as the
// non-blocking Begin call followed immediately by Complete.
void CartesianCommunicator::SendToRecvFrom(void *xmit,
					   int dest,
					   void *recv,
					   int from,
					   int bytes)
{
  std::vector<CommsRequest_t> pending;
  SendToRecvFromBegin(pending,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(pending);
}
// One-directional blocking transfer of `bytes` bytes from rank `sender` to
// rank `receiver`. A process does something only if _processor matches one
// of the two ranks: the sender sends from `xmit`, the receiver receives into
// `recv`. The sender's rank doubles as the message tag.
void CartesianCommunicator::SendRecvPacket(void *xmit,
					   void *recv,
					   int sender,
					   int receiver,
					   int bytes)
{
  assert(sender != receiver);

  const int tag = sender;

  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }

  if ( _processor == receiver ) {
    MPI_Status stat;
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
}
// Basic Halo comms primitive
// Start a non-blocking exchange: post a send of `bytes` bytes from `xmit` to
// rank `dest` and a receive of `bytes` bytes into `recv` from rank `from`.
// The two request handles are appended to `list` (send first, then receive);
// the caller must later hand `list` to SendToRecvFromComplete.
//
// Tag convention: a message carries the sending process's _processor value,
// and the receive is posted with tag `from`, so the two only match when
// _processor is the sender's rank in `communicator`.
//
// The `#if 0` branch is a disabled experiment that copies the payload through
// the per-rank shared-memory buffers (ShmCommBufs) for peers that have an
// entry in GroupRanks; it is kept verbatim and never compiled.
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
						void *xmit,
						int dest,
						void *recv,
						int from,
						int bytes)
{
#if 0
  this->StencilBarrier();
  MPI_Request xrq;
  MPI_Request rrq;
  static int sequence;
  int ierr;
  int tag;
  int check;
  assert(dest != _processor);
  assert(from != _processor);
  int gdest = GroupRanks[dest];
  int gfrom = GroupRanks[from];
  int gme = GroupRanks[_processor];
  sequence++;
  char *from_ptr = (char *)ShmCommBufs[ShmRank];
  int small = (bytes<MAX_MPI_SHM_BYTES);
  typedef uint64_t T;
  int words = bytes/sizeof(T);
  assert(((size_t)bytes &(sizeof(T)-1))==0);
  assert(gme == ShmRank);
  if ( small && (gdest !=MPI_UNDEFINED) ) {
    char *to_ptr = (char *)ShmCommBufs[gdest];
    assert(gme != gdest);
    T *ip = (T *)xmit;
    T *op = (T *)to_ptr;
PARALLEL_FOR_LOOP
    for(int w=0;w<words;w++) {
      op[w]=ip[w];
    }
    bcopy(&_processor,&to_ptr[bytes],sizeof(_processor));
    bcopy(& sequence,&to_ptr[bytes+4],sizeof(sequence));
  } else {
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
  }
  this->StencilBarrier();
  if (small && (gfrom !=MPI_UNDEFINED) ) {
    T *ip = (T *)from_ptr;
    T *op = (T *)recv;
PARALLEL_FOR_LOOP
    for(int w=0;w<words;w++) {
      op[w]=ip[w];
    }
    bcopy(&from_ptr[bytes] ,&tag ,sizeof(tag));
    bcopy(&from_ptr[bytes+4],&check,sizeof(check));
    assert(check==sequence);
    assert(tag==from);
  } else {
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    assert(ierr==0);
    list.push_back(rrq);
  }
  this->StencilBarrier();
#else
  MPI_Request xrq;
  MPI_Request rrq;
  int ierr;
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
  // Both calls return 0 on success, so the OR is 0 only if neither failed.
  assert(ierr==0);
  list.push_back(xrq);
  list.push_back(rrq);
#endif
}
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
MPI_Request xrq;
MPI_Request rrq;
int ierr;
assert(dest != _processor);
assert(from != _processor);
int gdest = GroupRanks[dest];
int gfrom = GroupRanks[from];
int gme = GroupRanks[_processor];
assert(gme == ShmRank);
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
}
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
assert(ierr==0);
list.push_back(rrq);
}
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
SendToRecvFromComplete(list);
}
// Synchronise the ranks of ShmComm around accesses to ShmWindow: a window
// sync, then a barrier over ShmComm, then a second window sync. The order of
// the three calls is the point of this function -- do not reorder them.
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Win_sync (ShmWindow);
MPI_Barrier (ShmComm);
MPI_Win_sync (ShmWindow);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
}
// Barrier over all ranks of `communicator`. Asserts on MPI failure.
void CartesianCommunicator::Barrier(void)
{
  const int status = MPI_Barrier(communicator);
  assert(status==0);
}
// Broadcast `bytes` raw bytes at `data` from rank `root` to every rank of
// `communicator`. Asserts on MPI failure.
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator);
  assert(status==0);
}
// Broadcast `bytes` raw bytes at `data` from rank `root` to every rank of
// `communicator_world`. Asserts on MPI failure.
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  const int status = MPI_Bcast(data,bytes,MPI_BYTE,root,communicator_world);
  assert(status==0);
}
}

View File

@ -28,12 +28,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include "Grid.h" #include "Grid.h"
namespace Grid { namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::Init(int *argc, char *** arv) void CartesianCommunicator::Init(int *argc, char *** arv)
{ {
WorldRank = 0;
WorldSize = 1;
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
ShmInitGeneric();
} }
int Rank(void ){ return 0; };
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
_processors = processors; _processors = processors;
@ -89,30 +99,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
assert(0); assert(0);
} }
void CartesianCommunicator::Barrier(void) void CartesianCommunicator::Barrier(void){}
{ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
} void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ assert(0);}
{
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
source =0; source =0;
dest=0; dest=0;
} }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
return 0;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
}
} }

View File

@ -39,17 +39,22 @@ namespace Grid {
BACKTRACEFILE(); \ BACKTRACEFILE(); \
}\ }\
} }
int Rank(void) {
return shmem_my_pe();
} ///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
typedef struct HandShake_t { typedef struct HandShake_t {
uint64_t seq_local; uint64_t seq_local;
uint64_t seq_remote; uint64_t seq_remote;
} HandShake; } HandShake;
static Vector< HandShake > XConnections; static Vector< HandShake > XConnections;
static Vector< HandShake > RConnections; static Vector< HandShake > RConnections;
void CartesianCommunicator::Init(int *argc, char ***argv) { void CartesianCommunicator::Init(int *argc, char ***argv) {
shmem_init(); shmem_init();
XConnections.resize(shmem_n_pes()); XConnections.resize(shmem_n_pes());
@ -60,8 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
RConnections[pe].seq_local = 0; RConnections[pe].seq_local = 0;
RConnections[pe].seq_remote= 0; RConnections[pe].seq_remote= 0;
} }
WorldSize = shmem_n_pes();
WorldRank = shmem_my_pe();
ShmRank=0;
ShmSize=1;
GroupRank=WorldRank;
GroupSize=WorldSize;
Slave =0;
shmem_barrier_all(); shmem_barrier_all();
ShmInitGeneric();
} }
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{ {
_ndimension = processors.size(); _ndimension = processors.size();
@ -230,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
if ( _processor == sender ) { if ( _processor == sender ) {
printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver);
// Check he has posted a receive // Check he has posted a receive
while(SendSeq->seq_remote == SendSeq->seq_local); while(SendSeq->seq_remote == SendSeq->seq_local);
printf("Sender receive %d posted\n",sender,receiver);
// Advance our send count // Advance our send count
seq = ++(SendSeq->seq_local); seq = ++(SendSeq->seq_local);
@ -244,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit,
shmem_putmem(recv,xmit,bytes,receiver); shmem_putmem(recv,xmit,bytes,receiver);
shmem_fence(); shmem_fence();
printf("Sender sent payload %d\n",seq);
//Notify him we're done //Notify him we're done
shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver);
shmem_fence(); shmem_fence();
printf("Sender ringing door bell %d\n",seq);
} }
if ( _processor == receiver ) { if ( _processor == receiver ) {
printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver);
// Post a receive // Post a receive
seq = ++(RecvSeq->seq_local); seq = ++(RecvSeq->seq_local);
shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender);
printf("Receiver Opening letter box %d\n",seq);
// Now wait until he has advanced our reception counter // Now wait until he has advanced our reception counter
while(RecvSeq->seq_remote != RecvSeq->seq_local); while(RecvSeq->seq_remote != RecvSeq->seq_local);
printf("Receiver Got the mail %d\n",seq);
} }
} }

View File

@ -1,3 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -44,7 +45,7 @@ public:
// Gather for when there is no need to SIMD split with compression // Gather for when there is no need to SIMD split with compression
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor> void template<class vobj,class cobj,class compressor> void
Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];
@ -56,6 +57,7 @@ Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<
int e1=rhs._grid->_slice_nblock[dimension]; int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension]; int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension]; int stride=rhs._grid->_slice_stride[dimension];
if ( cbmask == 0x3 ) { if ( cbmask == 0x3 ) {
PARALLEL_NESTED_LOOP2 PARALLEL_NESTED_LOOP2
@ -68,15 +70,20 @@ PARALLEL_NESTED_LOOP2
} }
} else { } else {
int bo=0; int bo=0;
std::vector<std::pair<int,int> > table;
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o = n*stride; int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
if ( ocb &cbmask ) { if ( ocb &cbmask ) {
buffer[off+bo++]=compress(rhs._odata[so+o+b]); table.push_back(std::pair<int,int> (bo++,o+b));
} }
} }
} }
PARALLEL_FOR_LOOP
for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=compress(rhs._odata[so+table[i].second]);
}
} }
} }
@ -107,6 +114,7 @@ PARALLEL_NESTED_LOOP2
int o = n*n1; int o = n*n1;
int offset = b+n*n2; int offset = b+n*n2;
cobj temp =compress(rhs._odata[so+o+b]); cobj temp =compress(rhs._odata[so+o+b]);
extract<cobj>(temp,pointers,offset); extract<cobj>(temp,pointers,offset);
} }
@ -114,6 +122,7 @@ PARALLEL_NESTED_LOOP2
} else { } else {
assert(0); //Fixme think this is buggy assert(0); //Fixme think this is buggy
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){ for(int b=0;b<e2;b++){
int o=n*rhs._grid->_slice_stride[dimension]; int o=n*rhs._grid->_slice_stride[dimension];
@ -132,7 +141,7 @@ PARALLEL_NESTED_LOOP2
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask) template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
SimpleCompressor<vobj> dontcompress; SimpleCompressor<vobj> dontcompress;
Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress);
@ -150,7 +159,7 @@ template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vec
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split // Scatter for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs._grid->_rdimensions[dimension]; int rd = rhs._grid->_rdimensions[dimension];

View File

@ -119,8 +119,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); commVector<vobj> send_buf(buffer_size);
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size); commVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
@ -191,8 +191,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type); int words = sizeof(vobj)/sizeof(vector_type);
std::vector<Vector<scalar_object> > send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) ); std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<Vector<scalar_object> > recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) ); std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);

412
lib/fftw/fftw3.h Normal file
View File

@ -0,0 +1,412 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* The following statement of license applies *only* to this header file,
* and *not* to the other files distributed with FFTW or derived therefrom:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/***************************** NOTE TO USERS *********************************
*
* THIS IS A HEADER FILE, NOT A MANUAL
*
* If you want to know how to use FFTW, please read the manual,
* online at http://www.fftw.org/doc/ and also included with FFTW.
* For a quick start, see the manual's tutorial section.
*
* (Reading header files to learn how to use a library is a habit
* stemming from code lacking a proper manual. Arguably, it's a
* *bad* habit in most cases, because header files can contain
* interfaces that are not part of the public, stable API.)
*
****************************************************************************/
#ifndef FFTW3_H
#define FFTW3_H
#include <stdio.h>
#ifdef __cplusplus
extern "C"
{
#endif /* __cplusplus */
/* If <complex.h> is included, use the C99 complex type. Otherwise
define a type bit-compatible with C99 complex */
#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
# define FFTW_DEFINE_COMPLEX(R, C) typedef R _Complex C
#else
# define FFTW_DEFINE_COMPLEX(R, C) typedef R C[2]
#endif
#define FFTW_CONCAT(prefix, name) prefix ## name
#define FFTW_MANGLE_DOUBLE(name) FFTW_CONCAT(fftw_, name)
#define FFTW_MANGLE_FLOAT(name) FFTW_CONCAT(fftwf_, name)
#define FFTW_MANGLE_LONG_DOUBLE(name) FFTW_CONCAT(fftwl_, name)
#define FFTW_MANGLE_QUAD(name) FFTW_CONCAT(fftwq_, name)
/* IMPORTANT: for Windows compilers, you should add a line
#define FFTW_DLL
here and in kernel/ifftw.h if you are compiling/using FFTW as a
DLL, in order to do the proper importing/exporting, or
alternatively compile with -DFFTW_DLL or the equivalent
command-line flag. This is not necessary under MinGW/Cygwin, where
libtool does the imports/exports automatically. */
#if defined(FFTW_DLL) && (defined(_WIN32) || defined(__WIN32__))
/* annoying Windows syntax for shared-library declarations */
# if defined(COMPILING_FFTW) /* defined in api.h when compiling FFTW */
# define FFTW_EXTERN extern __declspec(dllexport)
# else /* user is calling FFTW; import symbol */
# define FFTW_EXTERN extern __declspec(dllimport)
# endif
#else
# define FFTW_EXTERN extern
#endif
/* Tags selecting the kind of a real-to-real (r2r) transform. The numeric
   values are fixed explicitly (0..10). As the name says, do not use this
   enum type directly: FFTW_DEFINE_API typedefs it as X(r2r_kind) for each
   precision, and that typedef is what the plan_r2r* prototypes take. */
enum fftw_r2r_kind_do_not_use_me {
FFTW_R2HC=0, FFTW_HC2R=1, FFTW_DHT=2,
FFTW_REDFT00=3, FFTW_REDFT01=4, FFTW_REDFT10=5, FFTW_REDFT11=6,
FFTW_RODFT00=7, FFTW_RODFT01=8, FFTW_RODFT10=9, FFTW_RODFT11=10
};
/* Size and strides of one dimension, with int fields. Not meant to be named
   directly: FFTW_DEFINE_API typedefs it as X(iodim), the type taken by the
   plan_guru_* prototypes. */
struct fftw_iodim_do_not_use_me {
int n; /* dimension size */
int is; /* input stride */
int os; /* output stride */
};
#include <stddef.h> /* for ptrdiff_t */
/* Same layout as the int-based iodim struct but with ptrdiff_t fields. Not
   meant to be named directly: FFTW_DEFINE_API typedefs it as X(iodim64), the
   type taken by the plan_guru64_* prototypes. */
struct fftw_iodim64_do_not_use_me {
ptrdiff_t n; /* dimension size */
ptrdiff_t is; /* input stride */
ptrdiff_t os; /* output stride */
};
/* Callback pointer types: one takes a character plus an opaque void* and
   returns nothing, the other takes an opaque void* and returns an int.
   FFTW_DEFINE_API re-exports them per precision as X(write_char_func) and
   X(read_char_func); use those names rather than these. */
typedef void (*fftw_write_char_func_do_not_use_me)(char c, void *);
typedef int (*fftw_read_char_func_do_not_use_me)(void *);
/*
huge second-order macro that defines prototypes for all API
functions. We expand this macro for each supported precision
X: name-mangling macro
R: real data type
C: complex data type
*/
#define FFTW_DEFINE_API(X, R, C) \
\
FFTW_DEFINE_COMPLEX(R, C); \
\
typedef struct X(plan_s) *X(plan); \
\
typedef struct fftw_iodim_do_not_use_me X(iodim); \
typedef struct fftw_iodim64_do_not_use_me X(iodim64); \
\
typedef enum fftw_r2r_kind_do_not_use_me X(r2r_kind); \
\
typedef fftw_write_char_func_do_not_use_me X(write_char_func); \
typedef fftw_read_char_func_do_not_use_me X(read_char_func); \
\
FFTW_EXTERN void X(execute)(const X(plan) p); \
\
FFTW_EXTERN X(plan) X(plan_dft)(int rank, const int *n, \
C *in, C *out, int sign, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_1d)(int n, C *in, C *out, int sign, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_2d)(int n0, int n1, \
C *in, C *out, int sign, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_3d)(int n0, int n1, int n2, \
C *in, C *out, int sign, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_many_dft)(int rank, const int *n, \
int howmany, \
C *in, const int *inembed, \
int istride, int idist, \
C *out, const int *onembed, \
int ostride, int odist, \
int sign, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru_dft)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
C *in, C *out, \
int sign, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru_split_dft)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *ri, R *ii, R *ro, R *io, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru64_dft)(int rank, \
const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
C *in, C *out, \
int sign, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru64_split_dft)(int rank, \
const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *ri, R *ii, R *ro, R *io, \
unsigned flags); \
\
FFTW_EXTERN void X(execute_dft)(const X(plan) p, C *in, C *out); \
FFTW_EXTERN void X(execute_split_dft)(const X(plan) p, R *ri, R *ii, \
R *ro, R *io); \
\
FFTW_EXTERN X(plan) X(plan_many_dft_r2c)(int rank, const int *n, \
int howmany, \
R *in, const int *inembed, \
int istride, int idist, \
C *out, const int *onembed, \
int ostride, int odist, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_r2c)(int rank, const int *n, \
R *in, C *out, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_r2c_1d)(int n,R *in,C *out,unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_r2c_2d)(int n0, int n1, \
R *in, C *out, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_r2c_3d)(int n0, int n1, \
int n2, \
R *in, C *out, unsigned flags); \
\
\
FFTW_EXTERN X(plan) X(plan_many_dft_c2r)(int rank, const int *n, \
int howmany, \
C *in, const int *inembed, \
int istride, int idist, \
R *out, const int *onembed, \
int ostride, int odist, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_c2r)(int rank, const int *n, \
C *in, R *out, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_dft_c2r_1d)(int n,C *in,R *out,unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_c2r_2d)(int n0, int n1, \
C *in, R *out, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_dft_c2r_3d)(int n0, int n1, \
int n2, \
C *in, R *out, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru_dft_r2c)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *in, C *out, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru_dft_c2r)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
C *in, R *out, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru_split_dft_r2c)( \
int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *in, R *ro, R *io, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru_split_dft_c2r)( \
int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *ri, R *ii, R *out, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru64_dft_r2c)(int rank, \
const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *in, C *out, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru64_dft_c2r)(int rank, \
const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
C *in, R *out, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru64_split_dft_r2c)( \
int rank, const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *in, R *ro, R *io, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_guru64_split_dft_c2r)( \
int rank, const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *ri, R *ii, R *out, \
unsigned flags); \
\
FFTW_EXTERN void X(execute_dft_r2c)(const X(plan) p, R *in, C *out); \
FFTW_EXTERN void X(execute_dft_c2r)(const X(plan) p, C *in, R *out); \
\
FFTW_EXTERN void X(execute_split_dft_r2c)(const X(plan) p, \
R *in, R *ro, R *io); \
FFTW_EXTERN void X(execute_split_dft_c2r)(const X(plan) p, \
R *ri, R *ii, R *out); \
\
FFTW_EXTERN X(plan) X(plan_many_r2r)(int rank, const int *n, \
int howmany, \
R *in, const int *inembed, \
int istride, int idist, \
R *out, const int *onembed, \
int ostride, int odist, \
const X(r2r_kind) *kind, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_r2r)(int rank, const int *n, R *in, R *out, \
const X(r2r_kind) *kind, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_r2r_1d)(int n, R *in, R *out, \
X(r2r_kind) kind, unsigned flags); \
FFTW_EXTERN X(plan) X(plan_r2r_2d)(int n0, int n1, R *in, R *out, \
X(r2r_kind) kind0, X(r2r_kind) kind1, \
unsigned flags); \
FFTW_EXTERN X(plan) X(plan_r2r_3d)(int n0, int n1, int n2, \
R *in, R *out, X(r2r_kind) kind0, \
X(r2r_kind) kind1, X(r2r_kind) kind2, \
unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru_r2r)(int rank, const X(iodim) *dims, \
int howmany_rank, \
const X(iodim) *howmany_dims, \
R *in, R *out, \
const X(r2r_kind) *kind, unsigned flags); \
\
FFTW_EXTERN X(plan) X(plan_guru64_r2r)(int rank, const X(iodim64) *dims, \
int howmany_rank, \
const X(iodim64) *howmany_dims, \
R *in, R *out, \
const X(r2r_kind) *kind, unsigned flags); \
\
FFTW_EXTERN void X(execute_r2r)(const X(plan) p, R *in, R *out); \
\
FFTW_EXTERN void X(destroy_plan)(X(plan) p); \
FFTW_EXTERN void X(forget_wisdom)(void); \
FFTW_EXTERN void X(cleanup)(void); \
\
FFTW_EXTERN void X(set_timelimit)(double t); \
\
FFTW_EXTERN void X(plan_with_nthreads)(int nthreads); \
FFTW_EXTERN int X(init_threads)(void); \
FFTW_EXTERN void X(cleanup_threads)(void); \
\
FFTW_EXTERN int X(export_wisdom_to_filename)(const char *filename); \
FFTW_EXTERN void X(export_wisdom_to_file)(FILE *output_file); \
FFTW_EXTERN char *X(export_wisdom_to_string)(void); \
FFTW_EXTERN void X(export_wisdom)(X(write_char_func) write_char, \
void *data); \
FFTW_EXTERN int X(import_system_wisdom)(void); \
FFTW_EXTERN int X(import_wisdom_from_filename)(const char *filename); \
FFTW_EXTERN int X(import_wisdom_from_file)(FILE *input_file); \
FFTW_EXTERN int X(import_wisdom_from_string)(const char *input_string); \
FFTW_EXTERN int X(import_wisdom)(X(read_char_func) read_char, void *data); \
\
FFTW_EXTERN void X(fprint_plan)(const X(plan) p, FILE *output_file); \
FFTW_EXTERN void X(print_plan)(const X(plan) p); \
FFTW_EXTERN char *X(sprint_plan)(const X(plan) p); \
\
FFTW_EXTERN void *X(malloc)(size_t n); \
FFTW_EXTERN R *X(alloc_real)(size_t n); \
FFTW_EXTERN C *X(alloc_complex)(size_t n); \
FFTW_EXTERN void X(free)(void *p); \
\
FFTW_EXTERN void X(flops)(const X(plan) p, \
double *add, double *mul, double *fmas); \
FFTW_EXTERN double X(estimate_cost)(const X(plan) p); \
FFTW_EXTERN double X(cost)(const X(plan) p); \
\
FFTW_EXTERN int X(alignment_of)(R *p); \
FFTW_EXTERN const char X(version)[]; \
FFTW_EXTERN const char X(cc)[]; \
FFTW_EXTERN const char X(codelet_optim)[];
/* end of FFTW_DEFINE_API macro */
/* Expand the API declaration macro once per supported precision.
   Each expansion is given a name-mangling macro, the real scalar type and
   the matching complex type; these are presumably bound to the X, R and C
   placeholders used throughout the macro body above (the macro's parameter
   list is not shown here -- confirm against its #define line). */
FFTW_DEFINE_API(FFTW_MANGLE_DOUBLE, double, fftw_complex)
FFTW_DEFINE_API(FFTW_MANGLE_FLOAT, float, fftwf_complex)
FFTW_DEFINE_API(FFTW_MANGLE_LONG_DOUBLE, long double, fftwl_complex)
/* __float128 (quad precision) is a gcc extension on i386, x86_64, and ia64
for gcc >= 4.6 (compiled in FFTW with --enable-quad-precision) */
/* The guard below therefore has three parts: a gcc version check (>= 4.6),
   an exclusion of compilers that define __ICC or __INTEL_COMPILER, and a
   restriction to the three architectures named above. When it fails, no
   quad-precision declarations are emitted at all. */
#if (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) \
&& !(defined(__ICC) || defined(__INTEL_COMPILER)) \
&& (defined(__i386__) || defined(__x86_64__) || defined(__ia64__))
/* NOTE(review): the inner test presumably detects that the C99 complex
   header was included before this one (it checks the _Complex_I, complex
   and I macros) and that the user did not opt out via FFTW_NO_Complex --
   confirm against the definition of FFTW_DEFINE_COMPLEX earlier in the
   file. */
# if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined(complex) && defined(I)
/* note: __float128 is a typedef, which is not supported with the _Complex
keyword in gcc, so instead we use this ugly __attribute__ version.
However, we can't simply pass the __attribute__ version to
FFTW_DEFINE_API because the __attribute__ confuses gcc in pointer
types. Hence redefining FFTW_DEFINE_COMPLEX. Ugh. */
/* The replacement ignores its R argument and always typedefs C as a complex
   type declared with mode(TC). It is not restored afterwards; the expansion
   below is the last use of FFTW_DEFINE_API in this header. */
# undef FFTW_DEFINE_COMPLEX
# define FFTW_DEFINE_COMPLEX(R, C) typedef _Complex float __attribute__((mode(TC))) C
# endif
/* Declare the quad-precision API, with __float128 as the real type and
   fftwq_complex as the complex type. */
FFTW_DEFINE_API(FFTW_MANGLE_QUAD, __float128, fftwq_complex)
#endif
/* Transform direction constants.
   NOTE(review): presumably the values passed as the `sign` argument of the
   complex DFT planners -- confirm against the FFTW manual. */
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
/* NOTE(review): presumably the value given to set_timelimit(double t) to
   request no planning time limit -- confirm against the FFTW manual. */
#define FFTW_NO_TIMELIMIT (-1.0)
/* documented flags */
/* Every flag below is an unsigned value with at most one bit set, so flags
   can be OR-ed together and passed as the `unsigned flags` parameter of the
   planner declarations above. FFTW_MEASURE is 0U, i.e. the value with no
   bits set. */
#define FFTW_MEASURE (0U)
#define FFTW_DESTROY_INPUT (1U << 0)
#define FFTW_UNALIGNED (1U << 1)
#define FFTW_CONSERVE_MEMORY (1U << 2)
#define FFTW_EXHAUSTIVE (1U << 3) /* NO_EXHAUSTIVE is default */
#define FFTW_PRESERVE_INPUT (1U << 4) /* cancels FFTW_DESTROY_INPUT */
#define FFTW_PATIENT (1U << 5) /* IMPATIENT is default */
#define FFTW_ESTIMATE (1U << 6)
/* Bit 21 is used here because bits 7 through 20 are taken by the
   undocumented flags that follow. */
#define FFTW_WISDOM_ONLY (1U << 21)
/* undocumented beyond-guru flags */
/* These occupy the contiguous bit range 7..20; their meaning is not
   described in this header. */
#define FFTW_ESTIMATE_PATIENT (1U << 7)
#define FFTW_BELIEVE_PCOST (1U << 8)
#define FFTW_NO_DFT_R2HC (1U << 9)
#define FFTW_NO_NONTHREADED (1U << 10)
#define FFTW_NO_BUFFERING (1U << 11)
#define FFTW_NO_INDIRECT_OP (1U << 12)
#define FFTW_ALLOW_LARGE_GENERIC (1U << 13) /* NO_LARGE_GENERIC is default */
#define FFTW_NO_RANK_SPLITS (1U << 14)
#define FFTW_NO_VRANK_SPLITS (1U << 15)
#define FFTW_NO_VRECURSE (1U << 16)
#define FFTW_NO_SIMD (1U << 17)
#define FFTW_NO_SLOW (1U << 18)
#define FFTW_NO_FIXED_RADIX_LARGE_N (1U << 19)
#define FFTW_ALLOW_PRUNING (1U << 20)
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */
#endif /* FFTW3_H */

View File

@ -24,16 +24,17 @@ Author: neo <cossu@post.kek.jp>
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_ET_H #ifndef GRID_LATTICE_ET_H
#define GRID_LATTICE_ET_H #define GRID_LATTICE_ET_H
#include <iostream> #include <iostream>
#include <vector>
#include <tuple> #include <tuple>
#include <typeinfo> #include <typeinfo>
#include <vector>
namespace Grid { namespace Grid {
@ -41,8 +42,8 @@ namespace Grid {
// Predicated where support // Predicated where support
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
template <class iobj, class vobj, class robj> template <class iobj, class vobj, class robj>
inline vobj predicatedWhere(const iobj &predicate,const vobj &iftrue,const robj &iffalse) { inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
const robj &iffalse) {
typename std::remove_const<vobj>::type ret; typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
@ -75,50 +76,69 @@ namespace Grid {
// from tuple is hideous; C++14 introduces std::make_index_sequence for this // from tuple is hideous; C++14 introduces std::make_index_sequence for this
//////////////////////////////////////////// ////////////////////////////////////////////
// leaf eval of lattice ; should enable if protect using traits // leaf eval of lattice ; should enable if protect using traits
template <typename T> using is_lattice = std::is_base_of<LatticeBase,T >; template <typename T>
using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T>
using is_lattice_expr = std::is_base_of<LatticeExpressionBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >; template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
//Specialization of getVectorType for lattices
template<typename T>
struct getVectorType<Lattice<T> >{
typedef typename Lattice<T>::vector_object type;
};
template<class sobj> template<class sobj>
inline sobj eval(const unsigned int ss, const sobj &arg) inline sobj eval(const unsigned int ss, const sobj &arg)
{ {
return arg; return arg;
} }
template <class lobj> template <class lobj>
inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) inline const lobj &eval(const unsigned int ss, const Lattice<lobj> &arg) {
{
return arg._odata[ss]; return arg._odata[ss];
} }
// handle nodes in syntax tree // handle nodes in syntax tree
template <typename Op, typename T1> template <typename Op, typename T1>
auto inline eval(const unsigned int ss, const LatticeUnaryExpression<Op,T1 > &expr) // eval one operand auto inline eval(
-> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)))) const unsigned int ss,
{ const LatticeUnaryExpression<Op, T1> &expr) // eval one operand
-> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second))); return expr.first.func(eval(ss, std::get<0>(expr.second)));
} }
template <typename Op, typename T1, typename T2> template <typename Op, typename T1, typename T2>
auto inline eval(const unsigned int ss, const LatticeBinaryExpression<Op,T1,T2> &expr) // eval two operands auto inline eval(
-> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)))) const unsigned int ss,
{ const LatticeBinaryExpression<Op, T1, T2> &expr) // eval two operands
return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second))); -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)));
} }
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename T1, typename T2, typename T3>
auto inline eval(const unsigned int ss, const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) // eval three operands auto inline eval(const unsigned int ss,
-> decltype(expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second)))) const LatticeTrinaryExpression<Op, T1, T2, T3>
{ &expr) // eval three operands
return expr.first.func(eval(ss,std::get<0>(expr.second)),eval(ss,std::get<1>(expr.second)),eval(ss,std::get<2>(expr.second)) ); -> decltype(expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)))) {
return expr.first.func(eval(ss, std::get<0>(expr.second)),
eval(ss, std::get<1>(expr.second)),
eval(ss, std::get<2>(expr.second)));
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Obtain the grid from an expression, ensuring conformable. This must follow a tree recursion // Obtain the grid from an expression, ensuring conformable. This must follow a
// tree recursion
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr > template <class T1,
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
{ {
if (grid) { if (grid) {
@ -126,35 +146,37 @@ inline void GridFromExpression(GridBase * &grid,const T1& lat) // Lattice leaf
} }
grid = lat._grid; grid = lat._grid;
} }
template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr > template <class T1,
inline void GridFromExpression(GridBase * &grid,const T1& notlat) // non-lattice leaf typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
{ inline void GridFromExpression(GridBase *&grid,
} const T1 &notlat) // non-lattice leaf
{}
template <typename Op, typename T1> template <typename Op, typename T1>
inline void GridFromExpression(GridBase * &grid,const LatticeUnaryExpression<Op,T1 > &expr) inline void GridFromExpression(GridBase *&grid,
{ const LatticeUnaryExpression<Op, T1> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse GridFromExpression(grid, std::get<0>(expr.second)); // recurse
} }
template <typename Op, typename T1, typename T2> template <typename Op, typename T1, typename T2>
inline void GridFromExpression(GridBase * &grid,const LatticeBinaryExpression<Op,T1,T2> &expr) inline void GridFromExpression(
{ GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, std::get<1>(expr.second)); GridFromExpression(grid, std::get<1>(expr.second));
} }
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename T1, typename T2, typename T3>
inline void GridFromExpression( GridBase * &grid,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) inline void GridFromExpression(
{ GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
GridFromExpression(grid, std::get<0>(expr.second)); // recurse GridFromExpression(grid, std::get<0>(expr.second)); // recurse
GridFromExpression(grid, std::get<1>(expr.second)); GridFromExpression(grid, std::get<1>(expr.second));
GridFromExpression(grid, std::get<2>(expr.second)); GridFromExpression(grid, std::get<2>(expr.second));
} }
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
// Obtain the CB from an expression, ensuring conformable. This must follow a tree recursion // Obtain the CB from an expression, ensuring conformable. This must follow a
// tree recursion
////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////
template<class T1, typename std::enable_if<is_lattice<T1>::value, T1>::type * =nullptr > template <class T1,
typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{ {
if ((cb == Odd) || (cb == Even)) { if ((cb == Odd) || (cb == Even)) {
@ -163,28 +185,29 @@ inline void CBFromExpression(int &cb,const T1& lat) // Lattice leaf
cb = lat.checkerboard; cb = lat.checkerboard;
// std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl; // std::cout<<GridLogMessage<<"Lattice leaf cb "<<cb<<std::endl;
} }
template<class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr > template <class T1,
typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
{ {
// std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl; // std::cout<<GridLogMessage<<"Non lattice leaf cb"<<cb<<std::endl;
} }
template <typename Op, typename T1> template <typename Op, typename T1>
inline void CBFromExpression(int &cb,const LatticeUnaryExpression<Op,T1 > &expr) inline void CBFromExpression(int &cb,
{ const LatticeUnaryExpression<Op, T1> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse CBFromExpression(cb, std::get<0>(expr.second)); // recurse
// std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl; // std::cout<<GridLogMessage<<"Unary node cb "<<cb<<std::endl;
} }
template <typename Op, typename T1, typename T2> template <typename Op, typename T1, typename T2>
inline void CBFromExpression(int &cb,const LatticeBinaryExpression<Op,T1,T2> &expr) inline void CBFromExpression(int &cb,
{ const LatticeBinaryExpression<Op, T1, T2> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse CBFromExpression(cb, std::get<0>(expr.second)); // recurse
CBFromExpression(cb, std::get<1>(expr.second)); CBFromExpression(cb, std::get<1>(expr.second));
// std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl; // std::cout<<GridLogMessage<<"Binary node cb "<<cb<<std::endl;
} }
template <typename Op, typename T1, typename T2, typename T3> template <typename Op, typename T1, typename T2, typename T3>
inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3 > &expr) inline void CBFromExpression(
{ int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) {
CBFromExpression(cb, std::get<0>(expr.second)); // recurse CBFromExpression(cb, std::get<0>(expr.second)); // recurse
CBFromExpression(cb, std::get<1>(expr.second)); CBFromExpression(cb, std::get<1>(expr.second));
CBFromExpression(cb, std::get<2>(expr.second)); CBFromExpression(cb, std::get<2>(expr.second));
@ -195,8 +218,8 @@ inline void CBFromExpression( int &cb,const LatticeTrinaryExpression<Op,T1,T2,T3
// Unary operators and funcs // Unary operators and funcs
//////////////////////////////////////////// ////////////////////////////////////////////
#define GridUnopClass(name, ret) \ #define GridUnopClass(name, ret) \
template <class arg> struct name\ template <class arg> \
{\ struct name { \
static auto inline func(const arg a) -> decltype(ret) { return ret; } \ static auto inline func(const arg a) -> decltype(ret) { return ret; } \
}; };
@ -212,11 +235,15 @@ GridUnopClass(UnaryReal,real(a));
GridUnopClass(UnaryImag, imag(a)); GridUnopClass(UnaryImag, imag(a));
GridUnopClass(UnaryToReal, toReal(a)); GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a)); GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a)); GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a)); GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a)); GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a)); GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a)); GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a));
GridUnopClass(UnaryAcos, acos(a));
GridUnopClass(UnaryLog, log(a)); GridUnopClass(UnaryLog, log(a));
GridUnopClass(UnaryExp, exp(a)); GridUnopClass(UnaryExp, exp(a));
@ -225,10 +252,9 @@ GridUnopClass(UnaryExp,exp(a));
//////////////////////////////////////////// ////////////////////////////////////////////
#define GridBinOpClass(name, combination) \ #define GridBinOpClass(name, combination) \
template <class left, class right> \ template <class left, class right> \
struct name\ struct name { \
{\ static auto inline func(const left &lhs, const right &rhs) \
static auto inline func(const left &lhs,const right &rhs)-> decltype(combination) const \ -> decltype(combination) const { \
{\
return combination; \ return combination; \
} \ } \
} }
@ -246,17 +272,18 @@ GridBinOpClass(BinaryOrOr ,lhs||rhs);
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
#define GridTrinOpClass(name, combination) \ #define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \ template <class predicate, class left, class right> \
struct name\ struct name { \
{\ static auto inline func(const predicate &pred, const left &lhs, \
static auto inline func(const predicate &pred,const left &lhs,const right &rhs)-> decltype(combination) const \ const right &rhs) -> decltype(combination) const { \
{\
return combination; \ return combination; \
} \ } \
} }
GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \ GridTrinOpClass(
typename std::remove_reference<left>::type, \ TrinaryWhere,
typename std::remove_reference<right>::type> (pred,lhs,rhs))); (predicatedWhere<predicate, typename std::remove_reference<left>::type,
typename std::remove_reference<right>::type>(pred, lhs,
rhs)));
//////////////////////////////////////////// ////////////////////////////////////////////
// Operator syntactical glue // Operator syntactical glue
@ -264,49 +291,66 @@ GridTrinOpClass(TrinaryWhere,(predicatedWhere<predicate, \
#define GRID_UNOP(name) name<decltype(eval(0, arg))> #define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))> #define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))> #define GRID_TRINOP(name) \
name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \ #define GRID_DEF_UNOP(op, name) \
template <typename T1, \ template <typename T1, \
typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr> inline auto op(const T1 &arg) \ typename std::enable_if<is_lattice<T1>::value || \
-> decltype(LatticeUnaryExpression<GRID_UNOP(name),const T1&>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg)))) \ is_lattice_expr<T1>::value, \
{ return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>(std::make_pair(GRID_UNOP(name)(),std::forward_as_tuple(arg))); } T1>::type * = nullptr> \
inline auto op(const T1 &arg) \
->decltype(LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg)))) { \
return LatticeUnaryExpression<GRID_UNOP(name), const T1 &>( \
std::make_pair(GRID_UNOP(name)(), std::forward_as_tuple(arg))); \
}
#define GRID_BINOP_LEFT(op, name) \ #define GRID_BINOP_LEFT(op, name) \
template <typename T1, typename T2, \ template <typename T1, typename T2, \
typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value, T1>::type* = nullptr>\ typename std::enable_if<is_lattice<T1>::value || \
is_lattice_expr<T1>::value, \
T1>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \ inline auto op(const T1 &lhs, const T2 &rhs) \
-> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\ ->decltype( \
std::forward_as_tuple(lhs, rhs)))) \ LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
{\ std::make_pair(GRID_BINOP(name)(), \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\ std::forward_as_tuple(lhs, rhs)))) { \
std::forward_as_tuple(lhs, rhs))); \ return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
} }
#define GRID_BINOP_RIGHT(op, name) \ #define GRID_BINOP_RIGHT(op, name) \
template <typename T1, typename T2, \ template <typename T1, typename T2, \
typename std::enable_if<!is_lattice<T1>::value && !is_lattice_expr<T1>::value, T1>::type* = nullptr,\ typename std::enable_if<!is_lattice<T1>::value && \
typename std::enable_if< is_lattice<T2>::value || is_lattice_expr<T2>::value, T2>::type* = nullptr> \ !is_lattice_expr<T1>::value, \
T1>::type * = nullptr, \
typename std::enable_if<is_lattice<T2>::value || \
is_lattice_expr<T2>::value, \
T2>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \ inline auto op(const T1 &lhs, const T2 &rhs) \
-> decltype(LatticeBinaryExpression<GRID_BINOP(name),const T1&,const T2 &>(std::make_pair(GRID_BINOP(name)(),\ ->decltype( \
std::forward_as_tuple(lhs, rhs)))) \ LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
{\ std::make_pair(GRID_BINOP(name)(), \
return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>(std::make_pair(GRID_BINOP(name)(),\ std::forward_as_tuple(lhs, rhs)))) { \
std::forward_as_tuple(lhs, rhs))); \ return LatticeBinaryExpression<GRID_BINOP(name), const T1 &, const T2 &>( \
std::make_pair(GRID_BINOP(name)(), std::forward_as_tuple(lhs, rhs))); \
} }
#define GRID_DEF_BINOP(op, name) \ #define GRID_DEF_BINOP(op, name) \
GRID_BINOP_LEFT(op, name); \ GRID_BINOP_LEFT(op, name); \
GRID_BINOP_RIGHT(op, name); GRID_BINOP_RIGHT(op, name);
#define GRID_DEF_TRINOP(op, name) \ #define GRID_DEF_TRINOP(op, name) \
template <typename T1,typename T2,typename T3> inline auto op(const T1 &pred,const T2&lhs,const T3 &rhs) \ template <typename T1, typename T2, typename T3> \
-> decltype(LatticeTrinaryExpression<GRID_TRINOP(name),const T1&,const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(),\ inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \
std::forward_as_tuple(pred,lhs,rhs)))) \ ->decltype( \
{\ LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &,const T3&>(std::make_pair(GRID_TRINOP(name)(), \ const T3 &>(std::make_pair( \
std::forward_as_tuple(pred,lhs, rhs))); \ GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs)))) { \
return LatticeTrinaryExpression<GRID_TRINOP(name), const T1 &, const T2 &, \
const T3 &>(std::make_pair( \
GRID_TRINOP(name)(), std::forward_as_tuple(pred, lhs, rhs))); \
} }
//////////////////////// ////////////////////////
// Operator definitions // Operator definitions
@ -325,11 +369,16 @@ GRID_DEF_UNOP(real,UnaryReal);
GRID_DEF_UNOP(imag, UnaryImag); GRID_DEF_UNOP(imag, UnaryImag);
GRID_DEF_UNOP(toReal, UnaryToReal); GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex); GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(abs ,UnaryAbs); //abs overloaded in cmath C++98; DON'T do the abs-fabs-dabs-labs thing GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt); GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt); GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin); GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos); GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin);
GRID_DEF_UNOP(acos, UnaryAcos);
GRID_DEF_UNOP(log, UnaryLog); GRID_DEF_UNOP(log, UnaryLog);
GRID_DEF_UNOP(exp, UnaryExp); GRID_DEF_UNOP(exp, UnaryExp);
@ -349,29 +398,29 @@ GRID_DEF_TRINOP(where,TrinaryWhere);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Op, class T1> template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr) auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> {
{ Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second))))> ret(
Lattice<decltype(expr.first.func(eval(0,std::get<0>(expr.second))))> ret(expr); expr);
return ret; return ret;
} }
template <class Op, class T1, class T2> template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr) auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second))))> eval(0, std::get<1>(expr.second))))> {
{
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second))))> ret(expr); eval(0, std::get<1>(expr.second))))>
ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2, class T3> template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), -> Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second)), eval(0, std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))))> eval(0, std::get<2>(expr.second))))> {
{
Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)), Lattice<decltype(expr.first.func(eval(0, std::get<0>(expr.second)),
eval(0, std::get<1>(expr.second)), eval(0, std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))))> ret(expr); eval(0, std::get<2>(expr.second))))>
ret(expr);
return ret; return ret;
} }
@ -382,7 +431,6 @@ template<class Op,class T1, class T2, class T3>
#undef GRID_DEF_UNOP #undef GRID_DEF_UNOP
#undef GRID_DEF_BINOP #undef GRID_DEF_BINOP
#undef GRID_DEF_TRINOP #undef GRID_DEF_TRINOP
} }
#if 0 #if 0

View File

@ -24,7 +24,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_LATTICE_BASE_H #ifndef GRID_LATTICE_BASE_H
@ -64,9 +65,6 @@ public:
class LatticeExpressionBase {}; class LatticeExpressionBase {};
template<class T> using Vector = std::vector<T,alignedAllocator<T> >; // Aligned allocator??
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator??
template <typename Op, typename T1> template <typename Op, typename T1>
class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase { class LatticeUnaryExpression : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase {
public: public:
@ -101,6 +99,7 @@ public:
int begin(void) { return 0;}; int begin(void) { return 0;};
int end(void) { return _odata.size(); } int end(void) { return _odata.size(); }
vobj & operator[](int i) { return _odata[i]; }; vobj & operator[](int i) { return _odata[i]; };
const vobj & operator[](int i) const { return _odata[i]; };
public: public:
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
@ -255,6 +254,18 @@ PARALLEL_FOR_LOOP
checkerboard=0; checkerboard=0;
} }
Lattice(const Lattice& r){ // copy constructor
_grid = r._grid;
checkerboard = r.checkerboard;
_odata.resize(_grid->oSites());// essential
PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){
_odata[ss]=r._odata[ss];
}
}
virtual ~Lattice(void) = default; virtual ~Lattice(void) = default;
template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){ template<class sobj> strong_inline Lattice<vobj> & operator = (const sobj & r){
@ -267,7 +278,7 @@ PARALLEL_FOR_LOOP
template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){ template<class robj> strong_inline Lattice<vobj> & operator = (const Lattice<robj> & r){
this->checkerboard = r.checkerboard; this->checkerboard = r.checkerboard;
conformable(*this,r); conformable(*this,r);
std::cout<<GridLogMessage<<"Lattice operator ="<<std::endl;
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int ss=0;ss<_grid->oSites();ss++){ for(int ss=0;ss<_grid->oSites();ss++){
this->_odata[ss]=r._odata[ss]; this->_odata[ss]=r._odata[ss];
@ -324,27 +335,27 @@ PARALLEL_FOR_LOOP
#include <lattice/Lattice_conformable.h> #include "Lattice_conformable.h"
#define GRID_LATTICE_EXPRESSION_TEMPLATES #define GRID_LATTICE_EXPRESSION_TEMPLATES
#ifdef GRID_LATTICE_EXPRESSION_TEMPLATES #ifdef GRID_LATTICE_EXPRESSION_TEMPLATES
#include <lattice/Lattice_ET.h> #include "Lattice_ET.h"
#else #else
#include <lattice/Lattice_overload.h> #include "Lattice_overload.h"
#endif #endif
#include <lattice/Lattice_arith.h> #include "Lattice_arith.h"
#include <lattice/Lattice_trace.h> #include "Lattice_trace.h"
#include <lattice/Lattice_transpose.h> #include "Lattice_transpose.h"
#include <lattice/Lattice_local.h> #include "Lattice_local.h"
#include <lattice/Lattice_reduction.h> #include "Lattice_reduction.h"
#include <lattice/Lattice_peekpoke.h> #include "Lattice_peekpoke.h"
#include <lattice/Lattice_reality.h> #include "Lattice_reality.h"
#include <lattice/Lattice_comparison_utils.h> #include "Lattice_comparison_utils.h"
#include <lattice/Lattice_comparison.h> #include "Lattice_comparison.h"
#include <lattice/Lattice_coordinate.h> #include "Lattice_coordinate.h"
#include <lattice/Lattice_where.h> #include "Lattice_where.h"
#include <lattice/Lattice_rng.h> #include "Lattice_rng.h"
#include <lattice/Lattice_unary.h> #include "Lattice_unary.h"
#include <lattice/Lattice_transfer.h> #include "Lattice_transfer.h"
#endif #endif

View File

@ -164,15 +164,17 @@ PARALLEL_FOR_LOOP
assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx; int odx,idx;
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
std::vector<sobj> buf(Nsimd); scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * pt = (scalar_type *)&s;
extract(l._odata[odx],buf); for(int w=0;w<words;w++){
pt[w] = vp[idx+w*Nsimd];
s = buf[idx]; }
return; return;
}; };
@ -190,18 +192,17 @@ PARALLEL_FOR_LOOP
assert( l.checkerboard== l._grid->CheckerBoard(site)); assert( l.checkerboard== l._grid->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx; int odx,idx;
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
std::vector<sobj> buf(Nsimd); scalar_type * vp = (scalar_type *)&l._odata[odx];
scalar_type * pt = (scalar_type *)&s;
// extract-modify-merge cycle is easiest way and this is not perf critical for(int w=0;w<words;w++){
extract(l._odata[odx],buf); vp[idx+w*Nsimd] = pt[w];
}
buf[idx] = s;
merge(l._odata[odx],buf);
return; return;
}; };

View File

@ -40,7 +40,7 @@ namespace Grid {
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg); ComplexD nrm = innerProduct(arg,arg);
return real(nrm); return std::real(nrm);
} }
template<class vobj> template<class vobj>

View File

@ -297,8 +297,9 @@ namespace Grid {
int l_idx=generator_idx(o_idx,i_idx); int l_idx=generator_idx(o_idx,i_idx);
std::vector<int> site_seeds(4); const int num_rand_seed=16;
for(int i=0;i<4;i++){ std::vector<int> site_seeds(num_rand_seed);
for(int i=0;i<site_seeds.size();i++){
site_seeds[i]= ui(pseeder); site_seeds[i]= ui(pseeder);
} }

View File

@ -349,7 +349,7 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
assert(ig->_ldimensions[d] == og->_ldimensions[d]); assert(ig->_ldimensions[d] == og->_ldimensions[d]);
} }
PARALLEL_FOR_LOOP //PARALLEL_FOR_LOOP
for(int idx=0;idx<ig->lSites();idx++){ for(int idx=0;idx<ig->lSites();idx++){
std::vector<int> lcoor(ni); std::vector<int> lcoor(ni);
ig->LocalIndexToLocalCoor(idx,lcoor); ig->LocalIndexToLocalCoor(idx,lcoor);
@ -386,7 +386,7 @@ void InsertSlice(Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
PARALLEL_FOR_LOOP //PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){ for(int idx=0;idx<lg->lSites();idx++){
std::vector<int> lcoor(nl); std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh); std::vector<int> hcoor(nh);
@ -428,7 +428,7 @@ void ExtractSlice(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice, in
} }
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
PARALLEL_FOR_LOOP //PARALLEL_FOR_LOOP
for(int idx=0;idx<lg->lSites();idx++){ for(int idx=0;idx<lg->lSites();idx++){
std::vector<int> lcoor(nl); std::vector<int> lcoor(nl);
std::vector<int> hcoor(nh); std::vector<int> hcoor(nh);
@ -446,6 +446,79 @@ PARALLEL_FOR_LOOP
} }
// Copy one local slice of lowDim into higherDim.
//
// Every local site of lowDim whose coordinate along `orthog` equals
// `slice_lo` is written to the site of higherDim that has the same
// coordinates except that the `orthog` coordinate is `slice_hi`.
// Both lattices must have the same number of dimensions, the same processor
// layout and the same local extents (asserted below), so no communication
// is involved.
template<class vobj>
void InsertSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
  typedef typename vobj::scalar_object sobj;
  sobj site_value;

  GridBase *src_grid = lowDim._grid;
  GridBase *dst_grid = higherDim._grid;
  int src_nd = src_grid->_ndimension;
  int dst_nd = dst_grid->_ndimension;

  assert(src_nd == dst_nd);
  assert(orthog<dst_nd);
  assert(orthog>=0);

  for(int mu=0;mu<dst_nd;mu++){
    assert(src_grid->_processors[mu]  == dst_grid->_processors[mu]);
    assert(src_grid->_ldimensions[mu] == dst_grid->_ldimensions[mu]);
  }

  // the above should guarantee that the operations are local
  //PARALLEL_FOR_LOOP
  for(int site=0;site<src_grid->lSites();site++){
    std::vector<int> src_coor(src_nd);
    src_grid->LocalIndexToLocalCoor(site,src_coor);

    // Only sites lying in the requested source slice take part.
    if( src_coor[orthog] != slice_lo ) continue;

    std::vector<int> dst_coor(src_coor);
    dst_coor[orthog] = slice_hi;

    peekLocalSite(site_value,lowDim,src_coor);
    pokeLocalSite(site_value,higherDim,dst_coor);
  }
}
// Copy one local slice of higherDim into lowDim (the reverse direction of
// InsertSliceLocal).
//
// For every local site of lowDim whose coordinate along `orthog` equals
// `slice_lo`, the value is read from the site of higherDim with the same
// coordinates except that the `orthog` coordinate is `slice_hi`, and written
// to that lowDim site.
// Both lattices must have the same number of dimensions, the same processor
// layout and the same local extents (asserted below).
template<class vobj>
void ExtractSliceLocal(Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{
  typedef typename vobj::scalar_object sobj;
  sobj site_value;

  GridBase *dst_grid = lowDim._grid;
  GridBase *src_grid = higherDim._grid;
  int dst_nd = dst_grid->_ndimension;
  int src_nd = src_grid->_ndimension;

  assert(dst_nd == src_nd);
  assert(orthog<src_nd);
  assert(orthog>=0);

  for(int mu=0;mu<src_nd;mu++){
    assert(dst_grid->_processors[mu]  == src_grid->_processors[mu]);
    assert(dst_grid->_ldimensions[mu] == src_grid->_ldimensions[mu]);
  }

  // the above should guarantee that the operations are local
  //PARALLEL_FOR_LOOP
  for(int site=0;site<dst_grid->lSites();site++){
    std::vector<int> dst_coor(dst_nd);
    dst_grid->LocalIndexToLocalCoor(site,dst_coor);

    // Only sites lying in the requested destination slice are filled.
    if( dst_coor[orthog] != slice_lo ) continue;

    std::vector<int> src_coor(dst_coor);
    src_coor[orthog] = slice_hi;

    peekLocalSite(site_value,higherDim,src_coor);
    pokeLocalSite(site_value,lowDim,dst_coor);
  }
}
template<class vobj> template<class vobj>
void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine) void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
{ {
@ -482,6 +555,96 @@ void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
} }
// Unpack a SIMD-vectorized lattice into a flat array of scalar objects,
// one entry per local site, ordered lexicographically over the local
// dimensions of the lattice's grid.
//
// `out` is resized to the grid's lSites(). Only enabled when `vobj` is a
// SIMD-vectorized type and `sobj` is not.
template<typename vobj, typename sobj>
typename std::enable_if<isSIMDvectorized<vobj>::value && !isSIMDvectorized<sobj>::value, void>::type unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in){
  typedef typename vobj::vector_type vtype;

  GridBase* grid = in._grid;
  out.resize(grid->lSites());

  int ndim   = grid->Nd();
  int nlanes = vtype::Nsimd();

  // Inner (SIMD-lane) coordinate of every lane, computed once up front.
  std::vector<std::vector<int> > lane_coor(nlanes, std::vector<int>(ndim));
  for(int lane=0; lane < nlanes; lane++){
    grid->iCoorFromIindex(lane_coor[lane], lane);
  }

  int nouter = grid->oSites();
PARALLEL_FOR_LOOP
  for(int oidx = 0; oidx < nouter; oidx++){ // loop over outer index
    // For each lane, find where its scalar object lives in the output array.
    std::vector<sobj*> lane_dest(nlanes);

    std::vector<int> outer_coor(ndim);
    grid->oCoorFromOindex(outer_coor, oidx);

    std::vector<int> local_coor(ndim);
    for(int lane=0; lane < nlanes; lane++){
      // local coordinate = outer coordinate + lane coordinate * reduced extent
      for(int mu=0;mu<ndim;mu++){
        local_coor[mu] = outer_coor[mu] + grid->_rdimensions[mu]*lane_coor[lane][mu];
      }
      int lex_idx;
      Lexicographic::IndexFromCoor(local_coor, lex_idx, grid->_ldimensions);
      lane_dest[lane] = &out[lex_idx];
    }

    // Scatter the lanes of this vector object to their destinations.
    extract1(in._odata[oidx], lane_dest, 0);
  }
}
// Convert a Lattice from one precision to another.
//
// The input lattice is first unpacked by unvectorizeToLexOrdArray into a
// lexicographically ordered array of *output*-type scalar objects (the array
// element type is SobjOut, so the scalar conversion happens during that
// unpacking). The array is then re-packed into the SIMD layout of the output
// grid with merge().
//
// Preconditions (asserted):
//  - both grids have the same number of dimensions;
//  - both grids have identical local extents in every dimension. The scratch
//    array is sized and ordered by the input grid's local dimensions but is
//    indexed with lexicographic indices built from the output grid's local
//    dimensions, so a mismatch would read out of bounds or silently
//    scramble sites.
//
// The checkerboard tag of `out` is overwritten with that of `in`.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
  assert(out._grid->Nd() == in._grid->Nd());
  out.checkerboard = in.checkerboard;
  GridBase *in_grid=in._grid;
  GridBase *out_grid = out._grid;

  typedef typename VobjOut::scalar_object SobjOut;

  int ndim = out._grid->Nd();

  // The lexicographic indexing below is only meaningful if both grids
  // describe the same local volume.
  for(int d=0;d<ndim;d++){
    assert(in_grid->_ldimensions[d] == out_grid->_ldimensions[d]);
  }

  int out_nsimd = out_grid->Nsimd();

  // Inner (SIMD-lane) coordinate of every lane of the output layout.
  std::vector<std::vector<int> > out_icoor(out_nsimd);
  for(int lane=0; lane < out_nsimd; lane++){
    out_icoor[lane].resize(ndim);
    out_grid->iCoorFromIindex(out_icoor[lane], lane);
  }

  // Scalar, lexicographically ordered copy of the input in output precision.
  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
  unvectorizeToLexOrdArray(in_slex_conv, in);

PARALLEL_FOR_LOOP
  for(int out_oidx=0;out_oidx<out_grid->oSites();out_oidx++){
    std::vector<int> out_ocoor(ndim);
    out_grid->oCoorFromOindex(out_ocoor, out_oidx);

    // One source pointer per SIMD lane of this output vector object.
    std::vector<SobjOut*> ptrs(out_nsimd);

    std::vector<int> lcoor(ndim);
    for(int lane=0; lane < out_nsimd; lane++){
      // local coordinate = outer coordinate + lane coordinate * reduced extent
      for(int mu=0;mu<ndim;mu++)
        lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];

      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
      ptrs[lane] = &in_slex_conv[llex];
    }
    // Gather the per-lane scalar objects into the output vector object.
    merge(out._odata[out_oidx], ptrs, 0);
  }
}
} }
#endif #endif

View File

@ -17,7 +17,7 @@
#endif #endif
// Include user configuration file (this can define various configuration macros) // Include user configuration file (this can define various configuration macros)
#include <pugixml/pugiconfig.hpp> #include "pugiconfig.hpp"
#ifndef HEADER_PUGIXML_HPP #ifndef HEADER_PUGIXML_HPP
#define HEADER_PUGIXML_HPP #define HEADER_PUGIXML_HPP

View File

@ -55,10 +55,19 @@ namespace QCD {
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// QCD iMatrix types // QCD iMatrix types
// Index conventions: Lorentz x Spin x Colour // Index conventions: Lorentz x Spin x Colour
// note: static const int or constexpr will work for type deductions
// with the intel compiler (up to version 17)
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
static const int ColourIndex = 2; #define ColourIndex 2
static const int SpinIndex = 1; #define SpinIndex 1
static const int LorentzIndex= 0; #define LorentzIndex 0
// Also should make these a named enum type
static const int DaggerNo=0;
static const int DaggerYes=1;
static const int InverseNo=0;
static const int InverseYes=1;
// Useful traits is this a spin index // Useful traits is this a spin index
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE; //typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
@ -484,16 +493,27 @@ namespace QCD {
} //namespace QCD } //namespace QCD
} // Grid } // Grid
#include <qcd/utils/SpaceTimeGrid.h>
#include <qcd/spin/Dirac.h> #include <Grid/qcd/utils/SpaceTimeGrid.h>
#include <qcd/spin/TwoSpinor.h> #include <Grid/qcd/spin/Dirac.h>
#include <qcd/utils/LinalgUtils.h> #include <Grid/qcd/spin/TwoSpinor.h>
#include <qcd/utils/CovariantCshift.h> #include <Grid/qcd/utils/LinalgUtils.h>
#include <qcd/utils/SUn.h> #include <Grid/qcd/utils/CovariantCshift.h>
#include <qcd/action/Actions.h>
#include <qcd/hmc/integrators/Integrator.h> // Include representations
#include <qcd/hmc/integrators/Integrator_algorithm.h> #include <Grid/qcd/utils/SUn.h>
#include <qcd/hmc/HMC.h> #include <Grid/qcd/utils/SUnAdjoint.h>
#include <Grid/qcd/utils/SUnTwoIndex.h>
#include <Grid/qcd/representations/hmc_types.h>
#include <Grid/qcd/action/Actions.h>
#include <Grid/qcd/smearing/Smearing.h>
#include <Grid/qcd/hmc/integrators/Integrator.h>
#include <Grid/qcd/hmc/integrators/Integrator_algorithm.h>
#include <Grid/qcd/hmc/HMC.h>
#endif #endif

View File

@ -23,7 +23,8 @@ Author: neo <cossu@post.kek.jp>
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef QCD_ACTION_BASE #ifndef QCD_ACTION_BASE
@ -33,54 +34,120 @@ namespace QCD{
template <class GaugeField> template <class GaugeField>
class Action { class Action {
public: public:
bool is_smeared = false;
// Boundary conditions? // Heatbath? // Boundary conditions? // Heatbath?
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) = 0;// refresh pseudofermions virtual void refresh(const GaugeField& U,
GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual void deriv(const GaugeField &U,GaugeField & dSdU ) = 0; // evaluate the action derivative virtual void deriv(const GaugeField& U,
GaugeField& dSdU) = 0; // evaluate the action derivative
virtual ~Action(){}; virtual ~Action(){};
}; };
// Could derive PseudoFermion action with a PF field, FermionField, and a Grid; implement refresh // Indexing of tuple types
// Index<Wanted, std::tuple<...>>::value is the zero-based position of the
// first occurrence of Wanted among the tuple's element types.
// The primary template is declared but never defined, so asking for a type
// that is not in the tuple fails to compile.
template <class Wanted, class Tuple>
struct Index;

// Head of the tuple is the wanted type: position 0.
template <class Wanted, class... Rest>
struct Index<Wanted, std::tuple<Wanted, Rest...> > {
  static const std::size_t value = 0;
};

// Head is some other type: skip it and add one to the position found in
// the remaining element types.
template <class Wanted, class Head, class... Rest>
struct Index<Wanted, std::tuple<Head, Rest...> > {
  static const std::size_t value = 1 + Index<Wanted, std::tuple<Rest...> >::value;
};
/* /*
template<class GaugeField, class FermionField> template <class GaugeField>
class PseudoFermionAction : public Action<GaugeField> { struct ActionLevel {
public: public:
FermionField Phi; typedef Action<GaugeField>*
GridParallelRNG &pRNG; ActPtr; // now force the same colours as the rest of the code
GridBase &Grid;
PseudoFermionAction(GridBase &_Grid,GridParallelRNG &_pRNG) : Grid(_Grid), Phi(&_Grid), pRNG(_pRNG) { //Add supported representations here
};
virtual void refresh(const GaugeField &gauge) {
gaussian(Phi,pRNG);
};
}; unsigned int multiplier;
*/
template<class GaugeField> struct ActionLevel{
public:
typedef Action<GaugeField>* ActPtr; // now force the same colours as the rest of the code
int multiplier;
std::vector<ActPtr> actions; std::vector<ActPtr> actions;
ActionLevel(int mul = 1) : multiplier(mul) { ActionLevel(unsigned int mul = 1) : actions(0), multiplier(mul) {
assert (mul > 0); assert(mul >= 1);
}; };
void push_back(ActPtr ptr){ void push_back(ActPtr ptr) { actions.push_back(ptr); }
actions.push_back(ptr); };
*/
template <class GaugeField, class Repr = NoHirep >
struct ActionLevel {
public:
unsigned int multiplier;
// Fundamental repr actions separated because of the smearing
typedef Action<GaugeField>* ActPtr;
// construct a tuple of vectors of the actions for the corresponding higher
// representation fields
typedef typename AccessTypes<Action, Repr>::VectorCollection action_collection;
action_collection actions_hirep;
typedef typename AccessTypes<Action, Repr>::FieldTypeCollection action_hirep_types;
std::vector<ActPtr>& actions;
// Temporary conversion between ActionLevel and ActionLevelHirep
//ActionLevelHirep(ActionLevel<GaugeField>& AL ):actions(AL.actions), multiplier(AL.multiplier){}
ActionLevel(unsigned int mul = 1) : actions(std::get<0>(actions_hirep)), multiplier(mul) {
// initialize the hirep vectors to zero.
//apply(this->resize, actions_hirep, 0); //need a working resize
assert(mul >= 1);
};
//void push_back(ActPtr ptr) { actions.push_back(ptr); }
template < class Field >
void push_back(Action<Field>* ptr) {
// insert only in the correct vector
std::get< Index < Field, action_hirep_types>::value >(actions_hirep).push_back(ptr);
};
template < class ActPtr>
static void resize(ActPtr ap, unsigned int n){
ap->resize(n);
} }
//template <std::size_t I>
//auto getRepresentation(Repr& R)->decltype(std::get<I>(R).U) {return std::get<I>(R).U;}
// Loop on tuple for a callable function
template <std::size_t I = 1, typename Callable, typename ...Args>
inline typename std::enable_if<I == std::tuple_size<action_collection>::value, void>::type apply(
Callable, Repr& R,Args&...) const {}
template <std::size_t I = 1, typename Callable, typename ...Args>
inline typename std::enable_if<I < std::tuple_size<action_collection>::value, void>::type apply(
Callable fn, Repr& R, Args&... arguments) const {
fn(std::get<I>(actions_hirep), std::get<I>(R.rep), arguments...);
apply<I + 1>(fn, R, arguments...);
}
}; };
template<class GaugeField> using ActionSet = std::vector<ActionLevel< GaugeField > >;
//template <class GaugeField>
//using ActionSet = std::vector<ActionLevel<GaugeField> >;
}} template <class GaugeField, class R>
using ActionSet = std::vector<ActionLevel<GaugeField, R> >;
}
}
#endif #endif

View File

@ -40,25 +40,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
//////////////////////////////////////////// ////////////////////////////////////////////
// Abstract base interface // Abstract base interface
//////////////////////////////////////////// ////////////////////////////////////////////
#include <qcd/action/ActionBase.h> #include <Grid/qcd/action/ActionBase.h>
#include <qcd/action/ActionParams.h> #include <Grid/qcd/action/ActionParams.h>
//////////////////////////////////////////// ////////////////////////////////////////////
// Utility functions // Utility functions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <qcd/action/gauge/GaugeImpl.h> #include <Grid/qcd/action/gauge/GaugeImpl.h>
#include <qcd/utils/WilsonLoops.h> #include <Grid/qcd/utils/WilsonLoops.h>
#include <qcd/action/fermion/WilsonCompressor.h> //used by all wilson type fermions #include <Grid/qcd/action/fermion/WilsonCompressor.h> //used by all wilson type fermions
#include <qcd/action/fermion/FermionOperatorImpl.h> #include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
#include <qcd/action/fermion/FermionOperator.h> #include <Grid/qcd/action/fermion/FermionOperator.h>
#include <qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions #include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
//////////////////////////////////////////// ////////////////////////////////////////////
// Gauge Actions // Gauge Actions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <qcd/action/gauge/WilsonGaugeAction.h> #include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
#include <qcd/action/gauge/PlaqPlusRectangleAction.h> #include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
@ -107,41 +107,64 @@ typedef SymanzikGaugeAction<ConjugateGimplD> ConjugateSymanzikGaugeAction
// for EVERY .cc file. This define centralises the list and restores global push of impl cases // for EVERY .cc file. This define centralises the list and restores global push of impl cases
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
#define FermOpTemplateInstantiate(A) \
#define FermOp4dVecTemplateInstantiate(A) \
template class A<WilsonImplF>; \ template class A<WilsonImplF>; \
template class A<WilsonImplD>; \ template class A<WilsonImplD>; \
template class A<ZWilsonImplF>; \
template class A<ZWilsonImplD>; \
template class A<GparityWilsonImplF>; \ template class A<GparityWilsonImplF>; \
template class A<GparityWilsonImplD>; template class A<GparityWilsonImplD>;
#define AdjointFermOpTemplateInstantiate(A) \
template class A<WilsonAdjImplF>; \
template class A<WilsonAdjImplD>;
#define TwoIndexFermOpTemplateInstantiate(A) \
template class A<WilsonTwoIndexSymmetricImplF>; \
template class A<WilsonTwoIndexSymmetricImplD>;
#define FermOp5dVecTemplateInstantiate(A) \
template class A<DomainWallVec5dImplF>; \
template class A<DomainWallVec5dImplD>; \
template class A<ZDomainWallVec5dImplF>; \
template class A<ZDomainWallVec5dImplD>;
#define FermOpTemplateInstantiate(A) \
FermOp4dVecTemplateInstantiate(A) \
FermOp5dVecTemplateInstantiate(A)
#define GparityFermOpTemplateInstantiate(A) #define GparityFermOpTemplateInstantiate(A)
//////////////////////////////////////////// ////////////////////////////////////////////
// Fermion operators / actions // Fermion operators / actions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <qcd/action/fermion/WilsonFermion.h> // 4d wilson like #include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like
#include <qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like #include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
#include <qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types #include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
//#include <qcd/action/fermion/CloverFermion.h> //#include <Grid/qcd/action/fermion/CloverFermion.h>
#include <qcd/action/fermion/CayleyFermion5D.h> // Cayley types #include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
#include <qcd/action/fermion/DomainWallFermion.h> #include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <qcd/action/fermion/DomainWallFermion.h> #include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <qcd/action/fermion/MobiusFermion.h> #include <Grid/qcd/action/fermion/MobiusFermion.h>
#include <qcd/action/fermion/ScaledShamirFermion.h> #include <Grid/qcd/action/fermion/ZMobiusFermion.h>
#include <qcd/action/fermion/MobiusZolotarevFermion.h> #include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
#include <qcd/action/fermion/ShamirZolotarevFermion.h> #include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
#include <qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h> #include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
#include <qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h> #include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
#include <qcd/action/fermion/ContinuedFractionFermion5D.h> // Continued fraction #include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h> // Continued fraction
#include <qcd/action/fermion/OverlapWilsonContfracTanhFermion.h> #include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
#include <qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h> #include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
#include <qcd/action/fermion/PartialFractionFermion5D.h> // Partial fraction #include <Grid/qcd/action/fermion/PartialFractionFermion5D.h> // Partial fraction
#include <qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h> #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
#include <qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h> #include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// More maintainable to maintain the following typedef list centrally, as more "impl" targets // More maintainable to maintain the following typedef list centrally, as more "impl" targets
@ -157,6 +180,14 @@ typedef WilsonFermion<WilsonImplR> WilsonFermionR;
typedef WilsonFermion<WilsonImplF> WilsonFermionF; typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD; typedef WilsonFermion<WilsonImplD> WilsonFermionD;
typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR; typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF; typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD; typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
@ -167,6 +198,11 @@ typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
typedef MobiusFermion<WilsonImplR> MobiusFermionR; typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF; typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD; typedef MobiusFermion<WilsonImplD> MobiusFermionD;
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR; typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF; typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD; typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
@ -222,21 +258,21 @@ typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code // G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
#include <qcd/action/fermion/g5HermitianLinop.h> #include <Grid/qcd/action/fermion/g5HermitianLinop.h>
//////////////////////////////////////// ////////////////////////////////////////
// Pseudo fermion combinations for HMC // Pseudo fermion combinations for HMC
//////////////////////////////////////// ////////////////////////////////////////
#include <qcd/action/pseudofermion/EvenOddSchurDifferentiable.h> #include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
#include <qcd/action/pseudofermion/TwoFlavour.h> #include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
#include <qcd/action/pseudofermion/TwoFlavourRatio.h> #include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
#include <qcd/action/pseudofermion/TwoFlavourEvenOdd.h> #include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
#include <qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h> #include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
#include <qcd/action/pseudofermion/OneFlavourRational.h> #include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
#include <qcd/action/pseudofermion/OneFlavourRationalRatio.h> #include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
#include <qcd/action/pseudofermion/OneFlavourEvenOddRational.h> #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
#include <qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h> #include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
#endif #endif

View File

View File

@ -28,7 +28,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid.h>
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
@ -45,48 +48,107 @@ namespace QCD {
FourDimGrid, FourDimGrid,
FourDimRedBlackGrid,_M5,p), FourDimRedBlackGrid,_M5,p),
mass(_mass) mass(_mass)
{ { }
}
// Two-argument convenience overload: builds fixed coefficient vectors
// (diag = 1 everywhere; upper and lower = -1 everywhere except that the last
// upper entry and the first lower entry are set to `mass`) and forwards to
// the six-argument M5D with chi as both the second field argument and the
// output.
template<class Impl>
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
  int nfifth = this->Ls;

  std::vector<Coeff_t> diag (nfifth, 1.0);
  std::vector<Coeff_t> upper(nfifth,-1.0);
  std::vector<Coeff_t> lower(nfifth,-1.0);

  // Boundary entries carry the mass.
  upper[nfifth-1] = mass;
  lower[0]        = mass;

  M5D(psi,chi,chi,lower,diag,upper);
}
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din) void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
{ {
// Assemble Din
int Ls=this->Ls; int Ls=this->Ls;
std::vector<Coeff_t> diag = bs;
std::vector<Coeff_t> upper= cs;
std::vector<Coeff_t> lower= cs;
upper[Ls-1]=-mass*upper[Ls-1];
lower[0] =-mass*lower[0];
M5D(psi,psi,Din,lower,diag,upper);
}
// Builds the coefficient vectors from the beo/ceo members and forwards to the
// six-argument M5D: diag is a copy of beo, upper and lower are -ceo, with the
// last upper entry and the first lower entry additionally scaled by -mass.
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
{
  int nfifth = this->Ls;

  std::vector<Coeff_t> upper(nfifth);
  for(int s=0;s<nfifth;s++) {
    upper[s] = -ceo[s];
  }
  // lower starts out identical to upper; only the boundary entries differ.
  std::vector<Coeff_t> lower(upper);

  upper[nfifth-1] = -mass*upper[nfifth-1];
  lower[0]        = -mass*lower[0];

  std::vector<Coeff_t> diag = beo;
  M5D(psi,psi,chi,lower,diag,upper);
}
// Builds the coefficient vectors from the bee/cee members and forwards to the
// six-argument M5D: diag is a copy of bee, upper and lower are -cee, with the
// last upper entry and the first lower entry additionally scaled by -mass.
template<class Impl>
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
{
  int nfifth = this->Ls;

  std::vector<Coeff_t> upper(nfifth);
  for(int s=0;s<nfifth;s++) {
    upper[s] = -cee[s];
  }
  // lower starts out identical to upper; only the boundary entries differ.
  std::vector<Coeff_t> lower(upper);

  upper[nfifth-1] = -mass*upper[nfifth-1];
  lower[0]        = -mass*lower[0];

  std::vector<Coeff_t> diag = bee;
  M5D(psi,psi,chi,lower,diag,upper);
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag = bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for (int s=0;s<Ls;s++){ for (int s=0;s<Ls;s++){
// Assemble the 5d matrix
if ( s==0 ) { if ( s==0 ) {
// Din = bs psi[s] + cs[s] psi[s+1} upper[s] = -cee[s+1] ;
axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1); lower[s] = mass*cee[Ls-1];
// Din+= -mass*cs[s] psi[s+1}
axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) { } else if ( s==(Ls-1)) {
axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0); upper[s] = mass*cee[0];
axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1); lower[s] = -cee[s-1];
} else { } else {
axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1); upper[s]=-cee[s+1];
axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1); lower[s]=-cee[s-1];
} }
} }
M5Ddag(psi,psi,chi,lower,diag,upper);
} }
// Two-argument convenience overload: builds fixed coefficient vectors
// (diag = 1; upper and lower = -1, with the last upper entry and the first
// lower entry multiplied by -mass) and forwards to the six-argument M5Ddag
// with chi as both the second field argument and the output.
template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
{
  int nfifth = this->Ls;
  int last   = nfifth-1;

  std::vector<Coeff_t> diag (nfifth, 1.0);
  std::vector<Coeff_t> upper(nfifth,-1.0);
  std::vector<Coeff_t> lower(nfifth,-1.0);

  // Boundary entries pick up a factor of -mass.
  upper[last] = -mass*upper[last];
  lower[0]    = -mass*lower[0];

  M5Ddag(psi,chi,chi,lower,diag,upper);
}
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din) void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
{ {
int Ls=this->Ls; int Ls=this->Ls;
for(int s=0;s<Ls;s++){ std::vector<Coeff_t> diag =bs;
if ( s==0 ) { std::vector<Coeff_t> upper=cs;
axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1); std::vector<Coeff_t> lower=cs;
axpby_ssp_pminus(Din,1.0,Din,-mass*cs[Ls-1],psi,s,Ls-1); upper[Ls-1]=-mass*upper[Ls-1];
} else if ( s==(Ls-1)) { lower[0] =-mass*lower[0];
axpby_ssp_pplus (Din,bs[s],psi,-mass*cs[0],psi,s,0); M5Ddag(psi,psi,Din,lower,diag,upper);
axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
} else {
axpby_ssp_pplus (Din,bs[s],psi,cs[s+1],psi,s,s+1);
axpby_ssp_pminus(Din,1.0,Din,cs[s-1],psi,s,s-1);
}
}
} }
// override multiply
template<class Impl> template<class Impl>
RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi) RealD CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
{ {
@ -95,42 +157,14 @@ namespace QCD {
FermionField Din(psi._grid); FermionField Din(psi._grid);
// Assemble Din // Assemble Din
/*
for(int s=0;s<Ls;s++){
if ( s==0 ) {
// Din = bs psi[s] + cs[s] psi[s+1}
axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
// Din+= -mass*cs[s] psi[s+1}
axpby_ssp_pplus (Din,1.0,Din,-mass*cs[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(Din,bs[s],psi,-mass*cs[s],psi,s,0);
axpby_ssp_pplus (Din,1.0,Din,cs[s],psi,s,s-1);
} else {
axpby_ssp_pminus(Din,bs[s],psi,cs[s],psi,s,s+1);
axpby_ssp_pplus(Din,1.0,Din,cs[s],psi,s,s-1);
}
}
*/
Meooe5D(psi,Din); Meooe5D(psi,Din);
this->DW(Din,chi,DaggerNo); this->DW(Din,chi,DaggerNo);
// ((b D_W + D_w hop terms +1) on s-diag // ((b D_W + D_w hop terms +1) on s-diag
axpby(chi,1.0,1.0,chi,psi); axpby(chi,1.0,1.0,chi,psi);
// Call Mooee?? M5D(psi,chi);
for(int s=0;s<Ls;s++){ return(norm2(chi));
if ( s==0 ){
axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,0);
axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
} else {
axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s+1);
axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s-1);
}
}
return norm2(chi);
} }
template<class Impl> template<class Impl>
@ -146,39 +180,7 @@ namespace QCD {
MeooeDag5D(Din,chi); MeooeDag5D(Din,chi);
int Ls=this->Ls; M5Ddag(psi,chi);
for(int s=0;s<Ls;s++){
// Collect the terms in DW
// Chi = bs Din[s] + cs[s] Din[s+1}
// Chi+= -mass*cs[s] psi[s+1}
/*
if ( s==0 ) {
axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,-mass*cs[Ls-1],Din,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus (chi,bs[s],Din,-mass*cs[0],Din,s,0);
axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
} else {
axpby_ssp_pplus (chi,bs[s],Din,cs[s+1],Din,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,cs[s-1],Din,s,s-1);
}
*/
// FIXME just call MooeeDag??
// Collect the terms indept of DW
if ( s==0 ){
axpby_ssp_pplus (chi,1.0,chi,-1.0,psi,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,mass,psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus (chi,1.0,chi,mass,psi,s,0);
axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
} else {
axpby_ssp_pplus(chi,1.0,chi,-1.0,psi,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,-1.0,psi,s,s-1);
}
}
// ((b D_W + D_w hop terms +1) on s-diag // ((b D_W + D_w hop terms +1) on s-diag
axpby (chi,1.0,1.0,chi,psi); axpby (chi,1.0,1.0,chi,psi);
return norm2(chi); return norm2(chi);
@ -189,30 +191,10 @@ namespace QCD {
void CayleyFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::Meooe (const FermionField &psi, FermionField &chi)
{ {
int Ls=this->Ls; int Ls=this->Ls;
FermionField tmp(psi._grid); FermionField tmp(psi._grid);
// Assemble the 5d matrix
Meooe5D(psi,tmp);
#if 0
std::cout << "Meooe Test replacement norm2 tmp = " <<norm2(tmp)<<std::endl;
for(int s=0;s<Ls;s++){
if ( s==0 ) {
// tmp = bs psi[s] + cs[s] psi[s+1}
// tmp+= -mass*cs[s] psi[s+1}
axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
} else {
axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
}
}
std::cout << "Meooe Test replacement norm2 tmp old = " <<norm2(tmp)<<std::endl;
#endif
// Apply 4d dslash Meooe5D(psi,tmp);
if ( psi.checkerboard == Odd ) { if ( psi.checkerboard == Odd ) {
this->DhopEO(tmp,chi,DaggerNo); this->DhopEO(tmp,chi,DaggerNo);
} else { } else {
@ -230,140 +212,16 @@ namespace QCD {
} else { } else {
this->DhopOE(psi,tmp,DaggerYes); this->DhopOE(psi,tmp,DaggerYes);
} }
MeooeDag5D(tmp,chi); MeooeDag5D(tmp,chi);
#if 0
std::cout << "Meooe Test replacement norm2 chi new = " <<norm2(chi)<<std::endl;
// Assemble the 5d matrix
int Ls=this->Ls;
for(int s=0;s<Ls;s++){
if ( s==0 ) {
axpby_ssp_pplus(chi,beo[s],tmp, -ceo[s+1] ,tmp,s,s+1);
axpby_ssp_pminus(chi, 1.0,chi,mass*ceo[Ls-1],tmp,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus(chi,beo[s],tmp,mass*ceo[0],tmp,s,0);
axpby_ssp_pminus(chi,1.0,chi,-ceo[s-1],tmp,s,s-1);
} else {
axpby_ssp_pplus(chi,beo[s],tmp,-ceo[s+1],tmp,s,s+1);
axpby_ssp_pminus(chi,1.0 ,chi,-ceo[s-1],tmp,s,s-1);
}
}
std::cout << "Meooe Test replacement norm2 chi old = " <<norm2(chi)<<std::endl;
#endif
}
template<class Impl>
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
for (int s=0;s<Ls;s++){
if ( s==0 ) {
axpby_ssp_pminus(chi,bee[s],psi ,-cee[s],psi,s,s+1);
axpby_ssp_pplus (chi,1.0,chi,mass*cee[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(chi,bee[s],psi,mass*cee[s],psi,s,0);
axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
} else {
axpby_ssp_pminus(chi,bee[s],psi,-cee[s],psi,s,s+1);
axpby_ssp_pplus (chi,1.0,chi,-cee[s],psi,s,s-1);
}
}
} }
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){ void CayleyFermion5D<Impl>::Mdir (const FermionField &psi, FermionField &chi,int dir,int disp){
int Ls=this->Ls;
FermionField tmp(psi._grid); FermionField tmp(psi._grid);
// Assemble the 5d matrix Meo5D(psi,tmp);
for(int s=0;s<Ls;s++){
if ( s==0 ) {
// tmp = bs psi[s] + cs[s] psi[s+1}
// tmp+= -mass*cs[s] psi[s+1}
axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi ,s, s+1);
axpby_ssp_pplus(tmp,1.0,tmp,mass*ceo[s],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pminus(tmp,beo[s],psi,mass*ceo[s],psi,s,0);
axpby_ssp_pplus(tmp,1.0,tmp,-ceo[s],psi,s,s-1);
} else {
axpby_ssp_pminus(tmp,beo[s],psi,-ceo[s],psi,s,s+1);
axpby_ssp_pplus (tmp,1.0,tmp,-ceo[s],psi,s,s-1);
}
}
// Apply 4d dslash fragment // Apply 4d dslash fragment
this->DhopDir(tmp,chi,dir,disp); this->DhopDir(tmp,chi,dir,disp);
} }
template<class Impl>
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
for (int s=0;s<Ls;s++){
// Assemble the 5d matrix
if ( s==0 ) {
axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1] ,psi,s,s+1);
axpby_ssp_pminus(chi,1.0,chi,mass*cee[Ls-1],psi,s,Ls-1);
} else if ( s==(Ls-1)) {
axpby_ssp_pplus(chi,bee[s],psi,mass*cee[0],psi,s,0);
axpby_ssp_pminus(chi,1.0,chi,-cee[s-1],psi,s,s-1);
} else {
axpby_ssp_pplus(chi,bee[s],psi,-cee[s+1],psi,s,s+1);
axpby_ssp_pminus(chi,1.0 ,chi,-cee[s-1],psi,s,s-1);
}
}
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
// Apply (L^{\prime})^{-1}
axpby_ssp (chi,1.0,psi, 0.0,psi,0,0); // chi[0]=psi[0]
for (int s=1;s<Ls;s++){
axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion Psi[s] -lee P_+ chi[s-1]
}
// L_m^{-1}
for (int s=0;s<Ls-1;s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
}
// U_m^{-1} D^{-1}
for (int s=0;s<Ls-1;s++){
// Chi[s] + 1/d chi[s]
axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
}
axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable
// Apply U^{-1}
for (int s=Ls-2;s>=0;s--){
axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1); // chi[Ls]
}
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
// Apply (U^{\prime})^{-dagger}
axpby_ssp (chi,1.0,psi, 0.0,psi,0,0); // chi[0]=psi[0]
for (int s=1;s<Ls;s++){
axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1);
}
// U_m^{-\dagger}
for (int s=0;s<Ls-1;s++){
axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s);
}
// L_m^{-\dagger} D^{-dagger}
for (int s=0;s<Ls-1;s++){
axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
}
axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable
// Apply L^{-dagger}
for (int s=Ls-2;s>=0;s--){
axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1); // chi[Ls]
}
}
// force terms; five routines; default to Dhop on diagonal // force terms; five routines; default to Dhop on diagonal
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag) void CayleyFermion5D<Impl>::MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag)
@ -415,12 +273,21 @@ namespace QCD {
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c) void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
{ {
SetCoefficientsZolotarev(1.0,zdata,b,c); std::vector<Coeff_t> gamma(this->Ls);
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
SetCoefficientsInternal(1.0,gamma,b,c);
} }
//Zolo //Zolo
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c) void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
{
std::vector<Coeff_t> gamma(this->Ls);
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
SetCoefficientsInternal(zolo_hi,gamma,b,c);
}
//Zolo
template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
{ {
int Ls=this->Ls; int Ls=this->Ls;
@ -458,7 +325,7 @@ namespace QCD {
double bmc = b-c; double bmc = b-c;
for(int i=0; i < Ls; i++){ for(int i=0; i < Ls; i++){
as[i] = 1.0; as[i] = 1.0;
omega[i] = ((double)zdata->gamma[i])*zolo_hi; //NB reciprocal relative to Chroma NEF code omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
bs[i] = 0.5*(bpc/omega[i] + bmc); bs[i] = 0.5*(bpc/omega[i] + bmc);
cs[i] = 0.5*(bpc/omega[i] - bmc); cs[i] = 0.5*(bpc/omega[i] - bmc);
} }
@ -520,12 +387,14 @@ namespace QCD {
} }
{ {
double delta_d=mass*cee[Ls-1]; Coeff_t delta_d=mass*cee[Ls-1];
for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j]; for(int j=0;j<Ls-1;j++) delta_d *= cee[j]/bee[j];
dee[Ls-1] += delta_d; dee[Ls-1] += delta_d;
} }
} }
FermOpTemplateInstantiate(CayleyFermion5D); FermOpTemplateInstantiate(CayleyFermion5D);
GparityFermOpTemplateInstantiate(CayleyFermion5D); GparityFermOpTemplateInstantiate(CayleyFermion5D);

View File

@ -51,6 +51,29 @@ namespace Grid {
virtual void MooeeDag (const FermionField &in, FermionField &out); virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out); virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out); virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Meo5D (const FermionField &psi, FermionField &chi);
virtual void M5D (const FermionField &psi, FermionField &chi);
virtual void M5Ddag(const FermionField &psi, FermionField &chi);
/////////////////////////////////////////////////////
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper);
void M5Ddag(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper);
void MooeeInternal(const FermionField &in, FermionField &out,int dag,int inv);
virtual void Instantiatable(void)=0; virtual void Instantiatable(void)=0;
// force terms; five routines; default to Dhop on diagonal // force terms; five routines; default to Dhop on diagonal
@ -68,23 +91,23 @@ namespace Grid {
RealD mass; RealD mass;
// Cayley form Moebius (tanh and zolotarev) // Cayley form Moebius (tanh and zolotarev)
std::vector<RealD> omega; std::vector<Coeff_t> omega;
std::vector<RealD> bs; // S dependent coeffs std::vector<Coeff_t> bs; // S dependent coeffs
std::vector<RealD> cs; std::vector<Coeff_t> cs;
std::vector<RealD> as; std::vector<Coeff_t> as;
// For preconditioning Cayley form // For preconditioning Cayley form
std::vector<RealD> bee; std::vector<Coeff_t> bee;
std::vector<RealD> cee; std::vector<Coeff_t> cee;
std::vector<RealD> aee; std::vector<Coeff_t> aee;
std::vector<RealD> beo; std::vector<Coeff_t> beo;
std::vector<RealD> ceo; std::vector<Coeff_t> ceo;
std::vector<RealD> aeo; std::vector<Coeff_t> aeo;
// LDU factorisation of the eeoo matrix // LDU factorisation of the eeoo matrix
std::vector<RealD> lee; std::vector<Coeff_t> lee;
std::vector<RealD> leem; std::vector<Coeff_t> leem;
std::vector<RealD> uee; std::vector<Coeff_t> uee;
std::vector<RealD> ueem; std::vector<Coeff_t> ueem;
std::vector<RealD> dee; std::vector<Coeff_t> dee;
// Constructors // Constructors
CayleyFermion5D(GaugeField &_Umu, CayleyFermion5D(GaugeField &_Umu,
@ -97,9 +120,20 @@ namespace Grid {
protected: protected:
void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c); void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
}; };
} }
} }
#define INSTANTIATE_DPERP(A)\
template void CayleyFermion5D< A >::M5D(const FermionField &psi,const FermionField &phi,FermionField &chi,\
std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
template void CayleyFermion5D< A >::M5Ddag(const FermionField &psi,const FermionField &phi,FermionField &chi,\
std::vector<Coeff_t> &lower,std::vector<Coeff_t> &diag,std::vector<Coeff_t> &upper); \
template void CayleyFermion5D< A >::MooeeInv (const FermionField &psi, FermionField &chi); \
template void CayleyFermion5D< A >::MooeeInvDag (const FermionField &psi, FermionField &chi);
#define CAYLEY_DPERP_CACHE
#undef CAYLEY_DPERP_LINALG
#endif #endif

View File

@ -0,0 +1,211 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus forwards
// Pplus backwards..
// Fifth-dimension hopping term, processed one block of Ls consecutive outer sites
// at a time. For every s in [0,Ls):
//
//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5m(psi[s_fwd]) + lower[s]*spProj5p(psi[s_bwd])
//
// where s_fwd is s+1 (wrapping from Ls-1 to 0) and s_bwd is s-1 (wrapping from 0 to
// Ls-1). Any mass / boundary factor must already be folded into upper[Ls-1] and
// lower[0] by the caller.
//
//   psi   : field supplying the neighbouring s-slices
//   phi   : field supplying the diagonal term; must share psi's checkerboard
//   chi   : result; its checkerboard is set to psi's
//   lower, diag, upper : per-s coefficients, each indexed 0..Ls-1
template<class Impl>
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
                                const FermionField &phi,
                                FermionField &chi,
                                std::vector<Coeff_t> &lower,
                                std::vector<Coeff_t> &diag,
                                std::vector<Coeff_t> &upper)
{
  int Ls =this->Ls;
  GridBase *grid=psi._grid;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    // Scratch destination for the projections. Declared once per block (as M5Ddag
    // does) instead of being re-copied from psi._odata[0] on every s iteration;
    // it stays local to the loop body so each iteration of the parallel loop has its own.
    auto tmp = psi._odata[0];
    for(int s=0;s<Ls;s++){
      // Neighbour selection, identical to the original s==0 / s==Ls-1 / interior cases.
      const int s_fwd = (s!=0 && s==(Ls-1)) ? 0 : s+1;
      const int s_bwd = (s==0) ? Ls-1 : s-1;

      spProj5m(tmp,psi._odata[ss+s_fwd]);
      chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*tmp;

      spProj5p(tmp,psi._odata[ss+s_bwd]);
      chi[ss+s]=chi[ss+s]+lower[s]*tmp;
    }
  }
}
// Daggered counterpart of M5D: same stencil, but the two projectors are exchanged.
// For every s within a block of Ls consecutive outer sites:
//
//   chi[s] = diag[s]*phi[s] + upper[s]*spProj5p(psi[above]) + lower[s]*spProj5m(psi[below])
//
// with above = s+1 (wrapping Ls-1 -> 0) and below = s-1 (wrapping 0 -> Ls-1).
// phi must share psi's checkerboard; chi's checkerboard is set to psi's.
template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
                                   const FermionField &phi,
                                   FermionField &chi,
                                   std::vector<Coeff_t> &lower,
                                   std::vector<Coeff_t> &diag,
                                   std::vector<Coeff_t> &upper)
{
  GridBase *grid=psi._grid;
  int Ls =this->Ls;
  assert(phi.checkerboard == psi.checkerboard);
  chi.checkerboard=psi.checkerboard;
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // one block of Ls sites per iteration
    auto proj = psi._odata[0];            // scratch destination for the projections
    for(int s=0;s<Ls;s++){
      int above;
      int below;
      if ( s==0 ) {
        above = s+1;
        below = Ls-1;
      } else if ( s==(Ls-1) ) {
        above = 0;
        below = s-1;
      } else {
        above = s+1;
        below = s-1;
      }
      spProj5p(proj,psi._odata[ss+above]);
      chi[ss+s]=diag[s]*phi[ss+s]+upper[s]*proj;
      spProj5m(proj,psi._odata[ss+below]);
      chi[ss+s]=chi[ss+s]+lower[s]*proj;
    }
  }
}
// Apply the inverse of the fifth-dimension (Mooee) operator: chi = Mooee^{-1} psi.
// Uses the member coefficient vectors lee, leem, dee, ueem and uee (declared in the
// class as the LDU factorisation of the eeoo matrix) and works on one block of Ls
// consecutive outer sites at a time. The four sweeps below update chi in place and
// each depends on the result of the previous one, so their order must not change.
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &chi)
{
GridBase *grid=psi._grid;
int Ls=this->Ls;
chi.checkerboard=psi.checkerboard;
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
auto tmp = psi._odata[0]; // scratch site object used as the projection destination
// Apply (L^{\prime})^{-1}: forward recursion in s
chi[ss]=psi[ss]; // chi[0]=psi[0]
for(int s=1;s<Ls;s++){
spProj5p(tmp,chi[ss+s-1]);
chi[ss+s] = psi[ss+s]-lee[s-1]*tmp; // chi[s] = psi[s] - lee[s-1] P_+ chi[s-1]
}
// L_m^{-1}: accumulate into the last slice only
for (int s=0;s<Ls-1;s++){ // chi[Ls-1] -= leem[s] P_- chi[s] for every s<Ls-1
spProj5m(tmp,chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - leem[s]*tmp;
}
// U_m^{-1} D^{-1}
for (int s=0;s<Ls-1;s++){
// chi[s] = chi[s]/dee[s] - (ueem[s]/dee[Ls-1]) P_+ chi[Ls-1]
// chi[Ls-1] is still unscaled here; its 1/dee[Ls-1] is applied inside the coefficient.
spProj5p(tmp,chi[ss+Ls-1]);
chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(ueem[s]/dee[Ls-1])*tmp;
}
// Scale the last slice only after the loop above has consumed its unscaled value.
chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];
// Apply U^{-1}: backward recursion from s=Ls-2 down to 0
for (int s=Ls-2;s>=0;s--){
spProj5m(tmp,chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - uee[s]*tmp; // chi[s] -= uee[s] P_- chi[s+1]
}
}
}
// Apply the daggered inverse of the fifth-dimension (Mooee) operator to psi, writing chi.
// Mirrors MooeeInv with the roles of the lower/upper factors and of the two
// projectors exchanged: the sweeps use uee, ueem, dee, leem, lee in that order.
// Works on one block of Ls consecutive outer sites at a time; the sweeps update chi
// in place and each depends on the previous one, so their order must not change.
//
// The former `assert(psi.checkerboard == psi.checkerboard)` compared psi with itself
// and therefore checked nothing; it has been dropped, matching MooeeInv.
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
  GridBase *grid=psi._grid;
  int Ls=this->Ls;
  chi.checkerboard=psi.checkerboard;
PARALLEL_FOR_LOOP
  for(int ss=0;ss<grid->oSites();ss+=Ls){ // adds Ls
    auto tmp = psi._odata[0]; // scratch site object used as the projection destination

    // Apply (U^{\prime})^{-dagger}: forward recursion in s
    chi[ss]=psi[ss];
    for (int s=1;s<Ls;s++){
      spProj5m(tmp,chi[ss+s-1]);
      chi[ss+s] = psi[ss+s]-uee[s-1]*tmp;
    }
    // U_m^{-\dagger}: accumulate into the last slice only
    for (int s=0;s<Ls-1;s++){
      spProj5p(tmp,chi[ss+s]);
      chi[ss+Ls-1] = chi[ss+Ls-1] - ueem[s]*tmp;
    }
    // L_m^{-\dagger} D^{-dagger}: chi[Ls-1] is still unscaled inside this loop
    for (int s=0;s<Ls-1;s++){
      spProj5m(tmp,chi[ss+Ls-1]);
      chi[ss+s] = (1.0/dee[s])*chi[ss+s]-(leem[s]/dee[Ls-1])*tmp;
    }
    // Scale the last slice only after the loop above has consumed its unscaled value.
    chi[ss+Ls-1]= (1.0/dee[Ls-1])*chi[ss+Ls-1];

    // Apply L^{-dagger}: backward recursion from s=Ls-2 down to 0
    for (int s=Ls-2;s>=0;s--){
      spProj5p(tmp,chi[ss+s+1]);
      chi[ss+s] = chi[ss+s] - lee[s]*tmp;
    }
  }
}
// Emit the explicit instantiations of the kernels defined in this file, but only
// when this implementation is the one selected via CAYLEY_DPERP_CACHE.
#ifdef CAYLEY_DPERP_CACHE
INSTANTIATE_DPERP(WilsonImplF);
INSTANTIATE_DPERP(WilsonImplD);
INSTANTIATE_DPERP(GparityWilsonImplF);
INSTANTIATE_DPERP(GparityWilsonImplD);
INSTANTIATE_DPERP(ZWilsonImplF);
INSTANTIATE_DPERP(ZWilsonImplD);
#endif
}}

View File

@ -0,0 +1,133 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Eigen/Dense>
#include <Grid.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
/*
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
}
*/
// Dense-matrix form of the fifth-dimension (Mooee) operator for grids whose
// s-direction is not SIMD-vectorised (asserts Ls == _rdimensions[0]).
//
// Two Ls x Ls real matrices are assembled from bee, cee and mass, one paired with
// each chirality projector. They are optionally inverted (inv) and/or replaced by
// their adjoint (dag), then applied site by site:
//
//   chi[s1] = 0.5 * sum_s2 ( recon5p( Mp(s1,s2) * proj5p(psi[s2]) )
//                          + recon5m( Mm(s1,s2) * proj5m(psi[s2]) ) )
//
// Exactly-zero matrix entries are skipped.
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
{
  int Ls=this->Ls;
  int LLs = psi._grid->_rdimensions[0];
  int vol = psi._grid->oSites()/LLs;

  chi.checkerboard=psi.checkerboard;

  assert(Ls==LLs);

  Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
  Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);

  // Diagonal entries.
  for(int d=0;d<Ls;d++){
    Pplus(d,d) = bee[d];
    Pminus(d,d)= bee[d];
  }

  // Nearest-neighbour entries: super-diagonal for Pminus, sub-diagonal for Pplus.
  for(int k=0;k+1<Ls;k++){
    Pminus(k,k+1) = -cee[k];
    Pplus(k+1,k)  = -cee[k+1];
  }

  // Wrap-around corners carry the mass; written last so they override anything above.
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];

  // Start from the plain matrices and replace them by their inverses on request.
  Eigen::MatrixXd PplusMat  = Pplus;
  Eigen::MatrixXd PminusMat = Pminus;
  if ( inv ) {
    PplusMat =Pplus.inverse();
    PminusMat=Pminus.inverse();
  }

  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }

  // For the non-vectorised s-direction this is simple
  for(int site=0;site<vol;site++){
    const int base = Ls*site;   // first lexical index of this site's s-line

    SiteSpinor     SiteChi;
    SiteHalfSpinor SitePplus;
    SiteHalfSpinor SitePminus;

    for(int row=0;row<Ls;row++){
      SiteChi =zero;
      for(int col=0;col<Ls;col++){
        int lex2 = col+base;
        if ( PplusMat(row,col) != 0.0 ) {
          spProj5p(SitePplus,psi[lex2]);
          accumRecon5p(SiteChi,PplusMat (row,col)*SitePplus);
        }
        if ( PminusMat(row,col) != 0.0 ) {
          spProj5m(SitePminus,psi[lex2]);
          accumRecon5m(SiteChi,PminusMat(row,col)*SitePminus);
        }
      }
      chi[row+base] = SiteChi*0.5;
    }
  }
}
// Explicit instantiations of the dense MooeeInternal for the four implementations
// listed here (single and double precision, with and without G-parity).
template void CayleyFermion5D<GparityWilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<GparityWilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<WilsonImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<WilsonImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
}}

View File

@ -0,0 +1,149 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus forwards
// Pplus backwards
// Fifth-dimension hopping term built from whole-field s-slice operations.
// For each slice s two calls are issued:
//   axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, nxt)
//   axpby_ssp_pplus (chi, 1.0,     chi, lower[s], psi, s, prv)
// where nxt is the slice after s (Ls-1 wraps to 0) and prv the slice before it
// (0 wraps to Ls-1). Boundary/mass factors are expected to be folded into
// upper[Ls-1] and lower[0] by the caller.
template<class Impl>
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
                                const FermionField &phi,
                                FermionField &chi,
                                std::vector<Coeff_t> &lower,
                                std::vector<Coeff_t> &diag,
                                std::vector<Coeff_t> &upper)
{
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
    // Same neighbour choice as the explicit s==0 / s==Ls-1 / interior cases.
    const int nxt = (s!=0 && s==(Ls-1)) ? 0 : s+1;
    const int prv = (s==0) ? Ls-1 : s-1;

    axpby_ssp_pminus(chi,diag[s],phi,upper[s],psi,s,nxt);
    axpby_ssp_pplus (chi,1.0,chi,lower[s],psi,s,prv);
  }
}
// Daggered counterpart of M5D built from whole-field s-slice operations: the same
// neighbour pattern, with the pplus/pminus variants exchanged.
// For each slice s:
//   axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, nxt)
//   axpby_ssp_pminus(chi, 1.0,     chi, lower[s], psi, s, prv)
// where nxt is the slice after s (Ls-1 wraps to 0) and prv the slice before it
// (0 wraps to Ls-1).
template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
                                   const FermionField &phi,
                                   FermionField &chi,
                                   std::vector<Coeff_t> &lower,
                                   std::vector<Coeff_t> &diag,
                                   std::vector<Coeff_t> &upper)
{
  int Ls=this->Ls;
  for(int s=0;s<Ls;s++){
    int nxt;
    int prv;
    if ( s==0 ) {
      nxt = s+1;
      prv = Ls-1;
    } else if ( s==(Ls-1) ) {
      nxt = 0;
      prv = s-1;
    } else {
      nxt = s+1;
      prv = s-1;
    }
    axpby_ssp_pplus (chi,diag[s],phi,upper[s],psi,s,nxt);
    axpby_ssp_pminus(chi,1.0,chi,lower[s],psi,s,prv);
  }
}
// Apply the inverse of the fifth-dimension (Mooee) operator: chi = Mooee^{-1} psi,
// expressed as whole-field s-slice updates using the member coefficient vectors
// lee, leem, dee, ueem and uee (declared in the class as the LDU factorisation of
// the eeoo matrix). chi is updated in place sweep after sweep, so the order of the
// sweeps must not change.
// NOTE(review): the per-call comments assume axpby_ssp_p*(z,a,x,b,y,s,sp) writes
// z[s] = a*x[s] + b*P y[sp] — confirm against the axpby_ssp definitions.
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi, FermionField &chi)
{
chi.checkerboard=psi.checkerboard;
int Ls=this->Ls;
// Apply (L^{\prime})^{-1}: forward recursion in s
axpby_ssp (chi,1.0,psi, 0.0,psi,0,0); // chi[0]=psi[0]
for (int s=1;s<Ls;s++){
axpby_ssp_pplus(chi,1.0,psi,-lee[s-1],chi,s,s-1);// recursion: chi[s] = psi[s] - lee[s-1] P_+ chi[s-1]
}
// L_m^{-1}: accumulate into the last slice only
for (int s=0;s<Ls-1;s++){ // chi[Ls-1] -= leem[s] P_- chi[s] for every s<Ls-1
axpby_ssp_pminus(chi,1.0,chi,-leem[s],chi,Ls-1,s);
}
// U_m^{-1} D^{-1}
for (int s=0;s<Ls-1;s++){
// chi[s] = chi[s]/dee[s] - (ueem[s]/dee[Ls-1]) P_+ chi[Ls-1]; chi[Ls-1] is still unscaled here
axpby_ssp_pplus(chi,1.0/dee[s],chi,-ueem[s]/dee[Ls-1],chi,s,Ls-1);
}
// Scale the last slice by 1/dee[Ls-1] only after the loop above has used its unscaled value.
axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable
// Apply U^{-1}: backward recursion from s=Ls-2 down to 0
for (int s=Ls-2;s>=0;s--){
axpby_ssp_pminus (chi,1.0,chi,-uee[s],chi,s,s+1); // chi[s] -= uee[s] P_- chi[s+1]
}
}
// Apply the daggered inverse of the fifth-dimension (Mooee) operator to psi, writing
// chi, expressed as whole-field s-slice updates. Mirrors MooeeInv with the lower and
// upper coefficient vectors, and the pplus/pminus variants, exchanged. chi is updated
// in place sweep after sweep, so the order of the sweeps must not change.
// NOTE(review): the per-call comments assume axpby_ssp_p*(z,a,x,b,y,s,sp) writes
// z[s] = a*x[s] + b*P y[sp] — confirm against the axpby_ssp definitions.
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
chi.checkerboard=psi.checkerboard;
int Ls=this->Ls;
// Apply (U^{\prime})^{-dagger}: forward recursion in s
axpby_ssp (chi,1.0,psi, 0.0,psi,0,0); // chi[0]=psi[0]
for (int s=1;s<Ls;s++){
axpby_ssp_pminus(chi,1.0,psi,-uee[s-1],chi,s,s-1); // chi[s] = psi[s] - uee[s-1] P_- chi[s-1]
}
// U_m^{-\dagger}: accumulate into the last slice only
for (int s=0;s<Ls-1;s++){
axpby_ssp_pplus(chi,1.0,chi,-ueem[s],chi,Ls-1,s); // chi[Ls-1] -= ueem[s] P_+ chi[s]
}
// L_m^{-\dagger} D^{-dagger}: chi[Ls-1] is still unscaled inside this loop
for (int s=0;s<Ls-1;s++){
axpby_ssp_pminus(chi,1.0/dee[s],chi,-leem[s]/dee[Ls-1],chi,s,Ls-1);
}
// Scale the last slice by 1/dee[Ls-1] only after the loop above has used its unscaled value.
axpby_ssp(chi,1.0/dee[Ls-1],chi,0.0,chi,Ls-1,Ls-1); // Modest avoidable
// Apply L^{-dagger}: backward recursion from s=Ls-2 down to 0
for (int s=Ls-2;s>=0;s--){
axpby_ssp_pplus (chi,1.0,chi,-lee[s],chi,s,s+1); // chi[s] -= lee[s] P_+ chi[s+1]
}
}
// Emit the explicit instantiations of the four kernels defined in this file (M5D,
// M5Ddag, MooeeInv, MooeeInvDag) when this implementation is selected via
// CAYLEY_DPERP_LINALG. The macro declared alongside the class for that purpose is
// INSTANTIATE_DPERP; the previous spelling, INSTANTIATE, is not the macro the header
// provides and only went unnoticed because CAYLEY_DPERP_LINALG is currently #undef'd.
#ifdef CAYLEY_DPERP_LINALG
INSTANTIATE_DPERP(WilsonImplF);
INSTANTIATE_DPERP(WilsonImplD);
INSTANTIATE_DPERP(GparityWilsonImplF);
INSTANTIATE_DPERP(GparityWilsonImplD);
#endif
}
}

View File

@ -0,0 +1,309 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Eigen/Dense>
#include <Grid.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
// Daggered inverse of the fifth-dimension (Mooee) operator: forwards to the dense
// implementation MooeeInternal with the dagger and inverse flags both set.
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
}
// Inverse of the fifth-dimension (Mooee) operator: forwards to the dense
// implementation MooeeInternal with the inverse flag set and no dagger.
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
}
// Fifth-dimension hopping term for grids whose s-direction is spread across SIMD
// lanes: the grid holds LLs = _rdimensions[0] vector slices per s-line and the
// code asserts Ls/LLs == Simd::Nsimd().
//
// Per vector slice v the result is
//   chi[v] = d[v]*phi[v] + u[v]*recon5m(0.5*proj5m(psi[v+1])) + l[v]*recon5p(0.5*proj5p(psi[v-1]))
// with v+1 / v-1 taken modulo LLs and a lane rotation applied whenever the
// neighbour index wraps.
//
//   psi   : field supplying the neighbouring slices
//   phi   : field supplying the diagonal term; must share psi's checkerboard
//   chi   : result; its checkerboard is set to psi's
//   lower, diag, upper : per-s coefficients, indexed 0..Ls-1
template<class Impl>
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
GridBase *grid=psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd();
// Coefficients repacked into SIMD form: one vector element per slice v.
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs==nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// just directly address via type pun
// (assumes each iSinglet<Simd> is laid out as nsimd contiguous scalar_type values — TODO confirm)
typedef typename Simd::scalar_type scalar_type;
scalar_type * u_p = (scalar_type *)&u[0];
scalar_type * l_p = (scalar_type *)&l[0];
scalar_type * d_p = (scalar_type *)&d[0];
// Scalar slot i of vector element o receives the coefficient for s = o + i*LLs.
for(int o=0;o<LLs;o++){ // outer
for(int i=0;i<nsimd;i++){ //inner
int s = o+i*LLs;
int ss = o*nsimd+i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0;v<LLs;v++){
int vp=(v+1)%LLs; // next slice, wrapping to 0
int vm=(v+LLs-1)%LLs; // previous slice, wrapping to LLs-1
spProj5m(hp,psi[ss+vp]);
spProj5p(hm,psi[ss+vm]);
// The neighbour index wrapped, so the wanted s value sits in a different lane.
// NOTE(review): presumably rotate by 1 / nsimd-1 shifts lanes in opposite directions — confirm against rotate().
if ( vp<=v ) rotate(hp,hp,1);
if ( vm>=v ) rotate(hm,hm,nsimd-1);
// NOTE(review): the 0.5 presumably normalises the proj/recon pair — confirm.
hp=hp*0.5;
hm=hm*0.5;
spRecon5m(fp,hp);
spRecon5p(fm,hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
}
}
// Daggered counterpart of the SIMD-vectorised M5D: identical coefficient packing and
// neighbour/rotation logic, with the two chirality projectors (and their matching
// reconstructions) exchanged.
//
// Per vector slice v the result is
//   chi[v] = d[v]*phi[v] + u[v]*recon5p(0.5*proj5p(psi[v+1])) + l[v]*recon5m(0.5*proj5m(psi[v-1]))
// with v+1 / v-1 taken modulo LLs and a lane rotation applied whenever the
// neighbour index wraps. Asserts Ls/LLs == Simd::Nsimd() and that phi shares psi's
// checkerboard; chi's checkerboard is set to psi's.
template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
GridBase *grid=psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd();
// Coefficients repacked into SIMD form: one vector element per slice v.
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs==nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// just directly address via type pun
// (assumes each iSinglet<Simd> is laid out as nsimd contiguous scalar_type values — TODO confirm)
typedef typename Simd::scalar_type scalar_type;
scalar_type * u_p = (scalar_type *)&u[0];
scalar_type * l_p = (scalar_type *)&l[0];
scalar_type * d_p = (scalar_type *)&d[0];
// Scalar slot i of vector element o receives the coefficient for s = o + i*LLs.
for(int o=0;o<LLs;o++){ // outer
for(int i=0;i<nsimd;i++){ //inner
int s = o+i*LLs;
int ss = o*nsimd+i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
PARALLEL_FOR_LOOP
for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0;v<LLs;v++){
int vp=(v+1)%LLs; // next slice, wrapping to 0
int vm=(v+LLs-1)%LLs; // previous slice, wrapping to LLs-1
spProj5p(hp,psi[ss+vp]);
spProj5m(hm,psi[ss+vm]);
// The neighbour index wrapped, so the wanted s value sits in a different lane.
// NOTE(review): presumably rotate by 1 / nsimd-1 shifts lanes in opposite directions — confirm against rotate().
if ( vp<=v ) rotate(hp,hp,1);
if ( vm>=v ) rotate(hm,hm,nsimd-1);
// NOTE(review): the 0.5 presumably normalises the proj/recon pair — confirm.
hp=hp*0.5;
hm=hm*0.5;
spRecon5p(fp,hp);
spRecon5m(fm,hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
}
}
// Apply the fifth-dimension (s-space) matrix, or its inverse and/or adjoint,
// to an Ls-vectorised field.
//
// Two dense Ls x Ls matrices are built from bee[], cee[] and mass, one applied
// to the spProj5p half of psi and one to the spProj5m half:
//   Pplus : diagonal bee[s], sub-diagonal -cee[s+1], corner (0,Ls-1)  = mass*cee[0]
//   Pminus: diagonal bee[s], super-diagonal -cee[s], corner (Ls-1,0)  = mass*cee[Ls-1]
//
// psi : input field.
// chi : output field; its checkerboard is set to that of psi.
// dag : non-zero -> use the adjoint of each matrix.
// inv : non-zero -> use the inverse of each matrix (dense Eigen inverse).
//       When both are set, the inverse is taken first and then the adjoint.
//
// Layout assumed: global slice s = o + l*LLs lives at outer index o, SIMD lane
// l, with LLs = psi._grid->_rdimensions[0] and LLs*Nsimd == Ls.
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
{
  const int Ls    = this->Ls;
  const int LLs   = psi._grid->_rdimensions[0];
  const int Nsimd = Simd::Nsimd();

  // The matrix packing and the column walk below both index as if the fifth
  // dimension tiles the SIMD lanes exactly; M5Ddag asserts the same layout.
  assert(LLs > 0);
  assert(LLs*Nsimd == Ls);

  const int vol = psi._grid->oSites()/LLs; // number of 4d sites (outer)

  chi.checkerboard=psi.checkerboard;

  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);

  for(int s=0;s<Ls;s++){
    Pplus(s,s) = bee[s];
    Pminus(s,s)= bee[s];
  }

  for(int s=0;s<Ls-1;s++){
    Pminus(s,s+1) = -cee[s];
    Pplus(s+1,s)  = -cee[s+1];
  }

  // Corner entries couple the two ends of the fifth dimension through the mass.
  // Written after the diagonal, as in the original ordering.
  Pplus (0,Ls-1) = mass*cee[0];
  Pminus(Ls-1,0) = mass*cee[Ls-1];

  Eigen::MatrixXcd PplusMat ;
  Eigen::MatrixXcd PminusMat;

  if ( inv ) {
    PplusMat =Pplus.inverse();
    PminusMat=Pminus.inverse();
  } else {
    PplusMat =Pplus;
    PminusMat=Pminus;
  }

  if(dag){
    PplusMat.adjointInPlace();
    PminusMat.adjointInPlace();
  }

  // Pack the matrices into SIMD words. Word [LLs*s2+s1] holds, in lane l, the
  // entry at row (l*LLs + s1), column s2, i.e. one column of the matrix laid
  // out exactly like a field's fifth dimension.
  typedef typename SiteHalfSpinor::scalar_type scalar_type;
  Vector<iSinglet<Simd> > Matp(Ls*LLs);
  Vector<iSinglet<Simd> > Matm(Ls*LLs);

  for(int s2=0;s2<Ls;s2++){
    for(int s1=0;s1<LLs;s1++){
      Simd Vp;
      Simd Vm;
      // Address the SIMD words as flat scalar arrays to set individual lanes.
      scalar_type *sp = reinterpret_cast<scalar_type *>(&Vp);
      scalar_type *sm = reinterpret_cast<scalar_type *>(&Vm);
      for(int l=0;l<Nsimd;l++){
        const int row = l*LLs+s1;
        sp[l] = PplusMat (row,s2);
        sm[l] = PminusMat(row,s2);
      }
      Matp[LLs*s2+s1] = Vp;
      Matm[LLs*s2+s1] = Vm;
    }
  }

  // Per-site scratch is held in Vector<> containers constructed inside the
  // loop body, so each iteration owns its own buffers. (An alloca-based
  // variant was previously tried and is no longer used.)
PARALLEL_FOR_LOOP
  for(int site=0;site<vol;site++){

    Vector<SiteHalfSpinor> SitePplus(LLs);
    Vector<SiteHalfSpinor> SitePminus(LLs);
    Vector<SiteHalfSpinor> SiteChiP(LLs);
    Vector<SiteHalfSpinor> SiteChiM(LLs);

    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;

    // Split every slice of this site into its two half-spinors and clear the
    // accumulators.
    for(int s=0;s<LLs;s++){
      const int lex = s+LLs*site;
      spProj5p(SitePplus[s] ,psi[lex]);
      spProj5m(SitePminus[s],psi[lex]);
      SiteChiP[s]=zero;
      SiteChiM[s]=zero;
    }

    // Matrix-vector product: walk the Ls columns in global order
    // (column = l*LLs + s2), broadcast that single slice of the right hand
    // side to all lanes, and accumulate column * value into every row word.
    for(int l=0; l<Nsimd;l++){ // simd lane
      for(int s2=0;s2<LLs;s2++){ // Column loop of right hand side
        const int col = l*LLs+s2;
        vbroadcast(BcastP,SitePplus [s2],l);
        vbroadcast(BcastM,SitePminus[s2],l);
        for(int s1=0;s1<LLs;s1++){ // Column loop of reduction variables
          SiteChiP[s1]=SiteChiP[s1]+Matp[LLs*col+s1]*BcastP;
          SiteChiM[s1]=SiteChiM[s1]+Matm[LLs*col+s1]*BcastM;
        }
      }
    }

    // Recombine the two halves into a full spinor and store.
    for(int s=0;s<LLs;s++){
      const int lex = s+LLs*site;
      SiteSpinor SiteChi; // per-slice temporary; a whole Vector is not needed
      spRecon5p(SiteChi,SiteChiP[s]);
      accumRecon5m(SiteChi,SiteChiM[s]);
      chi[lex] = SiteChi*0.5;
    }
  }
}
// Instantiate the fifth-dimension routines for the Ls-vectorised
// implementations (D/F suffix = double/float SIMD type, Z prefix = ComplexD
// coefficients).
// NOTE(review): INSTANTIATE_DPERP is defined outside this view; presumably it
// expands to explicit instantiations of M5D/M5Ddag and related members.
INSTANTIATE_DPERP(DomainWallVec5dImplD);
INSTANTIATE_DPERP(DomainWallVec5dImplF);
INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
// Explicit instantiations of MooeeInternal for the same four implementations,
// so the definition in this translation unit is emitted for each of them.
template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
}}

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_DOMAIN_WALL_FERMION_H #ifndef GRID_QCD_DOMAIN_WALL_FERMION_H
#define GRID_QCD_DOMAIN_WALL_FERMION_H #define GRID_QCD_DOMAIN_WALL_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {
@ -63,7 +63,7 @@ namespace Grid {
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
assert(zdata->n==this->Ls); assert(zdata->n==this->Ls);
std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl; // std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
// Call base setter // Call base setter
this->SetCoefficientsTanh(zdata,1.0,0.0); this->SetCoefficientsTanh(zdata,1.0,0.0);

View File

@ -25,14 +25,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_QCD_FERMION_OPERATOR_IMPL_H #ifndef GRID_QCD_FERMION_OPERATOR_IMPL_H
#define GRID_QCD_FERMION_OPERATOR_IMPL_H #define GRID_QCD_FERMION_OPERATOR_IMPL_H
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
@ -75,7 +75,7 @@ namespace Grid {
// //
// //
// template<class Impl> // template<class Impl>
// class MyOp : pubic<Impl> { // class MyOp : public<Impl> {
// public: // public:
// //
// INHERIT_ALL_IMPL_TYPES(Impl); // INHERIT_ALL_IMPL_TYPES(Impl);
@ -100,26 +100,35 @@ namespace Grid {
typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \ typedef typename Impl::SiteHalfSpinor SiteHalfSpinor; \
typedef typename Impl::Compressor Compressor; \ typedef typename Impl::Compressor Compressor; \
typedef typename Impl::StencilImpl StencilImpl; \ typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams; typedef typename Impl::ImplParams ImplParams; \
typedef typename Impl::Coeff_t Coeff_t;
#define INHERIT_IMPL_TYPES(Base) \ #define INHERIT_IMPL_TYPES(Base) \
INHERIT_GIMPL_TYPES(Base) \ INHERIT_GIMPL_TYPES(Base) \
INHERIT_FIMPL_TYPES(Base) INHERIT_FIMPL_TYPES(Base)
/////// /////////////////////////////////////////////////////////////////////////////
// Single flavour four spinors with colour index // Single flavour four spinors with colour index
/////// /////////////////////////////////////////////////////////////////////////////
template<class S,int Nrepresentation=Nc> template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD >
class WilsonImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public: public:
typedef PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > Gimpl; static const int Dimension = Representation::Dimension;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
//Necessary?
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
const bool LsVectorised=false;
typedef _Coeff_t Coeff_t;
INHERIT_GIMPL_TYPES(Gimpl); INHERIT_GIMPL_TYPES(Gimpl);
template<typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Ns> >; template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Ns> >;
template<typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Nrepresentation>, Nhs> >; template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template<typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds >; template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
typedef iImplSpinor<Simd> SiteSpinor; typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor; typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
@ -138,7 +147,12 @@ namespace Grid {
bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){ inline void multLink(SiteHalfSpinor &phi,
const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi,
int mu,
StencilEntry *SE,
StencilImpl &St) {
mult(&phi(), &U(mu), &chi()); mult(&phi(), &U(mu), &chi());
} }
@ -146,8 +160,10 @@ namespace Grid {
inline void loadLinkElement(Simd &reg, ref &memory) { inline void loadLinkElement(Simd &reg, ref &memory) {
reg = memory; reg = memory;
} }
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{ inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &Uds,
const GaugeField &Umu) {
conformable(Uds._grid, GaugeGrid); conformable(Uds._grid, GaugeGrid);
conformable(Umu._grid, GaugeGrid); conformable(Umu._grid, GaugeGrid);
GaugeLinkField U(GaugeGrid); GaugeLinkField U(GaugeGrid);
@ -168,9 +184,9 @@ namespace Grid {
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
int Ls=Btilde._grid->_fdimensions[0]; int Ls=Btilde._grid->_fdimensions[0];
GaugeLinkField tmp(mat._grid); GaugeLinkField tmp(mat._grid);
tmp = zero; tmp = zero;
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int sss=0;sss<tmp._grid->oSites();sss++){ for(int sss=0;sss<tmp._grid->oSites();sss++){
int sU=sss; int sU=sss;
@ -182,18 +198,19 @@ PARALLEL_FOR_LOOP
PokeIndex<LorentzIndex>(mat,tmp,mu); PokeIndex<LorentzIndex>(mat,tmp,mu);
} }
}; };
////////////////////////////////////////////////////////////////////////////////////
///////
// Single flavour four spinors with colour index, 5d redblack // Single flavour four spinors with colour index, 5d redblack
/////// ////////////////////////////////////////////////////////////////////////////////////
template<class S,int Nrepresentation=Nc>
class DomainWallRedBlack5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > { template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD>
class DomainWallVec5dImpl : public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {
public: public:
static const int Dimension = Nrepresentation;
const bool LsVectorised=true;
typedef _Coeff_t Coeff_t;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl; typedef PeriodicGaugeImpl<GaugeImplTypes<S, Nrepresentation> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl); INHERIT_GIMPL_TYPES(Gimpl);
@ -221,7 +238,7 @@ PARALLEL_FOR_LOOP
ImplParams Params; ImplParams Params;
DomainWallRedBlack5dImpl(const ImplParams &p= ImplParams()) : Params(p) {}; DomainWallVec5dImpl(const ImplParams &p = ImplParams()) : Params(p){};
bool overlapCommsCompute(void) { return false; }; bool overlapCommsCompute(void) { return false; };
@ -229,8 +246,10 @@ PARALLEL_FOR_LOOP
inline void loadLinkElement(Simd &reg, ref &memory) { inline void loadLinkElement(Simd &reg, ref &memory) {
vsplat(reg, memory); vsplat(reg, memory);
} }
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St)
{ inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
SiteGaugeLink UU; SiteGaugeLink UU;
for (int i = 0; i < Nrepresentation; i++) { for (int i = 0; i < Nrepresentation; i++) {
for (int j = 0; j < Nrepresentation; j++) { for (int j = 0; j < Nrepresentation; j++) {
@ -265,28 +284,32 @@ PARALLEL_FOR_LOOP
pokeLocalSite(ScalarUds, Uds, lcoor); pokeLocalSite(ScalarUds, Uds, lcoor);
} }
} }
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){ inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
{
assert(0); assert(0);
} }
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){ inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField &Atilde, int mu)
{
assert(0); assert(0);
} }
}; };
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
// Flavour doubled spinors; is Gparity the only? what about C*? // Flavour doubled spinors; is Gparity the only? what about C*?
//////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////
template<class S,int Nrepresentation> template <class S, int Nrepresentation,class _Coeff_t = RealD>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > { class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > {
public: public:
static const int Dimension = Nrepresentation;
const bool LsVectorised=false;
typedef _Coeff_t Coeff_t;
typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl; typedef ConjugateGaugeImpl< GaugeImplTypes<S,Nrepresentation> > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl); INHERIT_GIMPL_TYPES(Gimpl);
@ -313,8 +336,11 @@ PARALLEL_FOR_LOOP
bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; bool overlapCommsCompute(void) { return Params.overlapCommsCompute; };
// provide the multiply by link that is differentiated between Gparity (with flavour index) and non-Gparity // provide the multiply by link that is differentiated between Gparity (with
inline void multLink(SiteHalfSpinor &phi,const SiteDoubledGaugeField &U,const SiteHalfSpinor &chi,int mu,StencilEntry *SE,StencilImpl &St){ // flavour index) and non-Gparity
inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U,
const SiteHalfSpinor &chi, int mu, StencilEntry *SE,
StencilImpl &St) {
typedef SiteHalfSpinor vobj; typedef SiteHalfSpinor vobj;
typedef typename SiteHalfSpinor::scalar_object sobj; typedef typename SiteHalfSpinor::scalar_object sobj;
@ -384,7 +410,6 @@ PARALLEL_FOR_LOOP
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{ {
conformable(Uds._grid,GaugeGrid); conformable(Uds._grid,GaugeGrid);
conformable(Umu._grid,GaugeGrid); conformable(Umu._grid,GaugeGrid);
@ -394,7 +419,6 @@ PARALLEL_FOR_LOOP
Lattice<iScalar<vInteger> > coor(GaugeGrid); Lattice<iScalar<vInteger> > coor(GaugeGrid);
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu); LatticeCoordinate(coor,mu);
@ -408,7 +432,6 @@ PARALLEL_FOR_LOOP
Uconj = where(coor==neglink,-Uconj,Uconj); Uconj = where(coor==neglink,-Uconj,Uconj);
} }
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(auto ss=U.begin();ss<U.end();ss++){ for(auto ss=U.begin();ss<U.end();ss++){
Uds[ss](0)(mu) = U[ss](); Uds[ss](0)(mu) = U[ss]();
@ -441,6 +464,7 @@ PARALLEL_FOR_LOOP
} }
} }
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
// DhopDir provides U or Uconj depending on coor/flavour. // DhopDir provides U or Uconj depending on coor/flavour.
@ -454,6 +478,7 @@ PARALLEL_FOR_LOOP
PokeIndex<LorentzIndex>(mat, link, mu); PokeIndex<LorentzIndex>(mat, link, mu);
return; return;
} }
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) { inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls = Btilde._grid->_fdimensions[0]; int Ls = Btilde._grid->_fdimensions[0];
@ -471,20 +496,37 @@ PARALLEL_FOR_LOOP
PokeIndex<LorentzIndex>(mat, tmp, mu); PokeIndex<LorentzIndex>(mat, tmp, mu);
return; return;
} }
}; };
typedef WilsonImpl<vComplex ,Nc> WilsonImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation > WilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF,Nc> WilsonImplF; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF; // Float
typedef WilsonImpl<vComplexD,Nc> WilsonImplD; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD; // Double
typedef DomainWallRedBlack5dImpl<vComplex ,Nc> DomainWallRedBlack5dImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec
typedef DomainWallRedBlack5dImpl<vComplexF,Nc> DomainWallRedBlack5dImplF; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float
typedef DomainWallRedBlack5dImpl<vComplexD,Nc> DomainWallRedBlack5dImplD; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double
typedef WilsonImpl<vComplex, AdjointRepresentation > WilsonAdjImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, AdjointRepresentation > WilsonAdjImplF; // Float
typedef WilsonImpl<vComplexD, AdjointRepresentation > WilsonAdjImplD; // Double
typedef WilsonImpl<vComplex, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexSymmetricRepresentation > WilsonTwoIndexSymmetricImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc> DomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc> DomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc> DomainWallVec5dImplD; // Double
typedef DomainWallVec5dImpl<vComplex ,Nc,ComplexD> ZDomainWallVec5dImplR; // Real.. whichever prec
typedef DomainWallVec5dImpl<vComplexF,Nc,ComplexD> ZDomainWallVec5dImplF; // Float
typedef DomainWallVec5dImpl<vComplexD,Nc,ComplexD> ZDomainWallVec5dImplD; // Double
typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR; // Real.. whichever prec typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR; // Real.. whichever prec
typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF; // Float typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD; // Double typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD; // Double
} }}
}
#endif #endif

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_MOBIUS_FERMION_H #ifndef GRID_QCD_MOBIUS_FERMION_H
#define GRID_QCD_MOBIUS_FERMION_H #define GRID_QCD_MOBIUS_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H #ifndef GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
#define GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H #define GRID_QCD_MOBIUS_ZOLOTAREV_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H #ifndef OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
#define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H #define OVERLAP_WILSON_CAYLEY_TANH_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H #ifndef OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
#define OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H #define OVERLAP_WILSON_CAYLEY_ZOLOTAREV_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H #ifndef OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
#define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H #define OVERLAP_WILSON_CONTFRAC_TANH_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H #ifndef OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
#define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H #define OVERLAP_WILSON_CONTFRAC_ZOLOTAREV_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H #ifndef OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
#define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H #define OVERLAP_WILSON_PARTFRAC_TANH_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H #ifndef OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
#define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H #define OVERLAP_WILSON_PARTFRAC_ZOLOTAREV_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_SCALED_SHAMIR_FERMION_H #ifndef GRID_QCD_SCALED_SHAMIR_FERMION_H
#define GRID_QCD_SCALED_SHAMIR_FERMION_H #define GRID_QCD_SCALED_SHAMIR_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H #ifndef GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
#define GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H #define GRID_QCD_SHAMIR_ZOLOTAREV_FERMION_H
#include <Grid.h> #include <Grid/Grid.h>
namespace Grid { namespace Grid {

View File

@ -25,7 +25,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid.h> #include <Grid.h>
@ -33,8 +34,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
const std::vector<int> WilsonFermionStatic::directions ({0,1,2,3, 0, 1, 2, 3}); const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2,
const std::vector<int> WilsonFermionStatic::displacements({1,1,1,1,-1,-1,-1,-1}); 3});
const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1,
-1, -1});
int WilsonFermionStatic::HandOptDslash; int WilsonFermionStatic::HandOptDslash;
///////////////////////////////// /////////////////////////////////
@ -42,28 +45,29 @@ namespace QCD {
///////////////////////////////// /////////////////////////////////
template <class Impl> template <class Impl>
WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
GridRedBlackCartesian &Hgrid, const ImplParams &p)
RealD _mass,const ImplParams &p) : : Kernels(p),
Kernels(p),
_grid(&Fgrid), _grid(&Fgrid),
_cbgrid(&Hgrid), _cbgrid(&Hgrid),
Stencil(&Fgrid, npoint, Even, directions, displacements), Stencil(&Fgrid, npoint, Even, directions, displacements),
StencilEven(&Hgrid,npoint,Even,directions,displacements), // source is Even StencilEven(&Hgrid, npoint, Even, directions,
StencilOdd (&Hgrid,npoint,Odd ,directions,displacements), // source is Odd displacements), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions,
displacements), // source is Odd
mass(_mass), mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid), Umu(&Fgrid),
UmuEven(&Hgrid), UmuEven(&Hgrid),
UmuOdd (&Hgrid) UmuOdd(&Hgrid) {
{
// Allocate the required comms buffer // Allocate the required comms buffer
ImportGauge(_Umu); ImportGauge(_Umu);
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
{
GaugeField HUmu(_Umu._grid); GaugeField HUmu(_Umu._grid);
HUmu = _Umu * (-0.5); HUmu = _Umu * (-0.5);
Impl::DoubleStore(GaugeGrid(), Umu, HUmu); Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
@ -76,24 +80,21 @@ namespace QCD {
///////////////////////////// /////////////////////////////
template <class Impl> template <class Impl>
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
{
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerNo); Dhop(in, out, DaggerNo);
return axpy_norm(out, 4 + mass, in, out); return axpy_norm(out, 4 + mass, in, out);
} }
template <class Impl> template <class Impl>
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
{
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerYes); Dhop(in, out, DaggerYes);
return axpy_norm(out, 4 + mass, in, out); return axpy_norm(out, 4 + mass, in, out);
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
{
if (in.checkerboard == Odd) { if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerNo); DhopEO(in, out, DaggerNo);
} else { } else {
@ -101,8 +102,7 @@ namespace QCD {
} }
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
{
if (in.checkerboard == Odd) { if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerYes); DhopEO(in, out, DaggerYes);
} else { } else {
@ -130,7 +130,8 @@ namespace QCD {
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) { void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in,
FermionField &out) {
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
MooeeInv(in, out); MooeeInv(in, out);
} }
@ -140,12 +141,9 @@ namespace QCD {
/////////////////////////////////// ///////////////////////////////////
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DerivInternal(StencilImpl & st, void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
DoubledGaugeField & U, GaugeField &mat, const FermionField &A,
GaugeField &mat,
const FermionField &A,
const FermionField &B, int dag) { const FermionField &B, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
@ -157,7 +155,6 @@ namespace QCD {
st.HaloExchange(B, compressor); st.HaloExchange(B, compressor);
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Flip gamma (1+g)<->(1-g) if dag // Flip gamma (1+g)<->(1-g) if dag
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
@ -169,20 +166,20 @@ namespace QCD {
//////////////////////// ////////////////////////
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < B._grid->oSites(); sss++) { for (int sss = 0; sss < B._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma); Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu,
gamma);
} }
////////////////////////////////////////////////// //////////////////////////////////////////////////
// spin trace outer product // spin trace outer product
////////////////////////////////////////////////// //////////////////////////////////////////////////
Impl::InsertForce4D(mat, Btilde, Atilde, mu); Impl::InsertForce4D(mat, Btilde, Atilde, mu);
} }
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U,
{ const FermionField &V, int dag) {
conformable(U._grid, _grid); conformable(U._grid, _grid);
conformable(U._grid, V._grid); conformable(U._grid, V._grid);
conformable(U._grid, mat._grid); conformable(U._grid, mat._grid);
@ -193,8 +190,8 @@ PARALLEL_FOR_LOOP
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U,
{ const FermionField &V, int dag) {
conformable(U._grid, _cbgrid); conformable(U._grid, _cbgrid);
conformable(U._grid, V._grid); conformable(U._grid, V._grid);
conformable(U._grid, mat._grid); conformable(U._grid, mat._grid);
@ -207,8 +204,8 @@ PARALLEL_FOR_LOOP
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U,
{ const FermionField &V, int dag) {
conformable(U._grid, _cbgrid); conformable(U._grid, _cbgrid);
conformable(U._grid, V._grid); conformable(U._grid, V._grid);
conformable(U._grid, mat._grid); conformable(U._grid, mat._grid);
@ -220,118 +217,99 @@ PARALLEL_FOR_LOOP
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag); DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) { void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out,
int dag) {
conformable(in._grid, _grid); // verifies full grid conformable(in._grid, _grid); // verifies full grid
conformable(in._grid, out._grid); conformable(in._grid, out._grid);
out.checkerboard = in.checkerboard; out.checkerboard = in.checkerboard;
DhopInternal(Stencil,Umu,in,out,dag); DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) { void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out,
int dag) {
conformable(in._grid, _cbgrid); // verifies half grid conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check conformable(in._grid, out._grid); // drops the cb check
assert(in.checkerboard == Even); assert(in.checkerboard == Even);
out.checkerboard = Odd; out.checkerboard = Odd;
DhopInternal(StencilEven,UmuOdd,in,out,dag); DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) { void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,
int dag) {
conformable(in._grid, _cbgrid); // verifies half grid conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check conformable(in._grid, out._grid); // drops the cb check
assert(in.checkerboard == Odd); assert(in.checkerboard == Odd);
out.checkerboard = Even; out.checkerboard = Even;
DhopInternal(StencilOdd,UmuEven,in,out,dag); DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Mdir (const FermionField &in, FermionField &out,int dir,int disp) { void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out,
int dir, int disp) {
DhopDir(in, out, dir, disp); DhopDir(in, out, dir, disp);
} }
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir,int disp){ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out,
int dir, int disp) {
int skip = (disp == 1) ? 0 : 1; int skip = (disp == 1) ? 0 : 1;
int dirdisp = dir + skip * 4; int dirdisp = dir + skip * 4;
int gamma = dir + (1 - skip) * 4; int gamma = dir + (1 - skip) * 4;
DhopDirDisp(in, out, dirdisp, gamma, DaggerNo); DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
}; };
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp,int gamma,int dag) { void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
int dirdisp, int gamma, int dag) {
Compressor compressor(dag); Compressor compressor(dag);
Stencil.HaloExchange(in, compressor); Stencil.HaloExchange(in, compressor);
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma); Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out,
dirdisp, gamma);
} }
}; };
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl & st,DoubledGaugeField & U, void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
const FermionField &in, FermionField &out,int dag) DoubledGaugeField &U,
{ const FermionField &in,
DhopInternalCommsThenCompute(st,U,in,out,dag); FermionField &out, int dag) {
}
template<class Impl>
void WilsonFermion<Impl>::DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
st.HaloExchange(in, compressor); st.HaloExchange(in, compressor);
if (dag == DaggerYes) { if (dag == DaggerYes) {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out); Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
out);
} }
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) { for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out); Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in,
} out);
}
} else {
if( HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
} else {
PARALLEL_FOR_LOOP
for(int sss=0;sss<in._grid->oSites();sss++){
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out);
}
} }
} }
}; };
FermOpTemplateInstantiate(WilsonFermion); FermOpTemplateInstantiate(WilsonFermion);
AdjointFermOpTemplateInstantiate(WilsonFermion);
TwoIndexFermOpTemplateInstantiate(WilsonFermion);
GparityFermOpTemplateInstantiate(WilsonFermion); GparityFermOpTemplateInstantiate(WilsonFermion);
}
}
}}

View File

@ -24,7 +24,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#ifndef GRID_QCD_WILSON_FERMION_H #ifndef GRID_QCD_WILSON_FERMION_H
@ -44,8 +45,7 @@ namespace Grid {
}; };
template <class Impl> template <class Impl>
class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
{
public: public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels; typedef WilsonKernels<Impl> Kernels;
@ -82,10 +82,12 @@ namespace Grid {
// Derivative interface // Derivative interface
//////////////////////// ////////////////////////
// Interface calls an internal routine // Interface calls an internal routine
void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); void DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V,
void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); int dag);
void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag); void DhopDerivOE(GaugeField &mat, const FermionField &U,
const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U,
const FermionField &V, int dag);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both // non-hermitian hopping term; half cb or both
@ -99,31 +101,22 @@ namespace Grid {
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp); void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp); void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
void DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp,int gamma,int dag) ; void DhopDirDisp(const FermionField &in, FermionField &out, int dirdisp,
int gamma, int dag);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Extra methods added by derived // Extra methods added by derived
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl & st, void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
DoubledGaugeField & U, const FermionField &A, const FermionField &B, int dag);
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(StencilImpl & st,DoubledGaugeField & U, void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out,int dag) ;
void DhopInternalCommsThenCompute(StencilImpl & st,DoubledGaugeField & U,
const FermionField &in, FermionField &out, int dag); const FermionField &in, FermionField &out, int dag);
// Constructor // Constructor
WilsonFermion(GaugeField &_Umu, WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
GridRedBlackCartesian &Hgrid, const ImplParams &p = ImplParams());
RealD _mass,
const ImplParams &p= ImplParams()
) ;
// DoubleStore impl dependent // DoubleStore impl dependent
void ImportGauge(const GaugeField &_Umu); void ImportGauge(const GaugeField &_Umu);
@ -134,7 +127,6 @@ namespace Grid {
// protected: // protected:
public: public:
RealD mass; RealD mass;
GridBase *_grid; GridBase *_grid;
@ -150,11 +142,14 @@ namespace Grid {
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
}; };
typedef WilsonFermion<WilsonImplF> WilsonFermionF; typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD; typedef WilsonFermion<WilsonImplD> WilsonFermionD;
} }
} }
#endif #endif

View File

@ -1,4 +1,3 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -39,8 +38,6 @@ namespace QCD {
// S-direction is INNERMOST and takes no part in the parity. // S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4}); const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1}); const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
int WilsonFermion5DStatic::HandOptDslash;
int WilsonFermion5DStatic::AsmOptDslash;
// 5d lattice for DWF. // 5d lattice for DWF.
template<class Impl> template<class Impl>
@ -65,6 +62,41 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
Lebesgue(_FourDimGrid), Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid) LebesgueEvenOdd(_FourDimRedBlackGrid)
{ {
if (Impl::LsVectorised) {
int nsimd = Simd::Nsimd();
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
assert(FourDimGrid._ndimension==4);
// Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimGrid._simd_layout[0] ==nsimd);
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimRedBlackGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimGrid._simd_layout[d]=1);
assert(FourDimRedBlackGrid._simd_layout[d]=1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
}
} else {
// some assertions // some assertions
assert(FiveDimGrid._ndimension==5); assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4); assert(FourDimGrid._ndimension==4);
@ -95,37 +127,18 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]); assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
} }
}
// Allocate the required comms buffer // Allocate the required comms buffer
ImportGauge(_Umu); ImportGauge(_Umu);
alltime=0;
commtime=0;
jointime=0;
dslashtime=0;
dslash1time=0;
} }
/*
template<class Impl> template<class Impl>
WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu, WilsonFermion5D<Impl>::WilsonFermion5D(int simd,GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid, GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid, GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _M5,const ImplParams &p) : RealD _M5,const ImplParams &p) :
Kernels(p),
_FiveDimGrid (&FiveDimGrid),
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
_FourDimGrid (&FourDimGrid),
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
Stencil (_FiveDimGrid,npoint,Even,directions,displacements),
StencilEven(_FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
StencilOdd (_FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
M5(_M5),
Umu(_FourDimGrid),
UmuEven(_FourDimRedBlackGrid),
UmuOdd (_FourDimRedBlackGrid),
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid)
{ {
int nsimd = Simd::Nsimd(); int nsimd = Simd::Nsimd();
@ -134,7 +147,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
assert(FiveDimRedBlackGrid._ndimension==5); assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction assert(FiveDimRedBlackGrid._checker_dim==0); // Checkerboard the s-direction
assert(FourDimGrid._ndimension==4); assert(FourDimGrid._ndimension==4);
assert(FourDimRedBlackGrid._ndimension==4);
// Dimension zero of the five-d is the Ls direction // Dimension zero of the five-d is the Ls direction
Ls=FiveDimGrid._fdimensions[0]; Ls=FiveDimGrid._fdimensions[0];
@ -147,15 +159,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
// Other dimensions must match the decomposition of the four-D fields // Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){ for(int d=0;d<4;d++){
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]); assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]); assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimGrid._simd_layout[d]=1); assert(FourDimGrid._simd_layout[d]=1);
assert(FourDimRedBlackGrid._simd_layout[d] ==1);
assert(FourDimRedBlackGrid._simd_layout[d] ==1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1); assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]); assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
@ -163,8 +170,68 @@ WilsonFermion5D<Impl>::WilsonFermion5D(int simd, GaugeField &_Umu,
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]); assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
} }
// Allocate the required comms buffer {
ImportGauge(_Umu); }
}
*/
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::vector<int> latt = GridDefaultLatt();
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _FourDimGrid->_Nprocessors;
if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " << DhopComputeTime << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
}
if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " <<DerivCommTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Compute time : " <<DerivComputeTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time : " <<DerivDhopComputeTime <<" us"<<std::endl;
std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NP << std::endl;
}
if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion5D Stencil"<<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"<<std::endl; StencilOdd.Report();
}
}
template<class Impl>
void WilsonFermion5D<Impl>::ZeroCounters(void) {
DhopCalls = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DerivCalls = 0;
DerivCommTime = 0;
DerivComputeTime = 0;
DerivDhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
} }
@ -201,7 +268,7 @@ PARALLEL_FOR_LOOP
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sU=ss; int sU=ss;
int sF = s+Ls*sU; int sF = s+Ls*sU;
Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma); Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma);
} }
} }
}; };
@ -214,6 +281,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
const FermionField &B, const FermionField &B,
int dag) int dag)
{ {
DerivCalls++;
assert((dag==DaggerNo) ||(dag==DaggerYes)); assert((dag==DaggerNo) ||(dag==DaggerYes));
conformable(st._grid,A._grid); conformable(st._grid,A._grid);
@ -224,12 +292,14 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
FermionField Btilde(B._grid); FermionField Btilde(B._grid);
FermionField Atilde(B._grid); FermionField Atilde(B._grid);
DerivCommTime-=usecond();
st.HaloExchange(B,compressor); st.HaloExchange(B,compressor);
DerivCommTime+=usecond();
Atilde=A; Atilde=A;
DerivComputeTime-=usecond();
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Flip gamma if dag // Flip gamma if dag
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
@ -240,6 +310,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
// Call the single hop // Call the single hop
//////////////////////// ////////////////////////
DerivDhopComputeTime -= usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int sss = 0; sss < U._grid->oSites(); sss++) { for (int sss = 0; sss < U._grid->oSites(); sss++) {
for (int s = 0; s < Ls; s++) { for (int s = 0; s < Ls; s++) {
@ -249,19 +320,17 @@ PARALLEL_FOR_LOOP
assert(sF < B._grid->oSites()); assert(sF < B._grid->oSites());
assert(sU < U._grid->oSites()); assert(sU < U._grid->oSites());
Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma); Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma);
//////////////////////////// ////////////////////////////
// spin trace outer product // spin trace outer product
//////////////////////////// ////////////////////////////
} }
} }
DerivDhopComputeTime += usecond();
Impl::InsertForce5D(mat, Btilde, Atilde, mu); Impl::InsertForce5D(mat, Btilde, Atilde, mu);
} }
DerivComputeTime += usecond();
} }
template<class Impl> template<class Impl>
@ -297,30 +366,6 @@ void WilsonFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
} }
template<class Impl>
void WilsonFermion5D<Impl>::Report(void)
{
std::cout<<GridLogMessage << "******************** WilsonFermion"<<std::endl;
std::cout<<GridLogMessage << "Wilson5d time "<<alltime <<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloBegin time "<<commtime <<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash time "<<dslashtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Dslash1 time "<<dslash1time<<" us"<<std::endl;
std::cout<<GridLogMessage << "HaloComplete time "<<jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "******************** Stencil"<<std::endl;
std::cout<<GridLogMessage << "Stencil all gather time "<<Stencil.halogtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil nosplice gather time "<<Stencil.nosplicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil splice gather time "<<Stencil.splicetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
std::cout<<GridLogMessage << "********************"<<std::endl;
}
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat, void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A, const FermionField &A,
@ -342,96 +387,62 @@ template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{
DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{ {
// assert((dag==DaggerNo) ||(dag==DaggerYes)); // assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond();
Compressor compressor(dag); Compressor compressor(dag);
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
int LLs = in._grid->_rdimensions[0]; int LLs = in._grid->_rdimensions[0];
commtime -=usecond(); DhopCommTime-=usecond();
// auto handle = st.HaloExchangeBegin(in,compressor);
// st.HaloExchangeComplete(handle);
st.HaloExchange(in,compressor); st.HaloExchange(in,compressor);
commtime +=usecond(); DhopCommTime+=usecond();
jointime -=usecond();
jointime +=usecond();
DhopComputeTime-=usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion // Dhop takes the 4d grid from U, and makes a 5d index for fermion
// Not loop ordering and data layout.
// Designed to create
// - per thread reuse in L1 cache for U
// - 8 linear access unit stride streams per thread for Fermion for hw prefetchable.
dslashtime -=usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) { for (int ss = 0; ss < U._grid->oSites(); ss++) {
for(int s=0;s<LLs;s++){
int sU = ss; int sU = ss;
int sF = s+LLs*sU; int sF = LLs * sU;
Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out); Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out);
} }
} #ifdef AVX512
} else { } else if (stat.is_init() ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sF,sU,in,out);
}
}
}
} else {
if( this->AsmOptDslash ) {
PARALLEL_FOR_LOOP int nthreads;
stat.start();
#pragma omp parallel
{
#pragma omp master
nthreads = omp_get_num_threads();
int mythread = omp_get_thread_num();
stat.enter(mythread);
#pragma omp for nowait
for(int ss=0;ss<U._grid->oSites();ss++) { for(int ss=0;ss<U._grid->oSites();ss++) {
for(int s=0;s<LLs;s++){
int sU=ss; int sU=ss;
int sF = s+LLs*sU; int sF=LLs*sU;
Kernels::DiracOptAsmDhopSite(st,U,st.comm_buf,sF,sU,in,out); Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
}
}
} else if( this->HandOptDslash ) {
PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){
for(int s=0;s<LLs;s++){
int sU=ss;
int sF = s+LLs*sU;
Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sF,sU,in,out);
} }
stat.exit(mythread);
} }
stat.accum(nthreads);
#endif
} else { } else {
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for (int ss = 0; ss < U._grid->oSites(); ss++) { for (int ss = 0; ss < U._grid->oSites(); ss++) {
for(int s=0;s<LLs;s++){
int sU = ss; int sU = ss;
int sF = s+LLs*sU; int sF = LLs * sU;
Kernels::DiracOptDhopSite(st,U,st.comm_buf,sF,sU,in,out); Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out);
} }
} }
} DhopComputeTime+=usecond();
}
dslashtime +=usecond();
alltime+=usecond();
} }
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls++;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check conformable(in._grid,out._grid); // drops the cb check
@ -443,6 +454,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls++;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check conformable(in._grid,out._grid); // drops the cb check
@ -454,6 +466,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
template<class Impl> template<class Impl>
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls+=2;
conformable(in._grid,FermionGrid()); // verifies full grid conformable(in._grid,FermionGrid()); // verifies full grid
conformable(in._grid,out._grid); conformable(in._grid,out._grid);
@ -471,8 +484,6 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
FermOpTemplateInstantiate(WilsonFermion5D); FermOpTemplateInstantiate(WilsonFermion5D);
GparityFermOpTemplateInstantiate(WilsonFermion5D); GparityFermOpTemplateInstantiate(WilsonFermion5D);
template class WilsonFermion5D<DomainWallRedBlack5dImplF>;
template class WilsonFermion5D<DomainWallRedBlack5dImplD>;
}} }}

View File

@ -31,8 +31,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_QCD_WILSON_FERMION_5D_H #ifndef GRID_QCD_WILSON_FERMION_5D_H
#define GRID_QCD_WILSON_FERMION_5D_H #define GRID_QCD_WILSON_FERMION_5D_H
namespace Grid { #include <Grid/Stat.h>
namespace Grid {
namespace QCD { namespace QCD {
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -49,8 +50,6 @@ namespace Grid {
class WilsonFermion5DStatic { class WilsonFermion5DStatic {
public: public:
// S-direction is INNERMOST and takes no part in the parity. // S-direction is INNERMOST and takes no part in the parity.
static int AsmOptDslash; // these are a temporary hack
static int HandOptDslash; // these are a temporary hack
static const std::vector<int> directions; static const std::vector<int> directions;
static const std::vector<int> displacements; static const std::vector<int> displacements;
const int npoint = 8; const int npoint = 8;
@ -62,11 +61,19 @@ namespace Grid {
public: public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels; typedef WilsonKernels<Impl> Kernels;
double alltime; PmuStat stat;
double jointime;
double commtime; void Report(void);
double dslashtime; void ZeroCounters(void);
double dslash1time; double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DerivCalls;
double DerivCommTime;
double DerivComputeTime;
double DerivDhopComputeTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@ -122,13 +129,6 @@ namespace Grid {
FermionField &out, FermionField &out,
int dag); int dag);
void DhopInternalCommsThenCompute(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
// Constructors // Constructors
WilsonFermion5D(GaugeField &_Umu, WilsonFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,
@ -138,18 +138,18 @@ namespace Grid {
double _M5,const ImplParams &p= ImplParams()); double _M5,const ImplParams &p= ImplParams());
// Constructors // Constructors
/*
WilsonFermion5D(int simd, WilsonFermion5D(int simd,
GaugeField &_Umu, GaugeField &_Umu,
GridCartesian &FiveDimGrid, GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid, GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid, GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams()); double _M5,const ImplParams &p= ImplParams());
*/
// DoubleStore // DoubleStore
void ImportGauge(const GaugeField &_Umu); void ImportGauge(const GaugeField &_Umu);
void Report(void);
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Data members require to support the functionality // Data members require to support the functionality
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@ -181,7 +181,7 @@ namespace Grid {
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
}; };
}
} }}
#endif #endif

Some files were not shown because too many files have changed in this diff Show More