1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-24 01:34:47 +01:00

Compare commits

..

2386 Commits

Author SHA1 Message Date
Peter Boyle
12eb2a6a34 Zero() change 2019-08-15 01:43:00 +01:00
Peter Boyle
7c8902b04f Merge branch 'develop' into feature/gpu-port 2019-08-15 01:33:07 +01:00
Peter Boyle
4278caa030 Force a couple of things to compile on NVCC 2019-08-15 01:32:03 +01:00
Peter Boyle
be37dfb6f8 Remove debug code 2019-08-15 01:31:40 +01:00
Peter Boyle
5e8437029f nvcc error suppress 2019-08-15 01:31:12 +01:00
Peter Boyle
e279b2be29 Merge develop 2019-08-14 23:01:59 +01:00
Peter Boyle
48e6efc7c9 Merge branch 'develop' into feature/gpu-port
Conflicts:
	Grid/qcd/action/fermion/WilsonKernelsAsm.cc
	Grid/qcd/action/fermion/implementation/ImprovedStaggeredFermionImplementation.h
	Grid/qcd/action/fermion/implementation/StaggeredKernelsAsm.h
	benchmarks/Benchmark_comms.cc
2019-08-14 18:56:54 +01:00
55c095f620 Merge pull request #226 from nils-asmussen/fix/Gauss
Fix compiling of MSource::Gauss for single precision
2019-08-14 17:50:38 +01:00
Peter Boyle
3e49dc8a67 Reduction finished and hopefully fixes CI regression fail on single precisoin and force 2019-08-14 15:18:34 +01:00
Peter Boyle
96ac56cace Double precision variants for summation accuracy 2019-08-14 13:08:01 +01:00
Peter Boyle
2b037e3daa Update todo list 2019-08-14 13:07:26 +01:00
Peter Boyle
2d2de7aede Freeze the seed 2019-08-14 13:07:11 +01:00
e3966aa49b Fix compiling of MSource::Gauss for single precision 2019-08-12 14:57:11 +01:00
Peter Boyle
ce97638bac Think the reduction is now sorted and cleaned up 2019-08-11 11:09:01 +01:00
Peter Boyle
53e3ab4131 Fix force term 2019-08-11 11:06:13 +01:00
c2c4252a07 Merge pull request #216 from nils-asmussen/feature/GaussianSmearing
feature/gaussian smearing
2019-08-08 12:29:55 +02:00
Peter Boyle
9cd33a7b9c Printing improvement 2019-07-31 08:01:24 +01:00
Peter Boyle
639dc1ab21 GPU reduction fix and also exit backtrace option 2019-07-31 01:23:23 +01:00
Peter Boyle
9117f61109 GPU friendly 2019-07-31 01:22:54 +01:00
Peter Boyle
bca36d9bc3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-07-30 22:51:23 +01:00
Peter Boyle
263dcbabab Simplify the comms benchmark 2019-07-30 22:51:04 +01:00
Peter Boyle
9dad7a0094 Reproducible reduction and axpy_norm offload from Gianluca.
Hopefully get CG running entirely on GPU
2019-07-30 00:14:12 +01:00
Peter Boyle
8c6016f717 Merge pull request #219 from mmphys/feature/include
Housekeeping. #include <Grid.h> ---> #include <Grid/Grid.h>
2019-07-29 23:08:01 +01:00
Peter Boyle
1282e1067f Do the force term on the accelerator too. Needed particularly because comms buffers
are device memory.
2019-07-29 22:58:35 +01:00
Peter Boyle
275c1c920f More info dump on error from CUDA 2019-07-26 12:18:53 +01:00
Peter Boyle
fe700a183a Getting HMC to run 2019-07-26 12:18:29 +01:00
Peter Boyle
34108296cd Merge branch 'develop' into feature/gpu-port
Conflicts:
	Grid/simd/Grid_avx512.h
2019-07-20 17:05:35 +01:00
Peter Boyle
76c704b84b Intrinsics for CLANG are now fixed in v6 2019-07-20 16:52:24 +01:00
Peter Boyle
ce255ec359 Relocate to fix build failure for comms none 2019-07-20 16:37:03 +01:00
Peter Boyle
1c096626cb Hypercube defaults to on if HPE detected, but override to off possible 2019-07-20 16:06:16 +01:00
Peter Boyle
ce8b247426 Compiles 2019-07-20 15:16:02 +01:00
Peter Boyle
80481f81be Constructor typo 2019-07-20 09:58:24 +01:00
Peter Boyle
d85dcc72df Multinode fix 2019-07-20 07:13:28 +01:00
Peter Boyle
3fedcd6d52 Compiles 2019-07-20 07:12:44 +01:00
Peter Boyle
25ba4c5f80 Merge branch 'develop' into feature/gpu-port
Conflicts:
	HMC/Mobius2p1fEOFA.cc
	tests/forces/Test_rect_force.cc
2019-07-19 11:01:55 +01:00
Peter Boyle
671bcbcccb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-07-19 10:48:22 +01:00
Peter Boyle
ff325376cb Fix single precision deriv test fail 2019-07-19 10:47:44 +01:00
Peter Boyle
9e926e3fc5 Build fix in develop 2019-07-19 10:01:52 +01:00
Peter Boyle
775eaee199 Fix for suspected Intel 2018.1 compiler bug under O3 2019-07-19 07:57:34 +01:00
Peter Boyle
0fd2827d5d Fix fail in single 2019-07-19 05:28:26 +01:00
Peter Boyle
bdd79f9ef8 TODO update 2019-07-18 22:04:28 +01:00
Peter Boyle
0695f8cec2 Single precision compile fix. Soon deprecate single precision 2019-07-18 22:02:31 +01:00
Peter Boyle
9fa705c5a0 comma fix 2019-07-18 21:38:11 +01:00
Peter Boyle
331f5a53dc New header 2019-07-18 14:51:09 +01:00
Peter Boyle
a23dc295ac Remove compiler errors and warnings 2019-07-18 14:47:02 +01:00
Peter Boyle
08904f830e Merge develop 2019-07-16 11:59:56 +01:00
Peter Boyle
fa9cd50c5b Merge branch 'develop' into feature/gpu-port 2019-07-16 11:55:17 +01:00
Peter Boyle
7c11525d1a Local stencil for complex wilson loops etc 2019-07-14 14:05:09 +01:00
Peter Boyle
42c1dbb1d1 General local stencil first cut for Patrick force term 2019-07-14 14:04:28 +01:00
Peter Boyle
6179acfda0 Put back a call that was required 2019-07-14 13:59:54 +01:00
Peter Boyle
07601ac1f5 Replace instantiation of Gparity 2019-07-12 17:18:12 +01:00
Peter Boyle
705a8098b2 Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port
Conflicts:
	Grid/stencil/Stencil.h
2019-07-12 17:14:11 +01:00
Peter Boyle
a29b43d755 Stencil comms cleaner 2019-07-12 17:12:25 +01:00
Peter Boyle
368c8369ce Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port 2019-07-12 17:11:29 +01:00
Peter Boyle
c0d89a2dbb TODO updates 2019-07-12 17:11:15 +01:00
Peter Boyle
78ebd93281 Cuda 9.1 happy 2019-07-12 17:11:00 +01:00
Peter Boyle
3d58daf70f Safety check 2019-07-12 17:10:35 +01:00
Peter Boyle
bd155ca5c0 Overlap comms with comput now supported 2019-07-12 09:09:40 +01:00
Peter Boyle
91e2cf9b40 All axes can be used for comms now 2019-07-12 09:08:26 +01:00
Peter Boyle
3cc9947731 Better welcome printing 2019-07-12 06:47:51 +01:00
Peter Boyle
f15eeb0283 localise scope of variables declared in macro 2019-07-12 06:47:01 +01:00
Peter Boyle
0996ba9396 Pretty messaging 2019-07-12 06:45:31 +01:00
Peter Boyle
966a203dcb Interactions with GPU compilation 2019-07-11 03:16:17 +01:00
Peter Boyle
44170cc15f Initialise CUDA device prior to entering MPI.
This may or may not interact with Summit which configures MPI - CUDA mapping with jsrun.
TBD
Cases of OpenMPI and MVAPICH are covered, and default to cudaSetDevice(0) otherwise
2019-07-11 03:14:23 +01:00
c3d0c176ab cleaning up Kl2 contraction 2019-07-05 16:29:46 +01:00
0a71f8bb10 Merge pull request #222 from guelpers/feature/kl2QEDseq
EMLepton: Multiple source-sink separations at once
2019-07-05 16:22:34 +01:00
3a31ba2ea2 Merge remote-tracking branch 'upstream/develop' into feature/kl2QEDseq 2019-07-03 14:37:56 +01:00
eac6337466 Hadrons: EMLepton: multiple source-sink separations at once 2019-07-03 14:36:34 +01:00
ab7537e002 Merge pull request #221 from fionnoh/bugfix/A2ALoop
Bugfix for A2ALoop module
2019-07-03 14:13:51 +01:00
Peter Boyle
6e3c3214a3 Offload loops 2019-07-02 17:25:40 +01:00
Peter Boyle
d6ffadb33b Coalesced write 2019-07-02 17:25:13 +01:00
Peter Boyle
4c3225412b Drop 5dVEC 2019-07-01 07:31:26 +01:00
Peter Boyle
b8f7bfbb26 Dont stream as poor perf in some cases 2019-07-01 07:30:25 +01:00
Peter Boyle
7b7c470917 Accelerator loop 2019-07-01 07:29:51 +01:00
Peter Boyle
532e226b22 cuda 9.1 fixes 2019-07-01 07:29:22 +01:00
Peter Boyle
6a13731818 Move GPU cuda call earlier 2019-07-01 07:28:41 +01:00
fionnoh
1059189abf Bugfix for A2ALoop module 2019-06-27 13:49:55 +08:00
Peter Boyle
1cd4ee0706 Thrust used on GPU builds 2019-06-18 12:50:35 +01:00
Peter Boyle
b8f71b6777 Fix NVCC warning unused variable 2019-06-17 13:58:45 +01:00
Peter Boyle
703dc20377 Compile tests fix 2019-06-16 13:59:29 +01:00
Peter Boyle
d976e5c514 Pow is being awkward in thrust for reasons I don't understand. Possible thrust bug. 2019-06-16 12:05:11 +01:00
Peter Boyle
d7b3efe893 Compile fix 2019-06-15 17:03:15 +01:00
Peter Boyle
f710d7bd45 TODO list update 2019-06-15 12:54:27 +01:00
Peter Boyle
cb336aa8f8 Thread loop constructs changing a little 2019-06-15 12:54:11 +01:00
Peter Boyle
462900b48d Modified entire test directory to suit new GPU constructs for looping 2019-06-15 12:53:27 +01:00
Peter Boyle
0561c2edeb Benchmarks modified for new GPU constructs 2019-06-15 12:52:56 +01:00
Peter Boyle
0184719216 Change to predicate type 2019-06-15 12:52:26 +01:00
Peter Boyle
24202dbc51 Thread loop construct change 2019-06-15 12:52:07 +01:00
Peter Boyle
d763c303c5 Clean acceleerator barrier 2019-06-15 12:51:45 +01:00
Peter Boyle
8e394d3bf9 New loop construct 2019-06-15 12:51:15 +01:00
Peter Boyle
b881d5489b Move SchurDiagTwoKappa to Algorithms 2019-06-15 12:50:45 +01:00
Peter Boyle
82306913a8 Move Schur operator into correct place 2019-06-15 12:49:22 +01:00
Peter Boyle
49f90cc7eb use pragma once 2019-06-15 12:45:22 +01:00
Peter Boyle
b77af0210b Thread loop. Probably deprecate this impl 2019-06-15 12:44:56 +01:00
Peter Boyle
5254ede2d8 New loops. Revisit as accelerator loop in future audit 2019-06-15 12:44:29 +01:00
Peter Boyle
16e5d7945e Hard to make 5D vec work with GPU code 2019-06-15 12:43:43 +01:00
Peter Boyle
decc99ca76 Accelerator version 2019-06-15 12:43:00 +01:00
Peter Boyle
464cd65931 Still to test this fully 2019-06-15 12:35:14 +01:00
Peter Boyle
a1ec2f4723 Still to test this routine fully 2019-06-15 12:33:55 +01:00
Peter Boyle
ea9662ec85 Thread loop changes 2019-06-15 09:09:57 +01:00
Peter Boyle
52c74f1cac Thread loop changes 2019-06-15 09:08:16 +01:00
Peter Boyle
9a13d2992c lean up 2019-06-15 09:05:16 +01:00
Peter Boyle
b0449ae270 Thread loop changes 2019-06-15 09:04:19 +01:00
Peter Boyle
1299225105 Accelerator loop changes 2019-06-15 09:03:46 +01:00
Peter Boyle
5925e7f405 Thread for changes 2019-06-15 09:01:30 +01:00
Peter Boyle
be1fd4930f Template instantiation make happy changes 2019-06-15 08:37:34 +01:00
Peter Boyle
377fa5dec1 looping construct 2019-06-15 08:36:48 +01:00
Peter Boyle
e8b78f596e Looping construct changes 2019-06-15 08:35:57 +01:00
Peter Boyle
09720c40cd Coalesced loops 2019-06-15 08:35:26 +01:00
Peter Boyle
bb024dd114 Loop construct changed 2019-06-15 08:30:05 +01:00
Peter Boyle
52456b9ec7 New loop construct 2019-06-15 08:28:45 +01:00
Peter Boyle
b285138be4 Better checking on types 2019-06-15 08:27:48 +01:00
Peter Boyle
c7dbf4c87e Scalar support for GPU threads 2019-06-15 08:25:43 +01:00
Peter Boyle
1e889c93b8 Insert a GPU synchronise 2019-06-15 08:23:26 +01:00
Peter Boyle
7379047482 Threading and acceleration primitives further changes. accelerator_barrier() needed and used 2019-06-15 08:22:48 +01:00
Peter Boyle
d836ce3b78 Clean up of acceleration and threading primitives 2019-06-15 08:14:21 +01:00
Peter Boyle
cefaacbc07 Changing accelerator loop. Still have work to do for multi-GPU code 2019-06-15 08:10:24 +01:00
Peter Boyle
0074ef7f69 thread loops 2019-06-15 08:04:29 +01:00
Peter Boyle
20359ca15f Coalesced loops. 2019-06-15 08:03:57 +01:00
Peter Boyle
736358b0cb Coalesced loops 2019-06-15 08:03:13 +01:00
Peter Boyle
6b692aa726 Thread loops 2019-06-15 08:02:26 +01:00
Peter Boyle
7f99e1cd3b Coalesced loops 2019-06-15 08:01:39 +01:00
Peter Boyle
f3c89df948 Thread loop changes 2019-06-15 08:00:37 +01:00
Peter Boyle
b7e6d111d7 Thread loop changes. Need to offload this file 2019-06-15 07:59:10 +01:00
Peter Boyle
f39cf69c33 Accelerator loop change 2019-06-15 07:58:23 +01:00
Peter Boyle
8e27338df2 Rationalise number of loop macros 2019-06-15 07:57:40 +01:00
Peter Boyle
bcbb5e9d26 Remove assembly tests 2019-06-15 07:57:05 +01:00
Peter Boyle
0ea7f5279d Accelerator loop changes 2019-06-15 07:56:14 +01:00
Peter Boyle
18e5de426d There is a stray use of predicatedWhere introduced by Andrew Lawson in the conserve currents.
The conserved currents need rewritten using data parallel operations.
2019-06-15 07:53:58 +01:00
Peter Boyle
e896d81235 Accelerator loop redefine. Coalesce most accesses, but ET engine still to go clean. 2019-06-15 07:52:44 +01:00
Peter Boyle
7b8ccff4f4 Accelerated coalesced loops in most cases 2019-06-15 07:48:00 +01:00
Peter Boyle
68541606ab Thread loop changes. Soon try these with accelerator loops and benchmark 2019-06-15 07:46:42 +01:00
Peter Boyle
339ea10cc7 First touch only on CPU code 2019-06-15 07:45:43 +01:00
Peter Boyle
d0d8dc8042 Thread loop changes 2019-06-15 07:45:09 +01:00
Peter Boyle
81eb1fd9f2 Accelerator loop changes for coalesced access 2019-06-15 07:44:47 +01:00
Peter Boyle
cb93d32cd9 Thread loop changes 2019-06-15 07:44:08 +01:00
Peter Boyle
8f223962ff Thread loop changed 2019-06-15 07:43:42 +01:00
Peter Boyle
36f06555a2 Simplify Impl 2019-06-09 22:26:27 +01:00
Peter Boyle
d6c0e0756d Remove GPU version 2019-06-09 11:23:42 +01:00
Peter Boyle
3e41b1055c Remove Gpu only kernels. 2019-06-09 11:20:01 +01:00
Peter Boyle
9fbcfe612c Update TODO list 2019-06-09 11:19:38 +01:00
Peter Boyle
e78a5e7838 ASM instantiation without link errors 2019-06-09 01:25:21 +01:00
Peter Boyle
da8d87e9da Cuda switch off 2019-06-08 17:11:38 +01:00
Peter Boyle
8e3a05d89b Moving the instantiation into a cleaner structure 2019-06-08 13:48:33 +01:00
Peter Boyle
8adc5da7dd Testig out approaches to kernel writing introducing SIMT_loop temporarily 2019-06-08 13:47:04 +01:00
Peter Boyle
29a244e423 Test of using a lane variable instead of repeated reference to threadIdx.y 2019-06-08 13:46:26 +01:00
Peter Boyle
18cbfecf02 Use symlinks in find command 2019-06-08 13:45:46 +01:00
Peter Boyle
c933ac2248 Temporarily introduce a SIMT_loop to test out approaches prior to making a global change to
accelerator_loop
2019-06-08 13:44:27 +01:00
Peter Boyle
ad2c433574 Instantiations move. Tried using Gianluca's suggestion about avoiding threadIdx but doesn't
seem to make a difference. Will revisit this and probably remove the lane parameter from the coalescedRead
2019-06-08 13:43:12 +01:00
Peter Boyle
86e7fb6e86 Instantiation relocation 2019-06-08 13:42:46 +01:00
Peter Boyle
fb91dda7be Hand instantiation moved location 2019-06-08 13:42:26 +01:00
Peter Boyle
82cf7bc5ab Move instantiation into fermion/instantiation 2019-06-08 13:41:46 +01:00
Peter Boyle
e452cc0a22 Move static variables into instantiation .cc file 2019-06-08 13:41:20 +01:00
Peter Boyle
4d2b938166 Remove explict instantiation from here 2019-06-08 13:41:01 +01:00
Peter Boyle
10d16ab76c Remove explict instantiation from here 2019-06-08 13:40:32 +01:00
Peter Boyle
1f997fa484 Instantiate via explict .cc files for parallel make. 2019-06-08 13:39:51 +01:00
Peter Boyle
dc5024e88c The GPU reduction was not working for me and causing errors. Need to revisit.
Gianluca is working on deterministic reduction/
2019-06-08 13:39:11 +01:00
Peter Boyle
6d77941990 Drop the 5D vec actions 2019-06-08 13:38:05 +01:00
Peter Boyle
0ee6e77cbc Compiles GPU and CPU, still gives good performance on CPU 2019-06-05 13:28:16 +01:00
Peter Boyle
18d3cde29a Compile on GPU workd 2019-06-05 00:14:58 +01:00
Peter Boyle
7323099966 Instatiation fix 2019-06-05 00:14:38 +01:00
Peter Boyle
6379651cdd Generic or GPU ready for benchmark test on GPU 2019-06-05 00:13:52 +01:00
Peter Boyle
ba4fd756b9 Fix signature, but deprecating this loops style 2019-06-05 00:12:36 +01:00
Peter Boyle
d185fc1ebf clean up instantiation 2019-06-05 00:11:52 +01:00
Peter Boyle
96b36d8367 Instantiation clean up 2019-06-05 00:11:27 +01:00
Peter Boyle
899f8b5065 Instantiation clean up 5d vec removal 2019-06-05 00:11:05 +01:00
Peter Boyle
c8d0483fe9 Remove 5d vectorisation 2019-06-05 00:10:37 +01:00
Peter Boyle
0f214e5f76 Clean up instantiation 2019-06-05 00:10:13 +01:00
Peter Boyle
8eea568426 GPU loop ; presently differentiated with ifdef, find a way to unify. 2019-06-05 00:09:28 +01:00
Peter Boyle
9636324069 GPU happy code 2019-06-05 00:08:54 +01:00
Peter Boyle
8a5489d9e6 Move the loop into a central kernel call. 2019-06-05 00:08:13 +01:00
Peter Boyle
8113845f9c coalesce loop. Need to rationalise this file 2019-06-04 23:49:29 +01:00
Peter Boyle
b47f73c222 GPU happy 2019-06-04 21:30:39 +01:00
Peter Boyle
5720ced0fd Simplifying 2019-06-04 21:30:08 +01:00
Peter Boyle
2c87b56b53 Making GPU happier 2019-06-04 21:29:44 +01:00
Peter Boyle
dbad48d802 Remove Ls vectorised DWF 2019-06-04 21:27:40 +01:00
Peter Boyle
4557a1365a Remove Ls vectorised DWF 2019-06-04 20:59:59 +01:00
Peter Boyle
16e9b87d98 Remove Ls vectorised DWF as unused and hard to maintain 2019-06-04 20:59:01 +01:00
Peter Boyle
685eea3d0f Small cosmetic 2019-06-04 20:58:14 +01:00
Peter Boyle
65b48831fb Simplify code 2019-06-04 20:56:30 +01:00
Peter Boyle
57396fc595 Simplify code 2019-06-04 20:56:23 +01:00
Peter Boyle
a2e199df50 Simplifying Cayley cases. 2019-06-04 20:54:52 +01:00
Peter Boyle
020346c848 WOrk list. Will have to clean up Fermion sector. 2019-06-04 20:54:00 +01:00
Peter Boyle
c2625a127e Non blocking loop. Want to change the naming here. 2019-06-04 20:52:59 +01:00
Peter Boyle
8794d35c78 GPU 2019-06-04 20:52:27 +01:00
Peter Boyle
24bff6dbe6 Minor improvements 2019-06-04 20:51:48 +01:00
Peter Boyle
45b15d10d3 GPU happy changes 2019-06-04 20:49:16 +01:00
Peter Boyle
33d6bbe32b GPU must use accelerator vectors 2019-06-04 20:48:52 +01:00
Peter Boyle
7a1569bd46 Annoying, cannot rely on equivalence of Grid ComplexD adn Eigen Complex type on GPU.
Solve with ComplexD typecasts but must be a better way
2019-06-04 20:47:49 +01:00
Peter Boyle
6e2e904a0e NVCC compiles happy. Start to develop strategy for writing generic
code for GPU kernels and CPU kernels.
2019-06-04 20:46:35 +01:00
Peter Boyle
d92a17f359 Suppress NVCC warnings in pugixml with pragma 2019-06-04 20:45:53 +01:00
Peter Boyle
47c063f984 Remove Ls Vec cases from benchmarks 2019-06-04 20:45:35 +01:00
Peter Boyle
7e27a5213a Tests builds clean. 2019-06-04 20:45:20 +01:00
Peter Boyle
ade4a126da Getting closer on the GPU port, but will start deleting 5th dim vectorised variants
for code maintainability
2019-06-04 11:53:44 +01:00
Peter Boyle
7b59ab5bd7 Compiling after reorganisation 2019-06-03 15:46:26 +01:00
Peter Boyle
fcd8cfe257 Gparity in 2019-06-03 15:45:09 +01:00
Peter Boyle
b4b53812cb Move implementation to specific implementation headers 2019-06-03 15:43:01 +01:00
Peter Boyle
085cac583f Implementation in header 2019-06-03 15:42:36 +01:00
Peter Boyle
25e3b8640c Move to header 2019-06-03 15:42:05 +01:00
Michael Marshall
c81d3d422d Housekeeping. #include <Grid.h> ---> #include <Grid/Grid.h> 2019-06-03 15:25:05 +01:00
Peter Boyle
44bbec50b0 Making GPU compile happy 2019-06-03 14:57:04 +01:00
Peter Boyle
ec68b67d5d Attempt at unified GPU and CPU kernel 2019-06-03 14:55:51 +01:00
Peter Boyle
778450e0c8 Move to implementation subdir 2019-06-03 14:53:56 +01:00
Peter Boyle
567aa5f366 Move to implementation subdir 2019-06-03 14:53:33 +01:00
Peter Boyle
2ab7e2b175 Force instantiation in .cc files.
Eventually move into multiple files
2019-06-03 14:52:59 +01:00
Peter Boyle
6f61be044d Dont instantiate in header 2019-06-03 14:52:01 +01:00
Peter Boyle
269e00509e Don't instantiate in header 2019-06-03 14:51:24 +01:00
Peter Boyle
a5e90b0ddc Making the kernels more GPU happy 2019-06-03 14:50:54 +01:00
Peter Boyle
5622faf226 pragma once ifdef guard 2019-06-03 14:50:26 +01:00
Peter Boyle
82ecd520c7 Macos happy fix under nvcc 2019-06-03 14:48:50 +01:00
620965781e MSource::Convolution remove test code 2019-06-02 13:44:19 +01:00
9c18638b24 MSource::Convolution let mom argument be Nd dimensional 2019-06-02 13:41:39 +01:00
4bfe678218 MSource::Gauss Integer is unsigned... 2019-06-02 12:36:57 +01:00
fc6e584f2c MSource::Gauss fix sign in exponent of normalization + use correct types 2019-06-02 11:52:05 +01:00
7c3f400fc5 MSource::Gauss add parameters tA and tB 2019-06-02 00:12:15 +01:00
4bca2c17ce MSource::Convolution rename parameters 2019-06-02 00:04:07 +01:00
8d540a4e85 MSource::Gauss add mom parameter + avoid Cshifts 2019-06-01 23:56:14 +01:00
b120ef1fe4 Merge pull request #217 from guelpers/feature/EMlepwall
Hadrons: EMLepton: Wall source
2019-05-30 11:13:27 +02:00
166feb6483 Hadrons: EMLepton: Wall source 2019-05-30 10:07:08 +01:00
f569813b60 remove commented code 2019-05-29 17:07:07 +01:00
0190ada714 Merge branch 'develop' into feature/GaussianSmearing 2019-05-29 17:01:17 +01:00
de1a1dccb3 MSource::Gauss and MSource::Convolution: change LatticeComplex to ComplexField 2019-05-29 16:25:45 +01:00
0b3f40ce16 MSource::Convolution fix sign in Momentum 2019-05-29 16:06:10 +01:00
e35e8da111 Revert "cleaning up Kl2 contraction"
This reverts commit f244fed6ab.
2019-05-29 11:23:17 +02:00
6fdf93d695 move momentum phase from MSource::Gauss to MSource::Convolution 2019-05-28 17:26:55 +01:00
Peter Boyle
ffde81f22a Nsimd() and coalesced support 2019-05-25 12:44:07 +01:00
Peter Boyle
d8098f1ecd coalesced support 2019-05-25 12:43:31 +01:00
Peter Boyle
aca788cf4f Move coalesced read into tensors 2019-05-25 12:43:00 +01:00
6064f96fde MSource::Gauss remove superfluous comment 2019-05-24 20:18:37 +01:00
4e52e46a2c MSource::Gauss fix missing factor 2019-05-24 20:16:09 +01:00
6b27369ade MSource::Convolution use type PropagatorField 2019-05-24 16:07:08 +01:00
ab2e5f88cd add fields as input (for scheduler) 2019-05-24 15:57:30 +01:00
f244fed6ab cleaning up Kl2 contraction 2019-05-24 13:08:35 +01:00
9b3701ae27 posibility to save/load schedules directly from the application parameters 2019-05-24 13:08:20 +01:00
4ac27340b9 moving VERSION file to the empty ChangeLog one, this create compilation problems with #include <version> in recent versions of LLVM and case-insensitive FS (typically macOS) 2019-05-24 13:05:17 +01:00
c7c0a1065f Merge pull request #214 from guelpers/feature/kl2QEDseq
Kl2 contraction with sequential propagators
2019-05-23 20:31:41 +01:00
80947130f9 Merge pull request #215 from fionnoh/develop
Added precision tuning to Hadrons parameterfile writing
2019-05-23 18:44:58 +01:00
fionnoh
0aee73ea6b Added precision tuning to Hadrons parameterfile writing 2019-05-23 18:43:25 +01:00
e43d59045e add option mom to MSource::Gauss 2019-05-23 17:33:32 +01:00
e553678599 add modules MSource::Gauss and MSource::Convolution 2019-05-23 16:38:13 +01:00
0290ee1f6d Merge pull request #213 from fionnoh/develop
Added ZFIMPL to SeqConserved module
2019-05-23 13:46:02 +01:00
9a34edcf9f Kl2 QED cleanup 2019-05-23 13:43:22 +01:00
fionnoh
246f10001e Added ZFIMPL to SeqGamma 2019-05-23 12:42:40 +01:00
e675c6a48c Merge remote-tracking branch 'upstream/develop' into feature/kl2QED 2019-05-23 12:41:54 +01:00
fionnoh
a66d110b88 Added ZFIMPL to SeqConserved module 2019-05-23 11:49:54 +01:00
Peter Boyle
918e673078 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-05-22 09:57:02 +01:00
Peter Boyle
44b53c3ba2 F1 ensemble running with 96%~ acceptance etc.. 2019-05-22 09:56:26 +01:00
Peter Boyle
2095c12eac Make detection of HPE 8600 automatic 2019-05-22 09:54:21 +01:00
Peter Boyle
a0e9f3b0a0 Plan for GPU port 2019-05-20 09:46:19 +01:00
ae5ad986e2 Merge remote-tracking branch 'upstream/develop' into feature/kl2QED 2019-05-19 14:35:46 +01:00
Peter Boyle
a9342c6ae5 Udpdate TODO afer gianluc marge 2019-05-18 22:58:25 +01:00
Peter Boyle
ee6f96d85c Merge pull request #210 from grid-test-organisation/feature/gpu-port-develop
Cayley fermion functions for GPUs
2019-05-18 19:06:20 +01:00
Peter Boyle
77ca45ff49 Merge pull request #211 from fionnoh/develop
Enum for gaugefix and bug fix for wall source
2019-05-18 18:57:52 +01:00
Peter Boyle
4e9df9e93c GPU patches 2019-05-18 17:43:11 +01:00
Peter Boyle
9fe68857a9 Runs multiGPU with coalesced access on tesseract 2019-05-18 17:42:41 +01:00
Peter Boyle
37336c9e0c Allow compress to be either vector or scalar types 2019-05-18 17:41:13 +01:00
Peter Boyle
6c4da3bbc7 Stencil now runs with coalesced accesses 2019-05-18 17:40:35 +01:00
Peter Boyle
a584b16c4a Adding a non-blocking kernel launch 2019-05-18 17:39:54 +01:00
fionnoh
dbd7f3f0fc Added variables that were missing from wall source setup 2019-05-17 19:10:09 +01:00
fionnoh
d14512ee03 Exposed a coulomb/landau enum to the gauge fixing module 2019-05-17 19:01:52 +01:00
Peter Boyle
48b1c806ed Coulomb gauge added as an option 2019-05-17 17:36:32 +01:00
0a8b6724ef Merge pull request #209 from fionnoh/develop
Added gauge transform option to eigpack IO
2019-05-15 18:09:44 +02:00
fionnoh
ce102ac550 More logging, timing, and 4d/5d logic for eigpack gauge transforms 2019-05-15 14:31:25 +01:00
fionnoh
94accec311 Added gauge transform option to eigpack IO 2019-05-15 13:35:47 +01:00
gfilaci
1a82533d22 fix inner product with thrust reduction 2019-05-14 15:35:54 +01:00
gfilaci
e3c56fd9b3 CayleyZeroCounters before benchmark loop 2019-05-13 15:52:00 +01:00
gfilaci
955cc7790f MooeeInvDag offloaded to GPU 2019-05-13 14:25:29 +01:00
gfilaci
1179123ac2 MooeeInv offloaded to GPU 2019-05-13 12:37:12 +01:00
d8512b03f8 Merge pull request #195 from nils-asmussen/fix_GaugeProp_4d
MFermion::GaugeProp fix for 4d fields
2019-05-12 21:31:18 +02:00
d90cf9d022 Merge pull request #207 from fionnoh/develop
Weak Hamiltonian and contraction bug fixes
2019-05-12 21:30:20 +02:00
79e930ba12 Hadrons: Lepton Propagator for kl2, sign swap for antiperiodic boundary 2019-05-10 12:46:18 +01:00
gfilaci
22e35c9ddd M5Ddag offloaded to GPU 2019-05-10 12:23:39 +01:00
gfilaci
698b45e163 remove unused typedef 2019-05-09 11:19:39 +01:00
gfilaci
f1744b3f01 M5D offloaded to GPU 2019-05-09 11:17:55 +01:00
gfilaci
2b3c22f03d bandwidth dependent on grid default precision 2019-05-08 12:01:11 +01:00
gfilaci
8423a05940 duplicate CayleyFermion5D for gpu 2019-05-08 11:51:37 +01:00
fionnoh
2acd8ece65 Hadron WeakEye and A2ALoop bug fixes, and WWVVContraction bug fix 2019-05-08 10:57:36 +01:00
fionnoh
b638509c61 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-05-08 10:51:04 +01:00
edeb590818 DiskVector: fix of memory bug triggering segfault when the cache is accessed following a certain pattern 2019-05-03 17:09:47 +01:00
gfilaci
d9438627d9 M5D benchmark without vector copy overhead 2019-05-02 11:10:57 +01:00
gfilaci
b23305dbe2 fix M5D flop count 2019-05-02 11:08:21 +01:00
gfilaci
d3b5c02e2d measure M5D bandwidth and fix M5D flop count 2019-05-02 11:02:39 +01:00
gfilaci
8b6541fb60 Fix gpu MultRealPart and MaddRealPart bug 2019-05-02 10:58:17 +01:00
gfilaci
6da9aa9971 replace std::vector with Vector in benchmark 2019-05-02 10:56:22 +01:00
gfilaci
44e0360b97 replace std::vector with Vector 2019-05-02 10:55:36 +01:00
gfilaci
9003c4a07c allocator copy constructor (to be fixed) 2019-05-02 10:53:37 +01:00
gfilaci
b52fa38f8c seed initialisation of RNG5 2019-05-02 10:36:09 +01:00
gfilaci
3f1c4d8789 fix comment hash 2019-05-02 10:24:36 +01:00
4f0631615f A2A Lepton-Meson Field contraction 2019-04-30 12:04:59 +01:00
c2cd0e15d7 Merge remote-tracking branch 'upstream/develop' into feature/kl2QED
Conflicts:
	Grid/qcd/action/fermion/DomainWallFermion.h
	Grid/qcd/action/fermion/FermionOperator.h
2019-04-29 12:07:20 +01:00
Peter Boyle
60330e05a3 NVCC wacky compiler options frozen. Possibly Cuda 9.2 specific 2019-04-28 07:39:33 +01:00
Peter Boyle
f9b8c0cccf Vector changes for UVM 2019-04-28 07:38:57 +01:00
Peter Boyle
3cad67e569 Compile on tesseract 2019-04-28 07:38:09 +01:00
Peter Boyle
170ba4e619 Ensure different MPI ranks use different GPUs. The mapping works on Tesseract. 2019-04-28 07:32:30 +01:00
Peter Boyle
204a090497 Inner product is not working on GPU. Why? 2019-04-28 07:31:56 +01:00
Peter Boyle
3c717c47ef GPU no compile on Wilson Multigrid fixed 2019-04-28 07:31:19 +01:00
fionnoh
df41de4cb6 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-24 12:02:50 +01:00
Peter Boyle
6d0b985697 Verbose 2019-04-24 06:29:52 +01:00
Peter Boyle
94ebcf551c Iteratoin range fix 2019-04-24 06:28:14 +01:00
Peter Boyle
6fd4b0be91 Evolving HMC status 2019-04-23 21:54:45 +01:00
Peter Boyle
7894ea6263 Now have mixed precision solves in the 2f sector 2019-04-23 21:54:19 +01:00
Peter Boyle
73d4676997 Action and Deriv solvers allowed to differ 2019-04-23 21:53:44 +01:00
Peter Boyle
262a73c964 COmment improvement 2019-04-23 21:52:58 +01:00
Peter Boyle
5921b1d2b9 Layout/whitespace changes 2019-04-23 21:52:33 +01:00
Peter Boyle
6505efcb57 Set iteration count if guess is already good 2019-04-23 21:51:57 +01:00
Peter Boyle
b595f58e4c Allow HMC to acces matrix 2019-04-23 21:51:23 +01:00
Peter Boyle
b0de7ab7db Extra do nothing guesser 2019-04-23 21:50:45 +01:00
Peter Boyle
e1124d9572 Integrator verbosity updates 2019-04-23 21:50:15 +01:00
Peter Boyle
d416156c16 Mobius 2+1f sign off. 2019-04-19 07:57:08 +01:00
Peter Boyle
cd8d939a1a Integrator logging on by default 2019-04-19 07:54:17 +01:00
Peter Boyle
760cfe294c RHMC for mobius 2019-04-19 07:53:54 +01:00
Peter Boyle
13eaf21b5c HMC make file 2019-04-18 11:53:26 +01:00
Peter Boyle
1403ab231b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-18 11:06:02 +01:00
Peter Boyle
0368fbcde8 Update 2019-04-18 11:05:53 +01:00
Peter Boyle
2dd0ec7862 Merge pull request #186 from djm2131/feature/eofa-bug-fixes
Merge feature/eofa-bug-fixes into develop
2019-04-17 14:54:06 +01:00
Peter Boyle
f4241e59ba Merge pull request #200 from mmphys/feature/XcodeDoc
Updated documentation after Peter's review.
2019-04-17 14:51:19 +01:00
Peter Boyle
26b1d2df2d Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-17 12:08:06 +01:00
Peter Boyle
bc14e86812 Simple check 2019-04-17 12:07:42 +01:00
Peter Boyle
780a67844e Simple checks 2019-04-17 12:07:17 +01:00
Peter Boyle
8b7805200f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-17 12:05:09 +01:00
Peter Boyle
2871dec6c0 Monius HMC 2019-04-17 12:04:57 +01:00
Peter Boyle
abde12433e Changes locally 2019-04-17 12:03:20 +01:00
Peter Boyle
1f88ba4e39 Power method 2019-04-17 12:03:05 +01:00
Peter Boyle
ea5b3ed8a2 Momentum rescaling 2019-04-17 12:01:06 +01:00
Peter Boyle
a104115c7d Bounds checking 2019-04-17 11:56:46 +01:00
Peter Boyle
b899042d81 Bounds checking 2019-04-17 11:55:43 +01:00
Peter Boyle
3e712fe643 Scale momentum convention to CPS/UKQCD MD time 2019-04-17 11:54:17 +01:00
Peter Boyle
f4723e07c5 Add bounds checking 2019-04-17 11:52:23 +01:00
fionnoh
9ed2d02bb2 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-12 12:11:06 +01:00
50d016340c Merge pull request #190 from mmphys/feature/distil-checkin
Eigen::Tensor serialisation. Tested on single and double precision builds
2019-04-10 12:49:06 +01:00
Michael Marshall
f7b4fd0f69 Make sure Grid::Serializable can write Eigen Tensors to output streams. NB:
1) The Eigen package defines operator<< for Eigen tensors, but this format is different, hence Grid::Serializable::WriteMember
2) For simplification, the contents are written in memory order. I.e. Different results will be obtained depending on whether the tensor is row- or column-major
2019-04-06 15:40:23 +01:00
Michael Marshall
1f1aa92f14 Updated documentation after Peter's review.
1) Removed version numbers from Grid dependencies
2) Explained in a little more detail how to use Xcode to build Grid and Hadrons libraries
2019-04-06 13:42:39 +01:00
00963a7499 twist and boundary conditions for free propagator 2019-04-05 10:08:27 +01:00
Michael Marshall
82a77f9960 ... this time without the new Distillation modules ... 2019-04-03 23:02:26 +01:00
Michael Marshall
00b4139c16 Eigen tensor serialisation fixes after Antonin's review 2019-04-03 22:48:07 +01:00
fionnoh
3e9c757b3b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-03 16:56:48 +01:00
ecf736e6bf Merge pull request #193 from nils-asmussen/fix_GaugeFix_inputmod
Fixes #192 and adds gauge transformation matrix as output.
2019-04-02 18:28:50 +01:00
fionnoh
f22ab5e1bc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-04-02 11:29:59 +01:00
72f959c0b8 MGauge::GaugeFix use standard convention for fields 2019-03-29 16:51:21 +00:00
63001d3fa6 fix bug: MGauge::GaugeFix should not modify its input 2019-03-29 16:51:11 +00:00
b1d3d1f1a9 add gauge transformation matrix as output to module MGauge/GaugeFix 2019-03-29 16:51:00 +00:00
c2250fa124 MFermion::GaugeProp fix for 4d fields 2019-03-29 16:36:56 +00:00
84940fbdf0 Hadrons: Lepton Propagator for kl2 2019-03-28 10:15:09 +00:00
Peter Boyle
0a270b3e93 Merge pull request #191 from mmphys/GridXcode
Documentation for using Grid with Xcode on Mac OS
2019-03-27 22:53:35 +00:00
Michael Marshall
6536bed8a4 Documentation for using Grid with Xcode on Mac OS 2019-03-27 20:51:20 +00:00
79160011a1 endianness fix in resilient IO 2019-03-26 16:06:13 +00:00
Michael Marshall
47f5b1e2b5 Iterator added. Will wait for review comments before finalising. 2019-03-25 18:19:55 +00:00
Michael Marshall
a381d34f37 Fix build with Intel '17 compiler, i.e. workaround incorrect auto types for c++ style definitions.
E.g. assuming T::rank is an int, then objects defined like so:
    const auto rank{T::rank};
should also be int. Unfortunately, Intel '17 instead defines them to be std::initializer_list<int>, then proceeds to complain where these variables are used that they cannot be converted to int. NB: This was fixed under Intel '18
2019-03-23 09:24:15 +00:00
Michael Marshall
f0c2108acf Pushed paboyle's changes: Updates for clang happy 2019-03-22 12:59:14 +00:00
Peter Boyle
93a5fc083f Updates for clang happy 2019-03-22 11:39:22 +00:00
Michael Marshall
6d1de8ed2e Merge paboyle's no compile in single precision Intel 2019 fix 2019-03-21 16:48:08 +00:00
Peter Boyle
116dde31eb No compile in single precisoin Intel 2019 fix 2019-03-21 14:13:33 +00:00
Michael Marshall
12d8bf1ced Eigen::Tensor serialisation. Tested on single and double precision builds 2019-03-20 22:27:41 +00:00
d921a99b1a precision fix 2019-03-19 17:07:40 +00:00
fionnoh
9790926cc5 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-03-19 16:35:48 +00:00
Peter Boyle
a6adb85a1b Merge pull request #185 from mmphys/feature/trait-recommend
Recommendations for Traits classes
2019-03-18 12:22:42 +00:00
David Murphy
98b5b61fea Remove bundled Eigen stuff 2019-03-15 19:44:28 -04:00
David Murphy
6896c57d7c Fix typo so it matches develop 2019-03-15 19:34:36 -04:00
David Murphy
b3d480a978 Remove bundled source from my local repository 2019-03-15 19:17:03 -04:00
David Murphy
bb731c97d6 Slightly generalize interface to SchurRedBlackBase and derived solver classes so we can pass forecasted initial guesses in EOFA heatbath correctly 2019-03-15 19:10:56 -04:00
David Murphy
974003ae96 Fix sign convention of ExactOneFlavourRatioPseudoFermionAction::deriv() to match force conventions for Integrator class 2019-03-15 19:04:29 -04:00
David Murphy
93348775af Resolved merge conflict 2019-03-15 19:01:37 -04:00
91cffef883 Updates after review with Peter. 2019-03-07 14:30:35 +00:00
d3935ae7fc Hadrons: some updates in WeakMesonDecayKl2 2019-03-06 15:27:59 +00:00
0b426bf9f6 Merge remote-tracking branch 'upstream/develop' into feature/kl2QED
Conflicts:
	Hadrons/Modules.hpp
	Hadrons/modules.inc
2019-03-06 11:28:59 +00:00
Azusa Yamaguchi
acd25d0d01 Wilson clover multi grid for lime lattice 2019-03-04 11:30:15 +00:00
b7db99967a Recommendations for Traits classes 2019-02-28 20:06:59 +00:00
b930eda69d Merge branch 'develop' of github.com:paboyle/Grid into develop 2019-02-27 02:27:46 +00:00
7852181c2c Hadrons: uninitialised pointer fix (might have been harmless) 2019-02-27 02:27:40 +00:00
bdf87bc994 Hadrons: beware of the nasty uninitialised twists 2019-02-27 02:27:09 +00:00
Azusa Yamaguchi
136e7b2314 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-02-26 11:31:36 +00:00
Azusa Yamaguchi
1ea64b24fe Smearing test. Test on free field. 2019-02-26 11:31:17 +00:00
Azusa Yamaguchi
8f661f6c05 Smearing for quark observables 2019-02-26 11:31:00 +00:00
Azusa Yamaguchi
ae9e248c95 Smearing 2019-02-26 11:29:12 +00:00
fionnoh
351ffe73cd Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-02-25 14:06:09 +00:00
Peter Boyle
6160795a43 dt^2 term comments 2019-02-24 15:23:20 +00:00
Peter Boyle
ded2d5c3ab HMC directory 2019-02-24 15:22:57 +00:00
Peter Boyle
04255128ef HMC directory 2019-02-24 15:22:17 +00:00
Peter Boyle
a9a3248cb5 More precision in rect force test 2019-02-24 15:21:19 +00:00
Peter Boyle
7c461dc664 Bounds checking plan setup 2019-02-24 15:19:48 +00:00
Peter Boyle
15fddde9bf ConstEE override in Clover 2019-02-24 14:44:43 +00:00
Peter Boyle
048397d880 Default tau spacing should be longer c.f. Zbigniew Srocinsky thesis 2019-02-24 14:43:22 +00:00
Peter Boyle
196c9e4a4a Better conformable check with message 2019-02-24 14:42:52 +00:00
Peter Boyle
6a0823718e Make ConstEE except override in clover 2019-02-24 14:41:59 +00:00
Peter Boyle
22476cc5a3 Power method estimator of spectral range 2019-02-24 14:37:56 +00:00
cb16c96dc7 Hadrons: XML validator utility 2019-02-22 18:41:26 +00:00
e37614bde4 display relative norm during field IO norm check 2019-02-14 16:23:50 +00:00
042bad2ced possibility to set a build number 2019-02-14 13:58:17 +00:00
5bc0857412 IO norm check on relative norm 2019-02-10 22:12:47 +00:00
b540dc1cee Output field norm check during IO 2019-02-10 21:41:17 +00:00
7672bb6434 Hadrons: random vector utility module I/O 2019-02-10 21:25:25 +00:00
f80c548365 quieter initialisation 2019-02-10 20:47:35 +00:00
Peter Boyle
c8bcee6e97 Merge pull request #183 from nils-asmussen/fix-eigen-patch
fix patch command for eigen in bootstrap.sh
2019-02-06 14:36:36 +00:00
6e0d43aef5 fix patch command for eigen in bootstrap.sh 2019-02-06 11:25:51 +00:00
Peter Boyle
c1257208e2 Mres changes and gauge xform mat changes 2019-02-05 23:43:00 +00:00
74c38822ed Hadrons: 32 bit I/O directly in Lanczos module 2019-02-05 21:56:51 +00:00
318c64adc2 Hadrons: copyright update 2019-02-05 19:13:37 +00:00
d5b053f86f Hadrons: 1 propagator loop construction now using A2A vectors 2019-02-05 19:12:38 +00:00
c60e50e3cb Hadrons: copyright update 2019-02-05 18:55:24 +00:00
08d8b1d5fb Hadrons: 4-quark eye 3-pt contractions 2019-02-05 18:53:20 +00:00
90d6d28547 Hadrons: non-eye weak 3pt fix 2019-02-05 11:35:10 +00:00
9c31305b8d Hadrons: test cleaning 2019-02-04 21:26:25 +00:00
2eb584fdf0 Hadrons: 4-quark non-eye 3-pt contractions 2019-02-04 21:24:07 +00:00
6b46834af8 Hadrons: archiving unmaintained or exotic modules 2019-02-04 21:23:30 +00:00
3692c7f1ef Hadrons: type alias cleaning and global correlator class (need to propagate) 2019-02-04 21:21:51 +00:00
0cf94587cd array with all gammas for convenience 2019-02-04 21:20:16 +00:00
Fionn O hOgain
68868c83ff Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2019-01-31 14:46:56 +00:00
Peter Boyle
9b6ddb6e54 Adding a norm of a general field check, so that for things other than gauge configs there is an analogue of plaquette norm.
Improve argument checking in the BinaryIO.h, as there looks to be some corruption issue intermittently on tesseract jobs.
Not clear where the root bug is.
2019-01-16 22:35:58 +00:00
Vera Guelpers
447b772136 Merge remote-tracking branch 'upstream/develop' into feature/kl2QED 2019-01-07 15:09:18 +00:00
Peter Boyle
c5e081d69c Re-Merge branch 'develop' into feature/gpu-port
Pull in Regensburg MultiGrid pull request
2019-01-03 01:50:16 +00:00
Peter Boyle
535a6aaf05 Update todo list 2019-01-02 22:07:51 +00:00
Peter Boyle
91a7fe247b Merge branch 'DanielRichtmann-feature/wilsonmg' into develop 2019-01-02 14:40:31 +00:00
Peter Boyle
8a1be021d3 Merge branch 'feature/wilsonmg' of https://github.com/DanielRichtmann/Grid into DanielRichtmann-feature/wilsonmg 2019-01-02 14:39:59 +00:00
Peter Boyle
e73b909a48 Make tests running past nvcc. Different NVCC versions proving tricky to keep happy. This is 9.2 2019-01-02 12:05:30 +00:00
Peter Boyle
a4d9200293 Fixing AVX 512 instantiation error. Need to move to extern templates urgently. 2019-01-02 00:27:07 +00:00
Peter Boyle
350508bdb3 pugixml problem 2019-01-01 16:38:54 +00:00
Peter Boyle
38852737e4 No compile fix on clang 2019-01-01 15:55:13 +00:00
Peter Boyle
802404c78c Remove warnings under NVCC and move parallel_for to thread-loop 2019-01-01 15:08:09 +00:00
Peter Boyle
0e9b591c1c NVCC warning suppression 2019-01-01 15:07:47 +00:00
Peter Boyle
c43a2b599a GPU support 2019-01-01 15:07:29 +00:00
Peter Boyle
8c91e82ee8 GPU clean up, remove parallel_for. Split into accelerator_loop, thread_loop
cases, and collides with parallel_for in thrust
2019-01-01 15:06:46 +00:00
Peter Boyle
9d866d062a GPU support improvements 2019-01-01 15:05:03 +00:00
Peter Boyle
3a4e397e72 Deprecating JSON, too hard to support under NVCC 2019-01-01 15:04:33 +00:00
Peter Boyle
2b6cfe555f Disable JSON on NVCC. Maybe unsupport JSON full stop. XML and JSON is too many formats in my view. 2019-01-01 15:03:50 +00:00
Peter Boyle
7df58dd883 Photon syntax gave problems with NVCC 2019-01-01 15:03:29 +00:00
Peter Boyle
4bf86ae60a NVCC clean up 2019-01-01 15:02:50 +00:00
Peter Boyle
07ee87ff5a GPU happy. Still need to prevent hand kernels being callable under NVCC 2019-01-01 15:00:33 +00:00
Peter Boyle
0c2498fe2f Explicit instantiation needed for NVCC 2019-01-01 13:55:12 +00:00
Peter Boyle
ad2e65dad5 GPU related updates 2019-01-01 13:54:40 +00:00
Peter Boyle
715babeac8 GPU reductions first cut; use thrust, non-reproducible. Inclusive scan can fix this if desired.
Local reduction to LatticeComplex and then further reduction.
2019-01-01 13:53:37 +00:00
Peter Boyle
3eae9a9e3f update NVCC flags 2019-01-01 13:49:15 +00:00
Peter Boyle
186aad065f Roll forward Eigen in attempt to make CUDA happy 2019-01-01 13:48:32 +00:00
Peter Boyle
bf5685eb11 Update todo list 2019-01-01 13:48:06 +00:00
Peter Boyle
4a96c067ae Remove warnings from NVCC 2019-01-01 13:43:09 +00:00
Peter Boyle
ab063f33c0 Offload the linear combinations in CG 2019-01-01 13:42:13 +00:00
Peter Boyle
9efcc535bc Cleaner drop from CUDA mode around Eigen includes. Remains difficult to let Eigen compile under nvcc with version issues. 2019-01-01 13:39:10 +00:00
Peter Boyle
231b61d012 std::array by default 2019-01-01 13:37:35 +00:00
Peter Boyle
e898f4f0b0 Whitespace 2019-01-01 13:36:55 +00:00
Peter Boyle
d5db5f5242 Wrong dimension used in a temporary 2018-12-20 10:49:45 +00:00
Peter Boyle
2fcedb13dd Step size modification in HMC; ICC happy thread pragmas 2018-12-20 09:32:33 +00:00
Peter Boyle
35ed1defac Passes make check now single and double compile 2018-12-19 11:09:32 +00:00
Peter Boyle
4e95accf80 Namespace fix 2018-12-15 21:46:17 +00:00
fd66325321 pure QED test and copyright update 2018-12-14 17:39:11 +00:00
c637c0c48c James H.'s code for general size Wilson loops 2018-12-14 17:37:09 +00:00
c4b472176c Photon code fix 2018-12-14 17:36:38 +00:00
Peter Boyle
422764757d Updates in tests to make all of Grid compile 2018-12-14 16:55:54 +00:00
Vera Guelpers
943fa48ce4 Hadrons: Kl2 contraction using sequential propagators 2018-12-14 13:45:30 +00:00
Vera Guelpers
fa97a56fdd Hadrons: sequential Aslash insertion on propagator 2018-12-14 12:40:26 +00:00
856476a890 big cleanup of the Photon class + QED Coulomb gauge 2018-12-13 21:52:38 +00:00
Peter Boyle
afc462bd58 Bracketing issue in macro 2018-12-13 10:53:22 +00:00
Peter Boyle
b57a4d32aa Merge branch 'develop' into feature/gpu-port 2018-12-13 05:11:34 +00:00
c509bd3fe2 Merge branch 'feature/resilient-io' into develop 2018-12-01 12:57:43 +00:00
49b934310b resilient I/O fix 2018-11-27 20:17:09 +00:00
01e8cf5017 Merge branch 'develop' into feature/resilient-io 2018-11-27 19:09:59 +00:00
12f4499502 HDF5 serialiser fix 2018-11-27 19:09:50 +00:00
05aec72887 Hadrons: application parameter for resilient I/O 2018-11-27 18:46:43 +00:00
136d3802cb binary parallel IO can do read tests and eventually re-write in case of failure 2018-11-27 18:38:24 +00:00
a4c55406ed checksummed HDF5 IO 2018-11-27 17:43:19 +00:00
c7f33ca2a8 Revert "Hadrons: A2A vector write can fail and retry"
This reverts commit 10fc263675.
2018-11-27 17:27:26 +00:00
0e3035c51d Revert "optional non-fatal checksum fail in Lime lattice read (with error codes)"
This reverts commit bccfd4cbb3.
2018-11-27 17:27:20 +00:00
10fc263675 Hadrons: A2A vector write can fail and retry 2018-11-26 19:47:03 +00:00
bccfd4cbb3 optional non-fatal checksum fail in Lime lattice read (with error codes) 2018-11-26 19:45:51 +00:00
0b50d4a328 log time fix 2018-11-23 15:51:27 +00:00
fionnoh
b74940b3d4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-11-23 14:08:29 +00:00
e232257cb6 Hadrons: A2AAslashVector modul cleaning and renaming 2018-11-22 19:43:49 +00:00
09451b5e48 Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-11-22 15:45:24 +00:00
6364aa8acf Merge branch 'feature/contractor' into develop 2018-11-22 15:44:46 +00:00
b9e84ecab7 Hadrons: minor code cleaning 2018-11-22 15:44:30 +00:00
41032fef44 Optional RW mode for Hdf5Reader 2018-11-21 18:36:50 +00:00
d77bc88170 Optional support for faster CRC32C checksum through Intel IPP 2018-11-19 17:21:53 +00:00
494b3c9e57 Hadrons: contractor more IO fix 2018-11-19 16:26:53 +00:00
2ba19a9e07 Hadrons: contractor IO fix 2018-11-19 16:17:51 +00:00
5d7cc29eaf Hadrons: contractor token @traj@ for trajectory number in input file 2018-11-19 16:04:01 +00:00
f22a27d7f9 Hadrons: contractor trajectory loop and file output 2018-11-19 15:45:04 +00:00
Peter Boyle
33a0bbb17b Const correctness 2018-11-19 11:27:57 +00:00
f592ec8baa Hadrons: contractor performance fix 2018-11-16 20:59:49 +00:00
8b007b5c24 Hadrons: remove the use of OpenMP reductions 2018-11-16 20:00:29 +00:00
fionnoh
17b3f47b1e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-11-16 16:32:12 +00:00
9bb170576d Merge pull request #177 from guelpers/develop
Hadrons module to electrify a gauge
2018-11-14 16:04:09 +00:00
Vera Guelpers
a7e3977b75 Merge remote-tracking branch 'upstream/develop' into develop 2018-11-13 14:56:23 +00:00
Vera Guelpers
995f20e45d Hadrons: some renamings 2018-11-13 14:54:48 +00:00
Vera Guelpers
d058b4e681 Merge branch 'feature/seqA2A' into develop 2018-11-13 13:27:24 +00:00
8e0d2f3402 Hadrons: support for twisted boundary conditions 2018-11-12 17:16:18 +00:00
2ac57370f1 Hadrons: contractor translation average normalisation 2018-11-12 16:04:35 +00:00
344e832a4e Hadrons: contractor faster transpose and finer timings 2018-11-12 15:59:54 +00:00
cfe281f1a4 Hadrons: diskvectors measure hash performance in debug output 2018-11-12 15:59:11 +00:00
f5422c7334 Hadrons: more contractor instrumentation 2018-11-09 16:23:53 +00:00
68c76a410d Hadrons: more contractor improvements 2018-11-08 19:24:29 +00:00
69b6ba0a73 Hadrons: contractor fixes and improvements 2018-11-08 18:46:28 +00:00
65349b07a7 Hadrons: simpler A2A perf functions 2018-11-08 18:44:44 +00:00
7cd9914f0e Hadrons: automatically resize output in MKL A2A matrix kernels 2018-11-08 17:40:57 +00:00
Peter Boyle
f3f24b3017 Optional Twisted BC's added, in "DoubleStore" for WilsonImpl.
Untested but doesn't affect answers when twists are all zero. The zero is the default behaviour
for ImplParams.
2018-11-08 12:55:25 +00:00
Vera Guelpers
8ef4657805 Merge remote-tracking branch 'upstream/develop' into feature/seqA2A 2018-11-08 09:00:06 +00:00
Vera Guelpers
78c1086f8b Hadrons: sequential Aslash insertion and propagator on A2A vector 2018-11-08 08:58:09 +00:00
Peter Boyle
68c13045d6 Added a test for Felix and Michael to look at 2018-11-07 23:40:15 +00:00
Peter Boyle
e9b6f58fdc Allow shrinking machine in orthog direction for extract slice local 2018-11-07 23:39:18 +00:00
Peter Boyle
839605c45c Verbose reduce 2018-11-07 23:38:46 +00:00
1ff1422e07 Hadrons: contractor lighter output 2018-11-07 20:02:53 +00:00
32376f0437 Hadrons: contractor performances 2018-11-07 19:59:11 +00:00
0c6e581336 Hadrons: first stab at general contraction code, needs serious testing 2018-11-07 19:16:55 +00:00
Vera Guelpers
e0a79a5bbf Hadrons: PR#177: Electrify gauge: Single Precision fix 2018-11-07 15:01:22 +00:00
Vera Guelpers
4c016cc1a4 Merge remote-tracking branch 'upstream/develop' into develop 2018-11-07 14:03:12 +00:00
Peter Boyle
2205b1e63e Add CXX to grid-config 2018-11-07 13:32:46 +00:00
Peter Boyle
6f421c7a6f Block solver in the SchurRedBlack plus timing report cleaner 2018-11-07 12:26:56 +00:00
Peter Boyle
b62b9ac214 Patch to broken assertion 2018-11-06 22:18:17 +00:00
88d9922e4f Hadrons: fast A2A matrix contraction kernels 2018-11-06 19:49:09 +00:00
9734e3ee58 Hadrons: (somewhat) faster build 2018-11-06 19:47:41 +00:00
Peter Boyle
8c3a599148 Block solver test 2018-11-06 16:44:58 +00:00
Azusa Yamaguchi
4a47b11876 Block CG improvements to develop 2018-11-06 12:49:05 +00:00
Vera Guelpers
f1382cf81d Merge remote-tracking branch 'upstream/develop' into develop 2018-11-06 10:29:52 +00:00
Vera Guelpers
85699daef2 Hadrons: Module to electrify a gauge field 2018-11-06 10:27:18 +00:00
1651111d18 Hadrons: final, portable form of the contractor benchmark 2018-11-05 21:29:13 +00:00
1ed4ea344d Merge branch 'develop' into feature/contractor 2018-11-05 11:42:02 +00:00
8f514ae550 Hadrons: Lanczos 32bit IO 2018-11-05 11:41:10 +00:00
4a7415e83c Hadrons: contractor benchmark update 2018-10-23 21:00:54 +01:00
0ffcfea724 Hadrons: contractor benchmark 2018-10-23 17:08:16 +01:00
febe41cc1d Hadrons: improvement on PR #176 2018-10-23 12:48:15 +01:00
62173395b8 Merge pull request #176 from guelpers/develop
Hadrons: full volume noise source for A2A
2018-10-23 12:29:35 +01:00
b48611b80f Merge branch 'develop' into feature/contractor 2018-10-22 18:27:18 +01:00
6b559d68aa Hadrons: eigenpack converter can do test reads 2018-10-22 11:10:18 +01:00
1982cc58dd Hadrons: A2A vectors I/O filename fix 2018-10-21 01:20:05 +01:00
2e2e5ce596 SciDAC I/O print data checksums 2018-10-19 20:36:32 +01:00
7d84dca8e9 Merge branch 'develop' into feature/contractor 2018-10-18 23:46:58 +01:00
2d3916418e Hadrons: more precision fix 2018-10-18 23:45:13 +01:00
21304e2139 Hadrons: fix to allow single-prec build again 2018-10-18 19:58:50 +01:00
7b850eb48b Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-10-18 19:46:25 +01:00
a3ace57e01 Hadrons copyright update 2018-10-18 19:46:11 +01:00
b1c3cbe35e Hadrons: A2A vectors I/O 2018-10-18 19:44:58 +01:00
f31d6bfec2 Hadrons: contractor cleaning and better error check 2018-10-18 17:50:35 +01:00
a7cfa26901 Hadrons: reverse A2A matrix load for better DiskVector cache reuse 2018-10-18 17:50:16 +01:00
f333f3e575 Hadrons: DiskVector save-on-eviction and faster CRC32 for Eigen matrices 2018-10-18 17:48:25 +01:00
2b4e253473 Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-10-17 20:28:20 +01:00
0ba3d469c7 Benchmark IO in single and double precision 2018-10-17 20:27:34 +01:00
f709329d96 Hadrons: first version of a contractor utility 2018-10-17 20:26:48 +01:00
f05b25dae4 Hadrons: A2AMatrix load 2018-10-17 20:26:26 +01:00
3e1d268fa3 Hadrons: DiskVector optimisation 2018-10-17 20:25:32 +01:00
Vera Guelpers
109c74bed8 Hadrons: full volume noise source for A2A 2018-10-16 14:56:12 +01:00
3023287fd9 Hadrons: 3-index RO access to Eigen disk vector 2018-10-16 14:44:14 +01:00
b3d6805638 Merge branch 'feature/contractor' into develop 2018-10-16 11:29:37 +01:00
291bc2a1f0 IO benchmark on a list of directories 2018-10-15 17:25:08 +01:00
2f368c33fc Hadrons: copyright update 2018-10-15 15:51:45 +01:00
9592115341 Hadrons: NPR and gauge fixing linking fix 2018-10-15 15:49:42 +01:00
Peter Boyle
24c07694bc Mixed precision now supported in MADWF 2018-10-14 00:22:52 +01:00
Peter Boyle
f0229025e2 MADWF working across a range of actions 2018-10-13 19:55:03 +01:00
Peter Boyle
6de9a45a09 NPR first cut by Julia Kettle 2018-10-12 11:00:58 +01:00
Peter Boyle
03c3d495a2 First cut (non functional NPR code) developed by Julia Kettle 2018-10-12 10:59:33 +01:00
Peter Boyle
49f25e08e8 PauliVillars based 4D -> 5D reconstruction with Fourier Accelerated PV inverse
by Christoph. Differs from the one by Rudy in BFM since it vectorises the twisted
4D solves in pairs.
2018-10-11 12:35:32 +01:00
efc0c65056 Hadrons: DiskVector Eigen specialisation with binary I/O and sha256 correctness check 2018-10-08 19:02:00 +01:00
936eaac8e1 function to get the sha256 string 2018-10-08 19:00:50 +01:00
fe6a372f75 Hadrons: fixes and cleaning in the scalar SU(N) part 2018-10-08 15:14:08 +01:00
fionnoh
dac9f8622e Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-10-08 10:12:11 +01:00
148fc052bd Hadrons: Aslash field, tested 2018-10-05 21:04:10 +01:00
c073341a10 Hadrons: more cleaning 2018-10-05 19:50:41 +01:00
78299daaac Hadrons: code cleaning 2018-10-05 16:47:52 +01:00
866449c804 Hadrons: integration of Peter's A2Autils 2018-10-05 16:42:44 +01:00
d69a52079f Merge remote-tracking branch 'gh/feature/a2a-integration' into feature/aslashfield 2018-10-05 15:39:09 +01:00
9f4f8a14a3 Hadrons: code cleaning 2018-10-05 15:38:01 +01:00
f6593dc881 Hadrons: A2A block performance counter fix 2018-10-05 15:11:01 +01:00
Peter Boyle
b46d31d4b6 MKL enable on Eigen if Grid is configured to use MKL 2018-10-05 11:29:40 +01:00
58567fc650 Hadrons: big update abstracting the block meson field routine, tested & working, performance counters broken and code dirty 2018-10-04 20:01:49 +01:00
Peter Boyle
7c57cac670 Adding A2A utils class for containing kernels. 2018-10-04 18:57:41 +01:00
d0b21bf1ff Merge branch 'feature/eigenpack-convert' into develop 2018-10-04 18:26:45 +01:00
a1825d1f59 Hadrons: final fix for multiprec eigenpacks 2018-10-04 18:25:26 +01:00
5a3e83ff7b Hadrons: new layer in eigenpacks class hierarchy 2018-10-03 14:45:01 +01:00
52569d98d8 Hadrons: multiprec eigenpack I/O fix 2018-10-03 14:24:43 +01:00
b351103c29 Hadrons: eigenpack load module with 32bit I/O 2018-10-02 21:07:56 +01:00
118cca4681 Hadrons: linking fix 2018-10-02 20:08:49 +01:00
44de727cd2 Hadrons: eigenpack support for multiprecision I/O 2018-10-02 19:51:09 +01:00
888ebc3cf9 Hadrons: better name for the EP converter 2018-10-02 15:22:18 +01:00
6c031a1b81 Merge branch 'feature/eigenpack-convert' into develop 2018-10-02 14:57:30 +01:00
02aa4bd762 Hadrons: cleaner eigenpack convert log 2018-10-02 13:43:25 +01:00
9aafa8ee60 Hadrons: eigenpack converter generalised for RB/5d grids 2018-10-02 13:34:17 +01:00
430b98b354 fix previous commit 2018-10-02 13:12:46 +01:00
84189867ef Hadrons: eigenpack converter with RB grids (to be generalised) 2018-10-02 13:05:05 +01:00
4ab8cfbe2a Hadrons: more verbose eigenpack convert 2018-10-02 12:24:45 +01:00
aadd9f4468 Eigenpack converter, to be tested, HadronsXmlRun moved to Utilities directory 2018-10-02 00:02:34 +01:00
8fbb27ce13 Hadrons: less code duplication in eigenpack IO 2018-10-01 20:15:21 +01:00
21bba95909 Hadrons: eigenpack metadata is no ignored anymore when reading 2018-10-01 19:33:45 +01:00
6448fe7121 More flexible XML control in Lime files 2018-10-01 19:32:50 +01:00
2458a11d1d Hadrons: precision cast module 2018-09-29 18:00:08 +01:00
d0ca7c3fe6 Hadrons: big update for getGrid, grids are now created automatically 2018-09-29 17:55:19 +01:00
57f899d79c Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-09-29 15:50:59 +01:00
e881a0c157 Merge commit 'beed527ea37c90fd5e19b82d326eb8adc8eba5ff' into develop 2018-09-29 15:50:21 +01:00
f411657118 JSON update 2018-09-29 15:48:05 +01:00
Peter Boyle
7458c6174b Use operator() for indexing internal indices 2018-09-27 06:42:02 +01:00
Peter Boyle
21b269d0f9 Move the Grid.pdf out of a deep directory 2018-09-27 06:36:25 +01:00
Peter Boyle
083af92ac2 Update from chulwoo ; high level link for Grid.pdf in documentation 2018-09-27 06:30:40 +01:00
Peter Boyle
2c162577b5 HMC documentation 2018-09-25 23:28:17 +01:00
Peter Boyle
b1c4e96382 Updates to actions etc.. 2018-09-24 22:10:30 +01:00
Peter Boyle
a55c6f34f3 Updated docs 2018-09-24 15:44:35 +01:00
Peter Boyle
beed527ea3 Carletons chapter 2018-09-24 15:09:51 +01:00
eaa633cf69 Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-09-21 18:16:22 +01:00
c632455129 Hadrons: meson field IO fix 2018-09-21 18:16:01 +01:00
c012899ed5 Hadrons: big update after templating of get/createGrid 2018-09-21 18:15:33 +01:00
paboyle
8bab544c2f Updated manual pdf 2018-09-20 18:51:11 +01:00
paboyle
76fc06a5dc Updates with todo from Carleton 2018-09-20 18:50:11 +01:00
fionnoh
d9de8fd5c9 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-09-17 12:19:47 +01:00
fionnoh
7e3647246c Merge branch 'master' of https://github.com/paboyle/Grid into develop 2018-09-17 12:19:20 +01:00
4af6c7e7aa Hadrons: copyright update 2018-09-14 12:51:48 +01:00
f60fbcfc4d Hadrons: mixed precision CG, to be tested 2018-09-14 12:47:55 +01:00
464c81706e Hadrons: defaults Impls for different precisions 2018-09-14 12:46:43 +01:00
Peter Boyle
adbdc4e65b Half comms not working on GPU yet, so disable. 2018-09-11 05:15:22 +01:00
Peter Boyle
e4deea4b94 Weird bug appears with Vector<Vector<>>.
"fix" with std::vector<Vector<>>

Lies in the face table code. But think there is some latent problem.
Possibly in my allocator since it is caching, but could simplify or eliminate the caching
option and retest. One to look at later.
2018-09-11 04:36:57 +01:00
Peter Boyle
94d721a20b Comments on further topology discovery work 2018-09-11 04:20:04 +01:00
408130b808 Hadrons: header list fix 2018-09-10 17:38:54 +01:00
375edd1370 file forgotten in last commit 2018-09-10 17:37:29 +01:00
6d912f6c67 Hadrons: general guesser factory 2018-09-10 17:36:54 +01:00
6d1d28955e Guesser class is redundant, switching to LinearFunction 2018-09-10 17:35:54 +01:00
920b471761 Hadrons tests update 2018-09-10 15:32:13 +01:00
63c21767ba Hadrons: grids stored with hash of SIMD type (for mixed-precision setups) 2018-09-10 15:31:39 +01:00
7b6b712565 function to convert std::vector to string 2018-09-10 15:17:32 +01:00
35abd05ee9 mute Version.h cache creation 2018-09-10 15:16:59 +01:00
Peter Boyle
7bf82f5b37 Offload the face handling to GPU 2018-09-10 11:28:42 +01:00
Peter Boyle
f02c7ea534 Peer to peer on GPU's setup 2018-09-10 11:26:20 +01:00
Peter Boyle
bc503b60e6 Offloadable gather code 2018-09-10 11:21:25 +01:00
Peter Boyle
704ca162c1 Offloadable compression 2018-09-10 11:20:50 +01:00
Peter Boyle
b5329d8852 Protect against zero length loops giving a kernel call failure 2018-09-10 11:20:07 +01:00
Peter Boyle
f27b9347ff Better unquiesce MPI coverage 2018-09-10 11:19:39 +01:00
Peter Boyle
b4967f0231 Verbose and error trapping cleaner 2018-09-09 14:28:02 +01:00
Peter Boyle
6d0f1aabb1 Fix the multi-node path 2018-09-09 14:27:37 +01:00
Peter Boyle
f4bfeb835d Drop back to smaller Ls 2018-09-09 14:25:06 +01:00
Peter Boyle
394b7b6276 Verbose decrease 2018-09-09 14:24:46 +01:00
dd36e60f6a compilation fix for hypercube optimal communicator 2018-09-08 18:07:29 +01:00
cb6c548e21 Hadrons: code cleaning 2018-09-07 20:40:55 +01:00
02c4ccf621 Hadrons: diskvector debug message for writes 2018-09-07 20:33:49 +01:00
fd24588212 Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-09-07 20:25:11 +01:00
b800bb3ecb Hadrons: disk vector cache policy to last touch 2018-09-07 20:24:48 +01:00
f8abd0978b Hadrons copyright update 2018-09-07 20:10:07 +01:00
12c7c493bf Hadrons: disk-based container 2018-09-07 20:04:54 +01:00
paboyle
c7c9072313 Documentation 2018-09-06 16:01:42 +01:00
2bf3be5fae Hadrons: copyright and code cleaning 2018-09-04 18:25:10 +01:00
3a40e4fc69 Hadrons: scalar SU(N) 2-pt guard against negative momenta components 2018-09-04 18:24:07 +01:00
2e69e03f6f Hadrons: CosmHol configs IO module 2018-09-04 18:23:28 +01:00
a09f9bb528 Hadrons: code cleaning 2018-09-04 18:22:21 +01:00
f0e341d726 Hadrons: module list generator fix 2018-09-04 18:22:04 +01:00
6f09df0daf Hadrons: A2A matrix IO fix 2018-09-02 01:46:22 +01:00
26cee605b8 Hadrons: copyright update 2018-09-01 21:30:30 +01:00
b3fa18c229 copyright script never removes authorship 2018-09-01 21:29:58 +01:00
2940c9bcfd Hadrons: dedicated IO class for A2A matrices 2018-09-01 21:09:01 +01:00
0bb532f72b more explicit clean git tree message 2018-09-01 20:02:18 +01:00
fada2aa0f7 Hadrons: precision fix 2018-09-01 20:00:12 +01:00
c193e4e675 Aslash expression in Mathematica notebook 2018-09-01 19:59:58 +01:00
3ee682f676 more Version.h fine tuning 2018-09-01 19:58:16 +01:00
d85ec3bac2 build system minor fix 2018-09-01 19:54:21 +01:00
b52d8eb1e3 better Version.h implementation 2018-09-01 19:49:13 +01:00
ee630d2e8b Hadrons: smearing plaquette output 2018-09-01 17:38:32 +01:00
2f0af79869 Hadrons: scalar SU(N) NPR update 2018-09-01 17:36:35 +01:00
1b7fb79ec0 CI fix 2018-08-28 18:26:37 +01:00
2db1a4628c build system minor fix 2018-08-28 18:26:30 +01:00
6aa047d842 Hadrons module template fix 2018-08-28 17:17:00 +01:00
8779c32ae1 Merge branch 'feature/hadrons' into develop 2018-08-28 17:10:33 +01:00
c527dc3358 CI fix 2018-08-28 17:10:08 +01:00
6b42577b6b gitignore update 2018-08-28 16:58:37 +01:00
fb3596f968 Hadrons: precision fixes 2018-08-28 16:58:23 +01:00
f3a0158213 code cleaning 2018-08-28 16:56:07 +01:00
0250aa9347 file committed in error 2018-08-28 16:55:48 +01:00
3df6743396 more build system cleaning and patch for bad include in Eigen 2018-08-28 16:54:57 +01:00
fb7d021b9d Hadrons: moving Hadrons to root directory, build system improvements 2018-08-28 15:00:40 +01:00
5f206df775 Hadrons: meson field cache friendly cache copy 2018-08-15 17:29:44 +01:00
7727e81113 Hadrons: slight improvement on previous commit 2018-08-14 20:18:47 +01:00
c4115544a5 Hadrons: application option to save graph 2018-08-14 20:03:53 +01:00
08c47328ba Hadrons: meson field kernel performance for each block 2018-08-14 17:35:42 +01:00
09001aedca Hadrons: meson fields saved in single precision 2018-08-14 17:19:38 +01:00
2c67304716 Hadrons: meson field code cleaning 2018-08-14 17:00:05 +01:00
dc6d8686de Hadrons: meson field chunked HDF5 IO 2018-08-14 16:40:29 +01:00
cc2780bea3 Hadrons: meson field parallel IO 2018-08-14 14:55:13 +01:00
6e5a2b7922 fix previous commit 2018-08-14 14:07:54 +01:00
f4878d3a13 Hadrons: meson field threaded cache copy 2018-08-14 14:02:37 +01:00
89d2fac92e Hadrons: copyright update 2018-08-14 12:19:14 +01:00
f2d3e41cf2 Hadrons: meson field: HDF5 perf, gamma input and Eigen tensors allocated by Grid 2018-08-13 20:18:33 +01:00
3c27bb36d4 Hadrons: direct timer access 2018-08-13 20:17:45 +01:00
603d59f389 Hadrons: code cleaning 2018-08-13 20:17:24 +01:00
07a0ef3f95 Hadrons: global measurement time profile 2018-08-13 16:44:57 +01:00
503259f9c9 Hadrons: meson field HDF5 IO done and tested 2018-08-12 16:52:12 +01:00
5be6a51044 Hadrons: meson fields code cleaning and momentum phases 2018-08-11 15:13:43 +01:00
ac69f042b1 Hadrons: module RNG uniquely seeded with <run id> + <module name> + <trajectory> 2018-08-10 18:27:00 +01:00
133d5c2e34 Merge branch 'develop' into feature/hadrons 2018-08-10 16:36:40 +01:00
2a94244890 configure: --with-openssl option and LIME is now mandatory 2018-08-10 16:36:11 +01:00
a15a2dfd29 Merge branch 'develop' into feature/hadrons 2018-08-10 16:08:22 +01:00
093bb02633 Hadrons: execute message for time diluted noise 2018-08-10 16:07:48 +01:00
99a85116f8 Hadrons: module and VM instrumentation 2018-08-10 16:07:30 +01:00
paboyle
27cdb79063 Sha used to seed from a unique string 2018-08-10 15:11:01 +01:00
f4cbfd63ff Hadrons: more meson field cleaning, needs IO now 2018-08-09 18:39:58 +01:00
2b794b6aa7 Hadrons: module generating random lattices for testing purposes 2018-08-09 17:16:42 +01:00
d0244a059f Hadrons: cleaning cleaning... 2018-08-09 00:38:17 +01:00
dcdd891d7d Hadrons: precision fix 2018-08-09 00:13:53 +01:00
6d2df9de79 Hadrons: even more cleaning 2018-08-08 23:15:55 +01:00
41d4e37bae Hadrons: more cleaning 2018-08-08 19:04:44 +01:00
ee5c0cc9b6 Hadrons: code cleaning 2018-08-08 18:45:06 +01:00
0a4020eb4d Hadrons: copyright fix 2018-08-07 18:42:52 +01:00
b2de26589b Hadrons: code cleaning and copyright update 2018-08-07 18:40:48 +01:00
0677adb4dd Hadrons: overhaul of A2A for production 2018-08-07 18:27:59 +01:00
231cc95be6 Hadrons: eigenvalues precision fix 2018-08-07 18:27:19 +01:00
639f9cab82 Hadrons: schedule loading fix 2018-08-07 18:26:49 +01:00
4eac4e575e Hadrons: meson fields indentation fix 2018-08-06 12:42:25 +01:00
3f0f92cda6 Hadrons: first cleaning/integration of A2A/meson fields 2018-08-06 12:11:52 +01:00
d2650e89bd Hadrons: VM exception for object type (solves infinite loop in scheduler) 2018-08-06 12:11:00 +01:00
2962123cba Hadrons: diluted noise polish 2018-08-05 01:44:37 +01:00
830168ec37 Hadrons: first try at diluted noise class (tested) 2018-08-04 12:32:58 +01:00
584c921ca0 Eigen support fix (use of Grid as a library was broken) 2018-08-03 21:07:58 +01:00
81347b4d16 gitignore update 2018-08-03 19:58:52 +01:00
2cfa0b0e6b Merge pull request #174 from fionnoh/a2a_basics
A2A basics
2018-08-03 16:32:14 +01:00
fionnoh
fa5dee76b1 Included Peter's A2AMeson field and Eigen changes 2018-08-03 15:15:54 +01:00
fionnoh
8d1679c6b8 Merge branch 'feature/hadrons-a2a' of https://github.com/paboyle/Grid into a2a_basics 2018-08-03 15:12:24 +01:00
Peter Boyle
3791a38f7c Optimised the MesonField a bit more 2018-08-01 08:27:27 +01:00
Peter Boyle
142f7b0c86 Updated the A2A Meson Field module 2018-07-31 15:58:02 +01:00
fionnoh
891ad66eab Included changes to Hadrons RBPrecCG solver needed for subtraction of guess 2018-07-31 11:26:07 +01:00
Peter Boyle
60c43151c5 Merge branch 'feature/hadrons-a2a' of https://github.com/paboyle/Grid into feature/hadrons-a2a 2018-07-31 01:09:02 +01:00
paboyle
e036800261 Eigen fix 2018-07-31 01:08:42 +01:00
Peter Boyle
62900def36 Merge branch 'feature/hadrons-a2a' of https://github.com/paboyle/Grid into feature/hadrons-a2a 2018-07-31 00:36:26 +01:00
paboyle
e3a309a73f Eigen happiness 2018-07-31 00:35:17 +01:00
fionnoh
ad6c1c0c4e The basics of what is needed in Grid and Hadrons for the A2A class and module, with none of the contraction or MF code. 2018-07-30 18:40:50 +01:00
Peter Boyle
00b92a91b5 Optimising 2018-07-28 23:46:22 +01:00
paboyle
65533741f7 7 moms 2018-07-28 16:17:47 +01:00
Peter Boyle
dc0259fbda Merge pull request #173 from fionnoh/feature/hadrons-a2a
Changes to meson field benchmark. Now includes the gammas in the fina…
2018-07-27 23:03:56 +01:00
Peter Boyle
131a6785d4 Merge branch 'feature/hadrons-a2a' into feature/hadrons-a2a 2018-07-27 23:03:42 +01:00
paboyle
44f4f5c8e2 Momentum loop 2018-07-27 23:00:16 +01:00
fionnoh
2679df034f Changes to meson field benchmark. Now includes the gammas in the final part of the naive method, both methods compute
lhs^dag*Gamma*rhs (previously Gamma*lhs^dag*rhs), and checks results.
2018-07-27 18:31:10 +01:00
bf71162b97 Hadrons: backtrace on abort 2018-07-26 19:20:12 +01:00
299e828d83 Merge branch 'develop' into feature/hadrons 2018-07-26 16:49:49 +01:00
ef5452cddf Hadrons: smarter memory profiler 2018-07-26 16:47:45 +01:00
80de748737 Hadrons: new exceptions which can save a integer 2018-07-26 16:47:25 +01:00
paboyle
71e1006ba8 Updated meson field benchmark for dirac structures 2018-07-26 09:09:29 +01:00
00f31ae83f Merge pull request #163 from goracle/unstaged
Add printing of whether there are unstaged changes in the git hash print
2018-07-25 19:00:00 +00:00
cce339deaf Merge pull request #172 from fionnoh/feature/hadrons
feature/hadrons -> feature/hadrons-a2a
2018-07-25 17:20:19 +00:00
fionnoh
24128ff109 Changes needed for MF benchmark to work with comms correctly 2018-07-23 15:51:37 +01:00
Peter Boyle
da17a015c7 Pack the stencil smaller for 128 bit access 2018-07-23 06:12:45 -04:00
Peter Boyle
1fd08c21ac make simd width configure time option for GPU 2018-07-23 06:10:55 -04:00
Peter Boyle
28db0631ff Hack to force 128bit accesses 2018-07-23 06:10:27 -04:00
Peter Boyle
b35401b86b Fix CUDA_ARCH. Need to simplify. See when new eigen release happens 2018-07-23 06:09:33 -04:00
Peter Boyle
a0714de8ec Define vector length for GPU 2018-07-23 06:09:05 -04:00
Peter Boyle
21a1710b43 Verbose vector length 2018-07-23 06:08:39 -04:00
fionnoh
34e9d3f0ca Moved the creation and resizing of the v and w high modes from the A2A class to the A2A module and made them an output of the module. This means that they have to be inputs of the contration modules and they will freed from memory if they are no longer needed. 2018-07-22 14:40:31 +01:00
fionnoh
c995788259 Added ImportUnphysicalFermion and included appropriate logic for 5d w vectors in A2A code 2018-07-21 00:08:11 +01:00
fionnoh
94c7198001 Added ZFIMPL to A2AMeson contraction 2018-07-20 23:08:22 +01:00
fionnoh
04d86fe9f3 Removed overly verbose print statement 2018-07-20 21:38:19 +01:00
fionnoh
b78074b6a0 Removed a Dminus from high mode v and removed duplication pf D_oo code 2018-07-20 16:55:24 +01:00
fionnoh
7dfd3cdae8 Inclusion of ExportPhysicalFermionSource that fixes a bug in the low mode w vectors 2018-07-20 15:45:43 +01:00
fionnoh
cecee1ef2c Merge branch 'develop' of github.com:paboyle/Grid into feature/hadrons 2018-07-20 13:37:50 +01:00
fionnoh
355d4b58be Merge branch 'feature/hadrons' of github.com:fionnoh/Grid into feature/hadrons 2018-07-19 16:07:54 +01:00
fionnoh
2c54a536f3 Moved the meson field inner product to its own header file 2018-07-19 15:56:52 +01:00
fionnoh
d868a45120 Cleaned up some stuff that was erroneously included in a previous "trash" commit. Leaving in the mySliceInnerProdct function for now as it speeds up mesonfield creation quite a lot for 24^3 tests 2018-07-16 16:19:59 +01:00
fionnoh
9deae8c962 A2A meson field contraction code 2018-07-16 14:18:45 +01:00
Peter Boyle
b2b5137d28 Finally starting to get decent performance on Volta 2018-07-13 12:06:18 -04:00
fionnoh
db86cdd7bd Possible trash commit 2018-07-10 13:30:45 +01:00
paboyle
ec9939c1ba Test for faster implementation of meson field inner loop
This should be possible to cache block at outer levels, global sum across nodes not performed
and deferred to caller to block them all into a big all reduce.
Nc=3 and Fermion is hard coded in an ugly way. We might think about benchmarking whether
a product without the conjugate should be made available by Grid.

It is not clear whether the explicit unroll, or the performing of conjugate on left once
was the real source of the speed up.

Gives 70-80 GF/s on my laptop (single) half that double, and 70GB/s to cache.

This is competitive with dslash and a reasonable stopping point for the optimisation. If necessary we can revisit.
2018-07-10 12:38:51 +01:00
Peter Boyle
2cc07450f4 Fastest option for the dslash 2018-07-05 09:57:55 -04:00
Peter Boyle
c0e8bc9da9 Current version gets 250 - 320 GF/s on Volta on the target 12^4 volume. 2018-07-05 07:10:25 -04:00
Peter Boyle
b1265ae867 Prettify code 2018-07-05 07:08:06 -04:00
Peter Boyle
32bb85ea4c Standard extractLane is fast 2018-07-05 07:07:30 -04:00
Peter Boyle
ca0607b6ef Clearer kernel call meaning 2018-07-05 07:06:15 -04:00
Peter Boyle
19b527e83f Better extract merge for GPU. Let the SIMD header files define the pointer type for
access. GPU redirects through builtin float2, double2 for complex
2018-07-05 07:05:13 -04:00
Peter Boyle
4730d4692a Fast lane extract, saturates bandwidth on Volta for SU3 benchmarks 2018-07-05 07:03:33 -04:00
Peter Boyle
1bb456c0c5 Minor GPU vector width change 2018-07-05 07:02:04 -04:00
Peter Boyle
4b04ae3611 Printing improvement 2018-07-05 06:59:38 -04:00
Peter Boyle
2f776d51c6 Gpu specific benchmark saturates memory. Can enhance Grid to do this for expressions,
but a bitof (known) work.
2018-07-05 06:58:37 -04:00
fionnoh
f74617c124 Added ZFIMPL to meson field module 2018-07-03 14:04:53 +01:00
fionnoh
8c6a3921ed Merge remote-tracking branch 'upstream/feature/hadrons' into feature/hadrons 2018-07-03 11:35:14 +01:00
a8a15dd9d0 Hadrons: code cleaning 2018-07-02 17:52:39 +01:00
3ce68a751a Hadrons: stout smearing module 2018-07-02 17:52:04 +01:00
fionnoh
daa0977d01 Included a print statement that indicates that the guess is being subtracted from the solve. 2018-06-28 16:34:56 +01:00
fionnoh
a2929f4384 Removed A2A contraction module and replaced it with the beginnings of a meson field module 2018-06-28 16:17:26 +01:00
fionnoh
7fe3974c0a Included eigenPacks and action as references, not inputs, of A2A module. They now now longer need to be parameters in the meson field modules. 2018-06-28 16:14:49 +01:00
fionnoh
f7e86f81a0 Changes A2A class to make use of the new Solver class 2018-06-28 16:14:16 +01:00
fionnoh
fecec803d9 Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/hadrons 2018-06-28 16:13:43 +01:00
fionnoh
8fe9a13cdd Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/hadrons 2018-06-28 16:13:07 +01:00
paboyle
3a50afe7e7 GPU dslash updates 2018-06-27 22:32:21 +01:00
paboyle
f8e880b445 Loop for s and xyzt offlow 2018-06-27 21:49:57 +01:00
paboyle
3e947527cb Move looping over "s" and "site" into kernels for GPU optimisatoin 2018-06-27 21:29:43 +01:00
paboyle
31f65beac8 Move site and Ls looping into the kernels 2018-06-27 21:28:48 +01:00
paboyle
38e2a32ac9 Single SIMD lane operations for CUDA 2018-06-27 21:28:06 +01:00
paboyle
efa84ca50a Keep Cuda 9.1 happy 2018-06-27 21:27:32 +01:00
paboyle
5e96d6d04c Keep CUDA happy 2018-06-27 21:27:11 +01:00
paboyle
df30bdc599 CUDA happy 2018-06-27 21:26:49 +01:00
paboyle
7f45222924 Diagnostics on memory alloc fail 2018-06-27 21:26:20 +01:00
paboyle
dd891f5e3b Use NVCC to suppress device Eigen 2018-06-27 21:25:17 +01:00
d2c42e6f42 Hadrons: scaled DWF action 2018-06-26 14:59:33 +01:00
Daniel Richtmann
2881b3e8e5 WilsonMG: Remove unnecessary static assertions 2018-06-26 14:42:30 +02:00
049cc518f4 Hadrons: introduction message 2 2018-06-25 19:08:39 +01:00
2e1c66897f Hadrons: introduction message 2018-06-25 19:08:22 +01:00
adcef36189 Hadrons: Möbius DWF action 2018-06-25 15:58:35 +01:00
fionnoh
2f121c41c9 Commiting reation of meson field code before a merge with the upstream branch feature/hadrons 2018-06-25 12:20:46 +01:00
e0ed7e300f Hadrons: spurious Dminus removed 2018-06-22 16:33:43 +02:00
485207901b Merge branch 'develop' into feature/hadrons 2018-06-22 16:15:32 +02:00
c760f0a4c3 Hadrons: remove make_5D/4D functions and FreeProp fix 2018-06-22 16:12:46 +02:00
c84eeedec3 Hadrons: GaugeProp module for z-Wilson actions 2018-06-22 15:53:22 +02:00
fionnoh
1ac3526f33 Small changes to the A2A header and module 2018-06-22 12:29:42 +01:00
fionnoh
0de090ee74 Temporarily added in the contraction code that produced the working 2-pt function. This is commited for reference only and will be removed in the next push. 2018-06-22 12:28:41 +01:00
91405de3f7 Hadrons: new solver exposing fermion matrix and generic source/solve import/export 2018-06-22 12:14:37 +02:00
fionnoh
8fccda301a Fixed a bug where the guess was always subtracted after the solve and included appropriate weights for the sources in the one case we're looking at now. More work needs to be done to make the 5d/4d source logic less brittle. 2018-06-21 16:36:59 +01:00
fionnoh
7a0abfac89 Restructured the class that computes and returns the A2A vectors. 2018-06-21 16:36:06 +01:00
fionnoh
ae37fda699 A more elegant way to subtract guesses from solve and a bool check before verifying residual 2018-06-20 16:07:40 +01:00
fionnoh
b5fc5e2030 All to all module update that hit a promising milestone. Commiting for a reference for future changes. 2018-06-20 10:59:07 +01:00
Daniel Richtmann
cc5d025ea4 WilsonMG: Adapt staggered GMRES/MR tests to "new" constructor 2018-06-18 16:20:20 +02:00
paboyle
6c97a6a071 Coalescing version of the kernel 2018-06-13 20:52:29 +01:00
paboyle
73bb2d5128 Ugly hack to speed up compile on GPU; we don't use the hand kernels on GPU anyway so why compile 2018-06-13 20:35:28 +01:00
paboyle
b710fec6ea Gpu code first version of specialised kernel 2018-06-13 20:34:39 +01:00
paboyle
b2a8cd60f5 Doubled gauge field is useful 2018-06-13 20:27:47 +01:00
paboyle
867ee364ab Explicit instantiation hooks 2018-06-13 20:27:12 +01:00
paboyle
25becc9324 GPU tweaks for benchmarking; really necessary? 2018-06-13 20:26:07 +01:00
paboyle
94d1ae4c82 Some prep work for GPU shared memory. Need to be careful, as will try GPU direct
RDMA and inter-GPU memory sharing on SUmmit later
2018-06-13 20:24:06 +01:00
paboyle
2075b177ef CUDA_ARCH more carefule treatment 2018-06-13 20:22:34 +01:00
paboyle
847c761ccc Move sfw IEEE fp16 into central location 2018-06-13 20:22:01 +01:00
paboyle
8287ed8383 New GPU vector targets 2018-06-13 20:21:35 +01:00
paboyle
e6be7416f4 Use managed memory 2018-06-13 20:14:00 +01:00
paboyle
26863b6d95 User Managed memory 2018-06-13 20:13:42 +01:00
paboyle
ebd730bd54 Adding 2D loops 2018-06-13 20:13:01 +01:00
paboyle
066be31a3b Optional GPU target SIMD types; work in progress and trying experiments 2018-06-13 20:07:55 +01:00
paboyle
7a4c142955 Add GPU specific simd targets 2018-06-13 19:55:30 +01:00
Daniel Richtmann
ddcb53bce2 Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-06-13 09:50:37 +02:00
Daniel Richtmann
d1c80e1d46 WilsonMG: Correct years in copyright line 2018-06-13 09:44:09 +02:00
Daniel Richtmann
c73cc7d354 WilsonMG: Add tests with MG preconditioner running single precision, outer solver running in double 2018-06-12 16:10:48 +02:00
Daniel Richtmann
49fdc324a0 WilsonMG: Make MG correctness checks abort on failing tests 2018-06-12 16:10:48 +02:00
Daniel Richtmann
f32714a2d1 WilsonMG: Make running MG correctness checks optional via commandline 2018-06-12 16:10:48 +02:00
Daniel Richtmann
73a955be20 WilsonMG: Move tests for Wilson & WilsonClover into separate files 2018-06-12 16:10:48 +02:00
Daniel Richtmann
66b7a0f871 WilsonMG: Move multigrid class to separate file 2018-06-12 16:10:48 +02:00
Daniel Richtmann
2ab9d4bc56 WilsonMG: Fix random behavior in GMRES
From time to time I saw random since the basis vectors were not initialized
properly.
2018-06-12 15:01:31 +02:00
Daniel Richtmann
4f41cd114d WilsonMG: Add a mixed precision version of FGMRES
This version does everything in double prec but accepts a preconditioner working
in single precision.
2018-06-12 15:01:31 +02:00
Daniel Richtmann
11c4f5e32c WilsonMG: Provide command line switch for reading in input xml + move default params to constructor of MultiGridParams 2018-06-12 15:01:31 +02:00
Daniel Richtmann
e9b9550298 WilsonMG: Fix incompatibility with single prec MG in construction of simd layout on coarser grids 2018-06-12 15:01:31 +02:00
Daniel Richtmann
7564fedf68 WilsonMG: Set subspace to zero to avoid random behavior 2018-06-12 15:01:31 +02:00
8db0ef9736 Merge pull request #168 from jch1g10/feature/qed-fvol
Feature/qed fvol
2018-06-08 20:09:06 +02:00
Guido Cossu
95d4b46446 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-06-08 11:30:29 +01:00
paboyle
251b904a28 Merge branch 'release/ISC-freeze-2' 2018-06-04 21:09:48 +01:00
paboyle
5dfd216a34 Better thread safety 2018-06-04 21:08:44 +01:00
paboyle
5a112feac3 Merge branch 'release/ISC-freeze-1' 2018-06-04 18:49:40 +01:00
paboyle
c2e8d0aa88 Solve g++ problem on the lanczos test 2018-06-04 18:34:15 +01:00
James Harrison
0fe5aeffbb Merge branch 'feature/hadrons' into feature/qed-fvol 2018-06-04 16:59:43 +01:00
James Harrison
7fbc469046 Merge branch 'develop' into feature/hadrons 2018-06-04 16:58:30 +01:00
paboyle
bf96a4bdbf Merge branch 'master' into develop 2018-06-04 14:03:11 +01:00
paboyle
84685c9bc3 Overflow fix 2018-06-04 13:42:07 +01:00
fionnoh
a8d4156997 Added a Hadrons module that computes the all-to-all v and w vectors 2018-05-31 17:18:58 +01:00
fionnoh
c18074869b Changes to Hadrons SchurRB solver to allow for a subtract_guess boolean to be passed 2018-05-31 17:17:16 +01:00
fionnoh
f4c6d39238 CHanges made to SchurRB solvers to allow for the subtraction of a guess after solve 2018-05-31 17:16:20 +01:00
200d35b38a Merge branch 'develop' into feature/hadrons 2018-05-28 11:52:47 +02:00
eb52e84d09 Merge branch 'feature/hadrons' of github.com:paboyle/Grid into feature/hadrons 2018-05-28 11:50:27 +02:00
72abc34764 Merge pull request #166 from guelpers/feature/hadrons
Feature/hadrons
2018-05-28 11:49:46 +02:00
e3164d4c7b Hadrons: env function to get volume in double 2018-05-28 11:39:17 +02:00
James Harrison
f5db386c55 Change MODULE_REGISTER_NS -> MODULE_REGISTER in UnitEM, ScalarVP and VPCounterTerms 2018-05-22 16:16:21 +01:00
James Harrison
294ee70a7a Merge branch 'feature/hadrons' into feature/qed-fvol
# Conflicts:
#	extras/Hadrons/modules.inc
#	lib/qcd/action/gauge/Photon.h
2018-05-21 18:02:41 +01:00
Azusa Yamaguchi
013ea4e8d1 Merge branch 'feature/staggered-comms-compute' into develop 2018-05-21 13:11:56 +01:00
Azusa Yamaguchi
7fbbb31a50 Merge branch 'develop' into feature/staggered-comms-compute
Conflicts:
	lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
2018-05-21 13:07:29 +01:00
Azusa Yamaguchi
0e127b1fc7 New file single prec test 2018-05-21 12:57:13 +01:00
Azusa Yamaguchi
68c028b0a6 Comment 2018-05-21 12:54:25 +01:00
255d4992e1 Hadrons: stochastic scalar SU(N) free field fix 2018-05-18 20:49:55 +01:00
a0d399e5ce Hadrons: yet other attempts at EMT NPR 2018-05-18 20:49:26 +01:00
fd3b2e945a Hadrons: don't right result with empty stem 2018-05-18 20:48:24 +01:00
Daniel Richtmann
6c27c72585 WilsonMG: Provide more sensible default values for MG parameters 2018-05-16 17:26:09 +02:00
Daniel Richtmann
9c003d2d72 WilsonMG: Base wilson mg preconditioner entirely on existing infrastructure 2018-05-16 17:26:09 +02:00
Daniel Richtmann
4b8710970c WilsonMG: Switch to Galerkin coarsening in CoarsenedMatrix 2018-05-16 17:26:09 +02:00
Daniel Richtmann
68d686ec38 WilsonMG: Add functionality for applying G5 on coarse grids 2018-05-16 16:17:14 +02:00
Daniel Richtmann
c48b69ca81 WilsonMG: Implement Mdir & Mdiag in CoarsenedMatrix 2018-05-16 16:08:05 +02:00
Daniel Richtmann
df8c208f5c WilsonMG: Revert CoarsenedMatrix.h and Lattice_transfer.h back to state of develop branch 2018-05-16 16:02:54 +02:00
Daniel Richtmann
61812ab7f1 Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-05-15 14:57:18 +02:00
b999984501 Merge branch 'develop' into feature/hadrons 2018-05-15 13:53:57 +01:00
Guido Cossu
7836cc2d74 No checksum output on log for scidac 2018-05-15 10:10:08 +01:00
Peter Boyle
eb7d34a4cc GPU version 2018-05-14 19:41:47 -04:00
Peter Boyle
aab27a655a Start of GPU kernels 2018-05-14 19:41:17 -04:00
Peter Boyle
93280bae85 Gpu option 2018-05-14 19:40:58 -04:00
Peter Boyle
c5f93abcd7 GPU clean up 2018-05-14 19:40:33 -04:00
Peter Boyle
d5deef782d Useful debug comments 2018-05-14 19:39:52 -04:00
Peter Boyle
5f50473c0d Clean up 2018-05-14 19:39:11 -04:00
a61e0df54b Travis fix for Lime 2018-05-14 19:56:12 +01:00
9d835afa35 Attempt at solving the FP exception in the QED code 2018-05-14 19:05:54 +01:00
5e3be47117 Hadrons: scalar SU(N) various fixes 2018-05-14 18:58:39 +01:00
Peter Boyle
13f50406e3 Suppress print statement 2018-05-12 18:00:00 -04:00
Peter Boyle
09cd46d337 Lane by Lane operation 2018-05-12 17:59:35 -04:00
Peter Boyle
d3f51065c2 Give command line control of blocks/threads split 2018-05-12 17:58:56 -04:00
Peter Boyle
925ac4173d Thread count control for warp scheduler thingy doodaa thing 2018-05-12 17:58:22 -04:00
Peter Boyle
eb921041d0 Perf count control 2018-05-12 17:57:32 -04:00
48de706dd5 Merge branch 'develop' into feature/hadrons 2018-05-11 18:06:40 +01:00
f871fb0c6d check file is opened correctly in the Lime reader 2018-05-11 18:06:28 +01:00
93771f3099 Hadrons: scalar SU(N) stochastic free field 2018-05-10 22:29:48 +01:00
8cb205725b Merge branch 'develop' into feature/hadrons 2018-05-09 23:56:35 +01:00
9ad580d82f Hadrons: format fix 2018-05-07 21:38:15 +01:00
899f961d0d Hadrons: eigenvalue metadata saved with 16 significant digits 2018-05-07 21:37:03 +01:00
54d789204f more general implementation of the precision interface for serialisers 2018-05-07 21:17:46 +01:00
25828746f3 XML precision scientific with 16 digits by default 2018-05-07 21:04:31 +01:00
f362c00739 Hadrons: better handling of automatic directory creation 2018-05-07 19:43:40 +01:00
Guido Cossu
25d1cadd3b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-05-07 18:55:09 +01:00
Guido Cossu
c24d53bbd1 Further debug of RNG I/O 2018-05-07 18:55:05 +01:00
2017e4e3b4 Hadrons: more verbose directory creation error 2018-05-07 18:12:22 +01:00
27a4d4c951 Hadrons: multi-file eigenpack in separate directory 2018-05-07 17:52:54 +01:00
2f92721249 Merge branch 'develop' into feature/hadrons 2018-05-07 17:26:47 +01:00
3c7a4106ed Trap for deadly empty comm thread option 2018-05-07 17:26:39 +01:00
3252059daf Hadrons: multi-file support for eigenpacks 2018-05-07 17:25:36 +01:00
paboyle
6eed167f0c Merge branch 'release/0.8.1' 2018-05-04 17:34:11 +01:00
paboyle
4ad0df6fde Bump volume for Gerardo 2018-05-04 17:33:23 +01:00
661381e881 Merge branch 'develop' into feature/hadrons 2018-05-04 14:52:17 +01:00
Peter Boyle
68a5079f33 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-05-04 14:13:54 +01:00
Peter Boyle
8634e19f1b Update 2018-05-04 14:13:35 +01:00
Azusa Yamaguchi
9ada378e38 Add timing 2018-05-04 10:58:01 +01:00
Vera Guelpers
9d9692d439 Fix double vs float in boundary phases 2018-05-03 16:40:16 +01:00
0659ae4014 Merge branch 'develop' into feature/hadrons 2018-05-03 16:20:22 +01:00
bfbf2f1fa0 no threaded stencil benchmark if OpenMP is not supported 2018-05-03 16:20:01 +01:00
dd6b796a01 Hadrons: scalar SU(N) volume factor fix 2018-05-03 16:19:17 +01:00
Vera Guelpers
52a856b4a8 FreeProp module for Hadrons 2018-05-03 12:33:20 +01:00
Vera Guelpers
04190ee7f3 5D free propagator for DWF and boundary conditions for free propagators 2018-05-03 12:31:36 +01:00
Azusa Yamaguchi
587bfcc0f4 Add Timing 2018-05-03 12:10:31 +01:00
Vera Guelpers
2700992ef5 Merge remote-tracking branch 'upstream/feature/hadrons' into feature/hadrons 2018-05-03 10:01:52 +01:00
Peter Boyle
8c658de179 Compressor speed up (a little); streaming stores 2018-05-02 17:52:16 +01:00
Guido Cossu
ba37d51ee9 Debugging the RNG IO 2018-05-02 15:32:06 +01:00
Azusa Yamaguchi
4f4181c54a Merge branch 'feature/staggered-comms-compute' of https://github.com/paboyle/Grid into feature/staggered-comms-compute 2018-05-02 14:59:13 +01:00
Guido Cossu
4d4ac2517b Adding Scalar field theory example for Scidac format 2018-05-02 14:36:32 +01:00
Guido Cossu
e568c24d1d Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-05-02 14:29:25 +01:00
Guido Cossu
b458326744 Checkpointer module update 2018-05-02 14:29:22 +01:00
Guido Cossu
6e7d5e2243 HMC: added Scidac checkpointer and support for metadata 2018-05-02 14:28:59 +01:00
Azusa Yamaguchi
b35169f1dd MultiShift for Staggered 2018-05-02 14:22:37 +01:00
Azusa Yamaguchi
441ad7498d add Iterative counter 2018-05-02 14:21:30 +01:00
Peter Boyle
6f6c5c549a Split off gparity 2018-05-02 14:11:23 +01:00
Peter Boyle
1584e17b54 Revert to fast versoin 2018-05-02 14:10:55 +01:00
Peter Boyle
12982a4455 Hypercube optimisation 2018-05-02 14:10:21 +01:00
Peter Boyle
172f412102 shmget reintroduce 2018-05-02 14:07:41 +01:00
Peter Boyle
a64497265d TIming 2018-05-02 14:07:28 +01:00
ca639c195f Merge branch 'develop' into feature/hadrons 2018-05-01 14:07:32 +01:00
edc28dcfbf Hadrons: scalar SU(N) 2-pt fix 2018-05-01 14:02:31 +01:00
Peter Boyle
c45f24a1b5 Improvements for tesseract 2018-04-30 21:50:00 +01:00
Dr Peter Boyle
aaf37ee4d7 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-27 11:45:13 +01:00
Dr Peter Boyle
1dddd17e3c Benchmark improvements from tesseract 2018-04-27 11:44:46 +01:00
paboyle
661f1d3e8e Merge branch 'release/0.8.0' into develop 2018-04-27 11:22:33 +01:00
paboyle
edcf9b9293 Merge branch 'release/0.8.0' 2018-04-27 11:13:19 +01:00
paboyle
fe6860b4dd Update with LIME library guard 2018-04-27 08:57:34 +01:00
paboyle
d6406b13e1 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-27 07:52:56 +01:00
paboyle
e369d7306d Rename 2018-04-27 07:51:44 +01:00
paboyle
9f8d63e104 Roll over version 2018-04-27 07:51:12 +01:00
paboyle
9b0240d101 Hot start test 2018-04-27 07:50:51 +01:00
paboyle
b27f0e5a53 Control over IO 2018-04-27 07:50:15 +01:00
paboyle
75e4483407 Stronger convergence test 2018-04-27 07:49:57 +01:00
Guido Cossu
0734e9ddd4 Debugging Scatter_plane_simple 2018-04-27 14:39:01 +09:00
paboyle
809b1cdd58 Bug fix for MPI running ; introduced last night 2018-04-27 05:19:10 +01:00
paboyle
1be8089604 Clean compile 2018-04-26 23:42:45 +01:00
paboyle
3e0eff6468 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-26 23:00:46 +01:00
paboyle
7ecc47ac89 Quenched test compile 2018-04-26 23:00:28 +01:00
paboyle
e9f1ac09de static 2018-04-26 23:00:08 +01:00
Peter Boyle
fa0d8feff4 Performance of CovariantCshift now non-embarrassing. 2018-04-26 17:56:27 +01:00
49b8501fd4 Merge branch 'develop' into feature/hadrons 2018-04-26 17:33:50 +01:00
d47484717e Hadrons: scalar SU(N) result handling improvement 2018-04-26 17:32:37 +01:00
Peter Boyle
05b44aef6b Merge branch 'develop' of https://github.com/paboyle/Grid into develop
Conflicts:
	benchmarks/Benchmark_su3.cc
2018-04-26 15:38:49 +01:00
Peter Boyle
03e9832efa Use macros for bare openmp 2018-04-26 14:50:02 +01:00
Peter Boyle
28a375d35d Force static 2018-04-26 14:49:42 +01:00
Peter Boyle
3b06381745 Guard bare openmp statemetn with ifdef 2018-04-26 14:48:57 +01:00
Peter Boyle
91a0a3f820 Improvement 2018-04-26 14:48:35 +01:00
Peter Boyle
8f44c799a6 Saving the benchmarking tests for Cshift 2018-04-26 14:48:03 +01:00
Azusa Yamaguchi
96272f3841 Merge staggered fix linear operator and reduction 2018-04-26 10:33:19 +01:00
Azusa Yamaguchi
5c936d88a0 Merge branch 'feature/staggered-comms-compute' of https://github.com/paboyle/Grid into feature/staggered-comms-compute 2018-04-26 10:18:37 +01:00
Azusa Yamaguchi
1c64ee926e Faster staggered operator with m^2 term trivial used 2018-04-26 10:17:49 +01:00
Azusa Yamaguchi
2cbb72a81c Provide info if EE term is trivial (m^2 factor)
Better timing in staggered 4d case
2018-04-26 10:10:07 +01:00
Azusa Yamaguchi
31d83ee046 Enable special treatment of constEE cases 2018-04-26 10:08:46 +01:00
Azusa Yamaguchi
a9e8758a01 Improvements to staggered tests timings 2018-04-26 10:08:05 +01:00
Azusa Yamaguchi
3e125c5b61 Faster linalg on CG optimised against staggered
Sum overhead is bigger for staggered
2018-04-26 10:07:19 +01:00
Azusa Yamaguchi
eac6ec4b5e Faster reductions, important on single node staggered 2018-04-26 10:03:57 +01:00
Azusa Yamaguchi
213f8db6a2 Microsecond resultion 2018-04-26 10:01:39 +01:00
Guido Cossu
6358f35b7e Debug of previous commit 2018-04-26 14:18:11 +09:00
Guido Cossu
43f5a0df50 More timers in the integrator 2018-04-26 12:01:56 +09:00
Guido Cossu
c897878776 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-26 11:31:57 +09:00
cc6eb51e3e Hadrons: macro refactoring for library portability 2018-04-25 16:49:14 +01:00
Vera Guelpers
507009089b Merge remote-tracking branch 'upstream/feature/hadrons' into feature/hadrons 2018-04-25 09:36:39 +01:00
paboyle
2baf193031 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-25 00:14:03 +01:00
paboyle
362ba0443a Cshift updates 2018-04-25 00:12:11 +01:00
paboyle
276a2353df Move constructor 2018-04-25 00:11:07 +01:00
b234784c8e Hadrons: scalar SU(N) takes operator pairs now 2018-04-24 19:52:12 +01:00
6ea2a8b7ca Hadrons: scheduler shows starting value 2018-04-24 19:51:47 +01:00
c1d0359aaa Hadrons: scalar SU(N) kinetic term saves trace 2018-04-24 19:51:22 +01:00
047ee4ad0b Hadrons: scalar SU(N) cleanup 2018-04-24 19:50:58 +01:00
a13106da0c Hadrons: scalar SU(N) gradient 2018-04-24 19:50:30 +01:00
75113e6523 Hadrons: Scalar SU(N) variable name update 2018-04-24 19:49:27 +01:00
325c73d051 Hadrons: module template update 2018-04-24 19:48:54 +01:00
b25a59e95e Hadrons: mitigation of GCC/Intel compiler bug not generating defaulted destructors 2018-04-24 17:20:25 +01:00
Guido Cossu
c5b9147b53 Correction of a minor bug in the su3 benchmark 2018-04-24 08:03:57 -07:00
Guido Cossu
64ac815fd9 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-24 17:27:38 +09:00
Guido Cossu
a1be533329 Corrected Flop count in Benchmark su3 and expanded the Wilson flow output 2018-04-24 01:19:53 -07:00
7c4533797f Hadrons: scalar SU(N) EMT improvement term optional 2018-04-23 22:46:39 +01:00
af84fd65bb Hadrons: missing dependency message improvement 2018-04-23 22:46:17 +01:00
Dan H
1a2613086a Fix print message. 2018-04-23 15:42:12 -04:00
Dan H
4f110c09a5 Add printing of whether there are unstaged changes in the git hash print. 2018-04-23 15:38:23 -04:00
6764362237 Hadrons: automatic directory creation fix 2018-04-23 18:45:39 +01:00
2fa2b0e0b1 Hadrons: Application header does not include all the modules 2018-04-23 17:57:17 +01:00
b61292f735 Hadrons: recursive mkdir function 2018-04-23 17:36:43 +01:00
ce7720e221 Hadrons: copyright update 2018-04-23 17:36:20 +01:00
853a5528dc Hadrons: template modules compilation optimisation 2018-04-23 17:35:01 +01:00
169f405c9c Hadrons: tests repaired 2018-04-23 12:48:34 +01:00
c6125b01ce Hadrons: Error and Warning channels always on 2018-04-23 12:48:17 +01:00
b0b5b34bff Hadrons: custom abort with module trace 2018-04-23 12:48:00 +01:00
1c9722357d Merge branch 'develop' into feature/hadrons
# Conflicts:
#	lib/qcd/action/fermion/FermionOperator.h
2018-04-20 17:15:21 +01:00
141da3ae71 function to get tensor dimensions 2018-04-20 17:13:34 +01:00
94edf9cf8b HDF5: direct access to group for custom operations 2018-04-20 17:13:21 +01:00
c11a3ca0a7 vectorise/unvectorise in reverse order 2018-04-20 17:13:04 +01:00
paboyle
870b1a85ae Think I have the physical prop interface to CF and PF overlap right, but need a strong check/regression.
Only support Hw overlap, not Ht for now. Ht needs a new Dminus implemented.
2018-04-18 14:17:49 +01:00
paboyle
b5510427f9 physical fermion interface, cshift benchmark in SU3. 2018-04-18 01:43:29 +01:00
Guido Cossu
26ed65c8f8 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-17 12:03:32 +01:00
paboyle
f7f043d8cf Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-04-17 10:57:18 +01:00
paboyle
ddcaa6ad29 Master does header on Nersc 2018-04-17 10:48:33 +01:00
Peter Boyle
87c5c0271b Ficxing eigen 2018-04-16 19:08:07 -04:00
Peter Boyle
a3f5a13591 Better Eigen handling 2018-04-16 18:02:55 -04:00
Peter Boyle
9fe28f00eb Eigen sim link off head revision 2018-04-16 17:54:46 -04:00
334da7f452 Hadrons: can trace which module is throwing an error 2018-04-13 18:45:31 +02:00
4669ecd4ba Hadrons: build improvement 2018-04-13 18:21:18 +02:00
4573b34cac Hadrons: scalar SU(N) 2-pt functions with momentum 2018-04-13 18:21:00 +02:00
Peter Boyle
a8a0bb85cc Control scalar execution or vector under generic. Disable Eigen vectorisation on powerpc / SUmmit 2018-04-12 12:32:57 -04:00
Peter Boyle
6411caad67 work distribution 2018-04-12 11:41:41 -04:00
Peter Boyle
7533035a99 Control Eigen vectorisatoin 2018-04-12 11:40:56 -04:00
17f57e85d1 Merge branch 'develop' into feature/hadrons 2018-04-06 22:53:11 +01:00
c8d4d184ee XML push fragment fix 2018-04-06 22:53:01 +01:00
17f27b1ebd Hadrons: eigenpack writer fix 2018-04-06 22:52:11 +01:00
a16bbecb8a Hadrons: more feedback 2018-04-06 19:38:20 +01:00
7c9b0dd842 Hadrons: top level name for eigenpack metadata 2018-04-06 19:32:22 +01:00
6b7228b3e6 Hadrons: better metadata for eigenpack 2018-04-06 19:29:53 +01:00
f117552334 post-merge fix 2018-04-06 18:38:46 +01:00
a21a160029 Merge branch 'develop' into feature/hadrons
# Conflicts:
#	lib/serialisation/XmlIO.cc
2018-04-06 18:34:19 +01:00
1569a374a9 XML interface polish, XML fragments can be pushed into a writer 2018-04-06 18:32:14 +01:00
eddf023b8a pugixml 1.9 update 2018-04-06 16:17:22 +01:00
6b8ffbe735 Hadrons: genetic minimum value type fix 2018-04-06 15:41:31 +01:00
81050535a5 Hadrons: truncate eigenvalues when loading partial eigenpack 2018-04-06 13:48:58 +01:00
7dcf5c90e3 Hadrons: eigenpack must be referred by solver when used 2018-04-06 13:16:28 +01:00
9ce00f26f9 not special characters in std::vector operator<< 2018-04-04 17:44:56 +01:00
85c253ed4a Test_serialisation MPI fix 2018-04-04 17:19:34 +01:00
ccfc0a5a89 Hadrons: better string representation of module parameters 2018-04-04 17:19:22 +01:00
d3f857b1c9 Hadrons: proper metadata for eigenpacks 2018-04-04 16:36:37 +01:00
fb62035aa0 Hadrons: do not create RB coarse grids 2018-04-03 19:49:11 +01:00
0260bc7705 Hadrons: eigen pack writing only for boss node 2018-04-03 18:55:46 +01:00
68e6a58f12 Hadrons: several Lanczos fixes and improvements 2018-04-03 17:42:21 +01:00
Daniel Richtmann
73ced656eb Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-04-03 17:51:11 +02:00
Daniel Richtmann
f69008edf1 WilsonMG: Add functionality to report timings to MG preconditioner 2018-04-03 17:26:49 +02:00
Daniel Richtmann
57a49ed22f WilsonMG: Read in MG parameters from xml in test 2018-04-03 16:03:11 +02:00
Daniel Richtmann
ff6413a764 WilsonMG: Make number of levels chooseable at runtime
I don't like this solution though :(
2018-04-03 15:57:33 +02:00
Daniel Richtmann
2530bfed01 WilsonMG: Move params instance from global scope to test main function 2018-04-03 14:50:48 +02:00
640515e3d8 Merge branch 'develop' into feature/hadrons 2018-03-30 17:43:49 +01:00
paboyle
f089bf5629 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-30 16:17:26 +01:00
paboyle
276f113f28 IO uses master boss node for metadata. 2018-03-30 16:17:05 +01:00
97c579f637 Merge branch 'develop' into feature/hadrons 2018-03-30 16:04:44 +01:00
a13c109111 deterministic initialisation of field metadata 2018-03-30 16:03:01 +01:00
paboyle
ab6afd18ac Still compile if no LIME 2018-03-30 13:39:20 +01:00
paboyle
5bde64d48b Barrier required in parallel when we use ftell 2018-03-30 12:41:30 +01:00
paboyle
2f5add4d5f Creation of file 2018-03-30 12:30:58 +01:00
c5a885dcd6 I/O benchmark 2018-03-29 19:57:41 +01:00
Daniel Richtmann
74f79c5ac7 Revert "Add function to return full type as std::string"
This reverts commit 1cb745c8dc.
2018-03-29 12:03:50 +02:00
Daniel Richtmann
58c30c0cb1 WilsonMG: Add conformability checks in MG preconditioner 2018-03-28 13:24:39 +02:00
Daniel Richtmann
917a92118a WilsonMG: Move operator test to MG testing routine 2018-03-28 12:19:25 +02:00
a4d8512fb8 Revert "Lattice serialisation, just HDF5 for the moment"
This reverts commit 8a0cf0194f.
2018-03-27 17:55:42 +01:00
5ec903044d Serial IO code cleaning for std:: convention 2018-03-27 17:11:50 +01:00
Daniel Richtmann
04f9cf088d WilsonMG: Add more parameters to MultiGridParams struct 2018-03-27 17:13:11 +02:00
Daniel Richtmann
99107038f9 WilsonMG: Rationalize the level counting strategy 2018-03-27 17:06:33 +02:00
8a0cf0194f Lattice serialisation, just HDF5 for the moment 2018-03-26 19:16:16 +01:00
Daniel Richtmann
b78456bdf4 WilsonMG: Get rid of explicit include of GCR header 2018-03-26 15:41:53 +02:00
Daniel Richtmann
08543b6b11 WilsonMG: Provide a switch between V- and K-cycle 2018-03-26 15:37:17 +02:00
Daniel Richtmann
63ba33371f WilsonMG: Some minor refactoring 2018-03-26 15:34:53 +02:00
Daniel Richtmann
683a7d2ddd WilsonMG: Move comment to make clang-format happy 2018-03-26 14:59:40 +02:00
1c680d4b7a Merge branch 'develop' into feature/hadrons 2018-03-26 13:52:44 +01:00
Peter Boyle
b15db11c60 Kernels -> pure static object to enable device execution 2018-03-24 19:35:20 -04:00
Peter Boyle
f6077f9d48 Kernels -> not instantiaed otherwise object ref on GPU 2018-03-24 19:33:44 -04:00
Peter Boyle
572954ef12 Kernels not an instantiated object, just static 2018-03-24 19:33:13 -04:00
Peter Boyle
cedeaae7db Lebesge -> StencilView if necessary 2018-03-24 19:32:41 -04:00
Peter Boyle
e6cf0b1e17 View typedefs go to OperatorImpl 2018-03-24 19:32:11 -04:00
Peter Boyle
5412628ea6 begin end lamda 2018-03-24 19:31:45 -04:00
Peter Boyle
1f70cedbab Have to make all kernel called routines static since object reference will be a host pointer on GPU 2018-03-24 19:29:26 -04:00
Peter Boyle
b50f37cfb4 Remove overlap comms flag 2018-03-24 19:28:53 -04:00
Peter Boyle
cb0d2a1b03 threaded rng init; I thought this was on 2018-03-24 19:28:17 -04:00
Peter Boyle
6fe9b28a82 Cosmetic 2018-03-24 19:27:14 -04:00
Peter Boyle
b002587d7c Simplify 2018-03-24 19:26:44 -04:00
Peter Boyle
6c08385782 Simplify 2018-03-24 19:26:19 -04:00
Daniel Richtmann
afdcbf79d1 Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-03-23 21:13:50 +01:00
Daniel Richtmann
3c3ec4e267 WilsonMG: Move tests for Wilson & WilsonClover into the same file 2018-03-23 21:12:27 +01:00
Daniel Richtmann
bbe1d5b49e WilsonMG: Temporarily use GMRES in construction of basis vectors
This can go back to CG once Mdag in CoarsenedMatrix works.
2018-03-23 20:02:27 +01:00
Daniel Richtmann
0f6009a29f WilsonMG: Huge refactor into something that could be considered an algorithm 2018-03-23 19:55:43 +01:00
Daniel Richtmann
1cfed3de7c WilsonMG: Add new logger for MG 2018-03-23 19:55:16 +01:00
Guido Cossu
c9c073eee4 Changes in messages in test dwf mixedprec 2018-03-23 11:27:56 +00:00
Guido Cossu
f290b2e908 Fix to pass CI tests 2018-03-23 11:14:23 +00:00
Guido Cossu
5f8225461b Fencing mixedcg test propagator write. LIME is still optional in Grid 2018-03-23 10:37:58 +00:00
Peter Boyle
4e1272fabf Kernels need to be static to work on GPU. No reference to host resident data 2018-03-22 18:44:53 -04:00
Peter Boyle
607dc2d3c6 Remove lebesgue order 2018-03-22 18:23:09 -04:00
Peter Boyle
23c880b009 Remove lebesgue order; stick in stencil if need 2018-03-22 18:13:41 -04:00
Peter Boyle
334bb6792f Lebesgue order removed. Stick in the stencil view 2018-03-22 18:12:12 -04:00
Peter Boyle
a3690071b4 Warm up GPu 2018-03-22 18:05:20 -04:00
Peter Boyle
299d119013 GPU work allocation improved 2018-03-22 18:04:24 -04:00
Peter Boyle
55be842d23 Dont force l1p.h so early 2018-03-22 18:01:43 -04:00
Daniel Richtmann
edbc0d49d7 WilsonMG: Get rid of explicit GridTypeMappers in CoarsenedMatrix 2018-03-22 16:38:24 +01:00
e9323460c7 Merge branch 'develop' into feature/hadrons 2018-03-22 10:48:37 +00:00
20e186a1e0 Merge pull request #158 from goracle/dev-pull
Make compilation faster by moving print of git hash.
2018-03-22 10:45:17 +00:00
Peter Boyle
6ef4af989b Merge pull request #159 from goracle/dev-precsafe
Add dimension check to precisionChange.
2018-03-22 10:41:53 +00:00
Dan H
ccde8b817f Add dimension check to precisionChange. 2018-03-21 20:58:04 -04:00
Dan H
68168bf72d Revert "Add dimension match check to precisionChange."
This reverts commit 8f601d9b39.
2018-03-21 20:51:38 -04:00
Dan H
e93d0feaa7 Merge branch 'dev-pull' of github.com:goracle/Grid into dev-pull 2018-03-21 20:39:30 -04:00
Dan H
8f601d9b39 Add dimension match check to precisionChange. 2018-03-21 20:38:19 -04:00
paboyle
5436308e4a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-21 14:26:29 +00:00
paboyle
07fe7d0cbe Save file in current dir; print checksums 2018-03-21 14:26:04 +00:00
Guido Cossu
60b57706c4 Small bug fix in the shm file names 2018-03-21 13:57:30 +00:00
James Harrison
58c2f60b69 Merge branch 'feature/hadrons' into feature/qed-fvol 2018-03-20 20:19:18 +00:00
James Harrison
bfa3a7b3b0 Merge branch 'feature/hadrons' into feature/qed-fvol
# Conflicts:
#	extras/Hadrons/Modules.hpp
#	extras/Hadrons/Modules/MGauge/StochEm.cc
#	extras/Hadrons/modules.inc
2018-03-20 20:17:59 +00:00
paboyle
954e38bebe Put a username in the path 2018-03-20 18:16:15 +00:00
paboyle
b1a38bde7a Extra test for Gparity with plaquette action 2018-03-20 18:01:32 +00:00
Peter Boyle
9875c446c6 Clean up pragmas 2018-03-20 07:19:17 -04:00
Peter Boyle
9c25eb35ca Eigen develop branch for now 2018-03-20 07:18:56 -04:00
Peter Boyle
5ac96dbdc6 Warm behaviour in SU3 benchmark 2018-03-20 07:18:31 -04:00
Peter Boyle
5cc9aca85d Use 64bit index for looping 2018-03-20 06:34:52 -04:00
Peter Boyle
ac29ebcb95 Clean up debug prints 2018-03-20 06:33:59 -04:00
Guido Cossu
2581875edc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-19 18:00:08 +00:00
Guido Cossu
f212b0a963 Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/hadrons 2018-03-19 17:57:13 +00:00
Guido Cossu
62702dbcb8 Fixing bug in the Point sink causing NaNs 2018-03-19 17:56:53 +00:00
41d6cab033 Merge branch 'develop' into feature/hadrons 2018-03-19 13:30:21 +00:00
5a31e747c9 Merge commit 'd5ce66f6ab2c44a12def7b6d26df80d6e646b1fb' into feature/hadrons 2018-03-19 13:19:09 +00:00
cbc73a3fd1 Hadrons: CG guesser fix 2018-03-19 13:11:38 +00:00
Peter Boyle
a5cfb89304 Update eigen process direct from develop on github. Dangerous, but needed from GPU 2018-03-19 07:20:48 -04:00
Peter Boyle
f04a7251cc Gpu welcome message and device info 2018-03-19 07:12:12 -04:00
Peter Boyle
d4ce7d9905 GPU friendly Stencil needs a view 2018-03-19 07:11:21 -04:00
Peter Boyle
8a1d303ab9 GPU friendly stencil improvements 2018-03-19 07:11:03 -04:00
Peter Boyle
bf0a4de919 GPU friendly params object 2018-03-19 07:10:12 -04:00
Peter Boyle
6fe5885fe4 Warning suppress 2018-03-19 07:09:49 -04:00
Peter Boyle
17ac309e84 Fix the compile 2018-03-19 07:08:59 -04:00
Peter Boyle
7467a1c027 Latest eigen needed for GPU 2018-03-19 07:08:10 -04:00
Peter Boyle
fdfb8a26a8 Disable eigen vectorisation on GPU because of Summit compile issues 2018-03-19 07:07:30 -04:00
paboyle
2df4e422ad Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port 2018-03-18 14:45:41 +00:00
paboyle
3a3e3cac40 Pull the trigger on offload 2018-03-18 14:45:29 +00:00
paboyle
b1c02ec310 MallocManaged in GPU 2018-03-18 14:44:46 +00:00
paboyle
38eadee2c9 Prettier code 2018-03-18 14:44:22 +00:00
paboyle
42c70437be Views 2018-03-18 14:43:47 +00:00
paboyle
65274b4d7f Tidy up 2018-03-18 14:43:16 +00:00
Peter Boyle
6c6d43eb4e Drop RB on coarse space ; that was a mistake 2018-03-17 09:35:01 +00:00
Peter Boyle
e1dcfd3553 typo fix 2018-03-16 23:10:47 +00:00
Peter Boyle
888838473a 4GB clean the offsets in parallel IO for multifile records 2018-03-16 21:54:56 +00:00
Peter Boyle
01568b0e62 Add a new SHM option 2018-03-16 21:54:28 +00:00
Peter Boyle
d5ce66f6ab Extra SHM option 2018-03-16 21:37:03 +00:00
Guido Cossu
d86936a3de Eliminating deprecated lex_sites 2018-03-16 12:26:39 +00:00
Daniel Richtmann
ee5cf6c8c5 WilsonMG: Some minor changes to GMRES implementations 2018-03-16 13:10:45 +01:00
d516938707 Hadrons: eigen packs I/O and deflation interface 2018-03-14 14:55:47 +00:00
Peter Boyle
7e8be32755 Typo fix 2018-03-13 19:22:31 -04:00
72344d1418 Hadrons: change default Schur convention to DiagTwo 2018-03-13 17:10:54 +00:00
7ecf6ab38b Merge branch 'develop' into feature/hadrons 2018-03-13 16:11:59 +00:00
2d4d70d3ec Hadrons: LCL fixes 2018-03-13 16:10:36 +00:00
78f8d47528 Hadrons: environment access to derived objects 2018-03-13 16:10:16 +00:00
b85f987b0b Hadrons: error message channel verbose during profiling 2018-03-13 16:09:22 +00:00
f57afe2079 Hadrons: much cleaner eigenpack implementation, to be tested 2018-03-13 13:51:09 +00:00
Dan H
0fb84fa34b Make compilation faster by moving print of git hash. 2018-03-12 17:03:48 -04:00
Vera Guelpers
8462bbfe63 Gamma input for meson contraction with round brackets 2018-03-12 18:02:12 +00:00
229977c955 Hadrons: minor memory fix for ShiftProbe module 2018-03-09 21:56:27 +00:00
e485a07133 Hadrons: garbage collector debug output 2018-03-09 21:56:01 +00:00
paboyle
0880747edb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-09 20:44:42 +00:00
paboyle
b801e1fcd6 fclose should be called through a call to close() 2018-03-09 20:44:10 +00:00
paboyle
ff761ea4e6 Bound check improvement 2018-03-09 20:00:46 +00:00
70ec2faa98 Hadrons: maximum iteration specified for tests and error if 0 2018-03-09 19:53:55 +00:00
paboyle
a31d3e60d8 Better bounds check 2018-03-09 18:10:21 +00:00
Daniel Richtmann
a66cecc509 WilsonMG: Fix invalid call to MR ctor 2018-03-09 17:34:29 +01:00
Daniel Richtmann
0f6cdf3d4b WilsonMG: Implement missing parts of CoarsenedMatrix 2018-03-09 16:56:16 +01:00
Daniel Richtmann
1e63b73a14 WilsonMG: Some cleanup/formatting 2018-03-09 16:50:19 +01:00
2f849ee252 declaration fix 2018-03-08 23:34:00 +00:00
bb6ed44339 Merge branch 'develop' into feature/hadrons 2018-03-08 23:09:28 +00:00
paboyle
4d60b92b7f Update oSites 2018-03-08 21:00:25 +00:00
360cface33 Grid tensor serialisation fully implemented and tested 2018-03-08 19:12:03 +00:00
Azusa Yamaguchi
80302e95a8 MILC Interface 2018-03-08 15:34:03 +00:00
paboyle
c159c70c84 View introduced 2018-03-08 14:58:04 +00:00
paboyle
28b5572755 Merge branch 'feature/gpu-port' of https://github.com/paboyle/Grid into feature/gpu-port 2018-03-08 13:01:42 +00:00
paboyle
5fac7080bc Adding -fno-strict-aliasing by default 2018-03-08 13:01:24 +00:00
Peter Boyle
4548523ecc This modification eliminates what looks like a compiler bug
on Intel 2017.
2018-03-08 04:41:16 -08:00
caf2f6b274 Merge branch 'develop' of github.com:paboyle/Grid into develop 2018-03-08 09:52:25 +00:00
c49be8988b Grid tensor serialisation 2018-03-08 09:51:22 +00:00
971c2379bd std::vector to tensor conversion + test units 2018-03-08 09:50:39 +00:00
Guido Cossu
94b0d66e4c Merge pull request #157 from goracle/dev-pull
Add print of the current git hash on Grid init.
2018-03-08 16:09:28 +09:00
Dan H
5e8af396fd Add print of the current git hash on Grid init. 2018-03-07 13:11:51 -05:00
paboyle
4154fc6f44 Revert a change 2018-03-07 16:54:11 +00:00
paboyle
4e3458516a Reverting after fixing issue with extract merge 2018-03-07 16:50:13 +00:00
Peter Boyle
90a2efb9b3 Hit an annoying strict alias optimisation in GCC 4.9 through 6.3
Chris K was correct. It appears that an additional memcpy (UGHHH) is enough
to suppress the compiler
2018-03-07 07:27:26 -08:00
9942723189 Merge branch 'develop' into feature/hadrons
# Conflicts:
#	lib/serialisation/BaseIO.h
2018-03-07 15:22:16 +00:00
a7d19dbb64 Merge branch 'develop' of github.com:paboyle/Grid into develop
# Conflicts:
#	lib/serialisation/BaseIO.h
2018-03-07 15:13:54 +00:00
90dbe03e17 Conversion of Grid tensors to std::vector made more elegant, also pair syntax changed to (x y) to avoid issues with JSON/XML 2018-03-07 15:12:32 +00:00
8b14096990 Conversion of Grid tensors to std::vector made more elegant, also pair syntax changed to (x y) to avoid issues with JSON/XML 2018-03-07 15:12:18 +00:00
Azusa Yamaguchi
b938202081 Overlapped Comm for Wilson DhopInternal 2018-03-07 14:08:43 +00:00
e79ef469ac Merge branch 'develop' into feature/hadrons
# Conflicts:
#	lib/serialisation/BaseIO.h
2018-03-06 19:25:51 +00:00
485c5db0fe conversion of Grid tensors to nested std::vector in preparation for tensor serialisation 2018-03-06 19:22:03 +00:00
James Harrison
c793947209 Add overloaded Photon constructors, with default parameters for IR improvements and infinite-volume G(x=0). 2018-03-06 16:27:26 +00:00
paboyle
40699221e2 Dont alias lhs and rhs in a where statement 2018-03-06 04:14:13 -08:00
paboyle
3cb1b545d0 Don't alias the variables with a where statement. 2018-03-06 04:13:26 -08:00
3e9ee053a1 Merge branch 'develop' into feature/hadrons 2018-03-05 20:01:38 +00:00
dda6c69d5b Hadrons: scalar SU(N) shift probes 2018-03-05 20:00:29 +00:00
cd51b9af99 Torture yourself with namespace lookup 101 2018-03-05 19:58:13 +00:00
paboyle
e199ba7e88 Fix the Charge conjugate BC's 2018-03-05 13:59:02 +00:00
paboyle
c399c2b44d Guido broke the charge conjugate plaquette action with premature optimisation.
This sector of the code does not matter for anything other than Guido's quenched HMC
studies, and any plaq specific optimisations should be retained in a private branch
instead of destroying the code simplicity.
2018-03-05 12:55:41 +00:00
paboyle
af7de7a294 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-03-05 12:22:41 +00:00
paboyle
1dc86efd26 Finalize protection 2018-03-05 12:22:18 +00:00
paboyle
4d53703c67 Scalar type differeing allowed, eg. precisoin change 2018-03-05 11:39:52 +00:00
paboyle
d506c59efa Warnings disabled 2018-03-05 11:39:20 +00:00
paboyle
44188a5c6f AVX512 fix 2018-03-05 00:32:24 +00:00
paboyle
2018077770 Make NVCC happy with the compile. This is warning free on 9.1 on my laptop (both make and make tests). 2018-03-05 00:28:24 +00:00
paboyle
984e06e2b5 Introduce view objects that can safely be copied to GPU for access 2018-03-04 16:40:11 +00:00
paboyle
aead94e9a7 View introduced 2018-03-04 16:39:29 +00:00
paboyle
3277bda130 View introduction to prepare for accelerator offload.
Probably same problem exists for stencil object
2018-03-04 16:38:08 +00:00
paboyle
442b0b406c View related changes 2018-03-04 16:34:14 +00:00
paboyle
8824a54269 View related changes 2018-03-04 16:33:33 +00:00
paboyle
c03423250f Indexable changes 2018-03-04 16:31:35 +00:00
paboyle
317fd0da44 Views introduced. Need to accelerator offload these routines. 2018-03-04 16:30:45 +00:00
paboyle
783795a44a Views introduced 2018-03-04 16:12:49 +00:00
paboyle
0e6197fbed Introduce accelerator friendly expression template rewrite.
Must obtain and access lattice indexing through a view object that is safe
to copy construct in copy to GPU (without copying the lattice).
2018-03-04 16:03:19 +00:00
paboyle
dad7862f91 Go through a view object that can be copied to GPU 2018-03-04 16:02:02 +00:00
paboyle
c89a883448 where was deprecated and integrated to ET engine a long time ago. Remove dead old original code 2018-03-04 15:58:02 +00:00
paboyle
c204288fbc Remove a couple of print statements 2018-03-04 15:57:15 +00:00
paboyle
ad739f042a Introduce views for passing lattice indexing to accelerators. 2018-03-04 15:56:14 +00:00
paboyle
db988301d0 Introduce view objects for indexing lattices. Used to pass the view to acccelerators 2018-03-04 15:55:16 +00:00
paboyle
9b1f29c4c2 Support a view for passing to accelerator 2018-03-04 15:54:35 +00:00
paboyle
e5ea04ee0c Need to support precision change, and real replication in multiple simd lanes 2018-03-04 15:53:04 +00:00
paboyle
c92a3c6068 Need to support any vector type template and run on accelerator 2018-03-04 15:52:14 +00:00
paboyle
03f8da8fbc enable-debug option for debug flags in compile 2018-03-04 15:51:47 +00:00
f32555dcc5 Merge branch 'develop' into feature/hadrons 2018-03-03 15:31:52 +00:00
30391cb2eb Merge pull request #155 from fionnoh/develop
Some changes needed for deflation interface
2018-03-03 13:43:59 +00:00
e93c883470 Hadrons: basic GraphViz visualisation 2018-03-03 13:42:36 +00:00
Fionn O hOgain
2e88408f5c Some changes needed for deflation interface 2018-03-02 22:27:41 +00:00
fcac5c0772 Hadrons: scalar SU(N) fixes 2018-03-02 19:20:23 +00:00
90f4000935 Hadrons: scheduler debug less verbose 2018-03-02 19:20:01 +00:00
480708b9a0 Hadrons: safer error handling for HadronsXmlRun 2018-03-02 19:19:37 +00:00
c4baf876d4 Hadrons: graph consistency check 2018-03-02 18:40:18 +00:00
2f4dac3531 Hadrons: legal update 2018-03-02 18:10:58 +00:00
3ec6890850 Merge branch 'feature/hadrons' of github.com:paboyle/Grid into feature/hadrons 2018-03-02 17:56:08 +00:00
018801d973 Hadrons: legal update 2018-03-02 17:56:00 +00:00
1d83521daa Hadrons: scalar SU(N) EMT 2018-03-02 17:55:18 +00:00
fc5670c6a4 Merge pull request #151 from guelpers/feature/hadrons
Feature/hadrons
2018-03-02 17:54:43 +00:00
d9c435e282 Hadrons: Scalar SU(N) transverse projection module 2018-03-02 17:35:12 +00:00
614a0e8277 Hadrons: Scalar SU(N) utility functions 2018-03-02 17:34:23 +00:00
Vera Guelpers
aaf39222c3 update my fork and fixed conflicts 2018-03-02 17:08:08 +00:00
550142bd6a Hadrons: more code cleaning 2018-03-02 14:30:45 +00:00
c0a929aef7 Hadrons: code cleaning 2018-03-02 14:29:54 +00:00
37fe944224 Hadrons: scalar kinetic term 2018-03-02 14:14:11 +00:00
Vera Guelpers
315a42843f changes requested for the pull request 2018-03-02 11:47:38 +00:00
83a101db83 Hadrons: more LCL fixes 2018-03-02 11:05:02 +00:00
c4274e1660 Hadrons: LCL cleaning 2018-03-02 10:18:33 +00:00
ba6db55cb0 Hadrons: reverse last commit 2018-03-01 23:30:58 +00:00
e5ea84d531 Hadrons: LCL: orthogonalise coarse evec 2018-03-01 19:33:11 +00:00
15767a1491 Hadrons: LCL fine convergence test 2018-03-01 18:04:08 +00:00
4d2a32ae7a Hadrons: z-Mobius message fix 2018-03-01 18:03:44 +00:00
5b937e3644 Hadrons: VM memory profiling fix 2018-03-01 17:28:38 +00:00
e418b044f7 Hadrons: code cleaning 2018-03-01 12:57:28 +00:00
b8b05f143f Hadrons: Lanczos more conservative type names 2018-03-01 12:53:16 +00:00
6ec42b4b82 LCL: external storage fix 2018-03-01 12:27:29 +00:00
abb7d4d2f5 Hadrons: z-Mobius action 2018-02-27 19:32:19 +00:00
16ebbfff29 Hadrons: Schur convention globally defined through a macro 2018-02-27 18:45:23 +00:00
4828226095 Hadrons: prettier log 2018-02-27 14:43:51 +00:00
8a049f27b8 Hadrons: Lanczos code improvement 2018-02-27 13:46:59 +00:00
43578a3eb4 Hadrons: copyright update 2018-02-26 19:24:19 +00:00
fdbd42e542 Hadrons: first implementation of local coherence Lanczos 2018-02-26 19:22:43 +00:00
e7e4cee4f3 Merge branch 'develop' into feature/hadrons 2018-02-26 15:05:05 +00:00
paboyle
78a9e31ff0 options more obvious 2018-02-24 22:26:32 +00:00
paboyle
c1fc947bb8 Coordinate handling GPU friendly + some GPU merge/extract improvements 2018-02-24 22:26:10 +00:00
paboyle
ff7b19a71b Coordinate handling GPU ready avoid malloc 2018-02-24 22:25:39 +00:00
paboyle
1c16ffa1c1 Coordinate GPU ready. No malloc 2018-02-24 22:25:09 +00:00
paboyle
4962f59477 Eliminate both GPU issue and threading bottle neck by avoiding malloc in coordinate handling 2018-02-24 22:24:37 +00:00
paboyle
e158b60bce GPU friendly coords 2018-02-24 22:23:47 +00:00
paboyle
34820bec27 Coordinate handling GPU ready. No malloc 2018-02-24 22:23:18 +00:00
paboyle
eed9aa9f0c Extract merge gpu ready 2018-02-24 22:23:01 +00:00
paboyle
8792ff6439 Coordinate handling gpu ready 2018-02-24 22:22:43 +00:00
paboyle
078901278c Coordinate handling gpu friendly 2018-02-24 22:22:02 +00:00
paboyle
bf5fb89aff Coordinate handling GPU friendly 2018-02-24 22:21:36 +00:00
paboyle
7574c18cef Massive clean up extract merge.
Simpler and GPU friendly
2018-02-24 22:21:08 +00:00
paboyle
36ea5f6b77 gpu friendly coordinates ; no std::vector on GPU 2018-02-24 22:20:14 +00:00
paboyle
285deab432 Coordinate handling GPU friendly. Avoid std::vector 2018-02-24 22:19:28 +00:00
paboyle
bb7d87d0a0 Coordinate handling gpu friendly 2018-02-24 22:18:33 +00:00
James Harrison
ec3954ff5f QedFVol: Add input parameter G(x=0) for infinite-volume photon 2018-02-23 14:53:05 +00:00
Azusa Yamaguchi
0f468e2179 OverlappedComm for Staggered 5D and 4D. 2018-02-22 12:50:09 +00:00
James Harrison
8e61286741 Merge branch 'develop' into feature/qed-fvol 2018-02-20 15:33:35 +00:00
paboyle
4790e99817 Extra communicator free that I had missed.
Hard to audit them all as this is complex
2018-02-20 15:12:31 +00:00
paboyle
2dd63aa7a4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-02-20 14:29:26 +00:00
paboyle
559a501140 Deflation interface for solvers 2018-02-20 14:29:08 +00:00
paboyle
945684c470 updates for deflation in the RB solver 2018-02-20 14:28:38 +00:00
Christopher Kelly
e30a80a234 Relaxed constraints on MPI thread mode when not using multiple comms threads 2018-02-15 17:13:36 +00:00
James Harrison
69e4ecc1d2 QedFVol: Fix single precision build error 2018-02-14 17:37:18 +00:00
James Harrison
5f483df16b Merge branch 'develop' into feature/qed-fvol 2018-02-14 16:35:04 +00:00
James Harrison
4680a977c3 QedFVol: set infinite-volume photon propagator to 1 at x=0,
so that momentum-spage photon propagator is non-negative.
Need to check whether this is sufficient for all volumes.
2018-02-14 16:30:09 +00:00
Vera Guelpers
de42456171 updated my fork and conflicts fixed 2018-02-14 13:57:56 +00:00
Vera Guelpers
d55212c998 restructure SeqConservedCurrent for DWF to need less memory 2018-02-14 10:45:18 +00:00
paboyle
c96483e3bd Whitespace only change 2018-02-13 11:39:07 +00:00
Vera Guelpers
c6e1f64573 Test for QED 2018-02-13 09:30:23 +00:00
paboyle
ae31a6a760 Move deflate to right class 2018-02-13 02:11:37 +00:00
paboyle
dd8f2a64fe INterface to suit hadrons on Lanczos 2018-02-13 02:08:49 +00:00
James Harrison
724cf02d4a QedFVol: Implement infinite-volume photon 2018-02-12 17:18:10 +00:00
paboyle
7b8b2731e7 Conj error for complex coeffs 2018-02-12 16:06:31 +00:00
paboyle
237a8ec918 Communicator leak fixed (I think) 2018-02-12 13:27:20 +00:00
Vera Guelpers
49a0ae73eb Insertion of photon field in seqential conserved current 2018-02-12 09:36:08 +00:00
Daniel Richtmann
6ab60c5b70 Merge remote-tracking branch 'upstream/develop' into feature/wilsonmg 2018-02-08 23:59:07 +01:00
Daniel Richtmann
8c692b7ffd WilsonMG: Comment assertion on hermiticity of coarse operator for now
TODO: Think of a way to not break dwf_hdcr by doing that. It's only an assertion
but it still interferes with it.
2018-02-08 23:55:05 +01:00
Daniel Richtmann
2976132bdd Add first version of multigrid for wilson clover analogous to wilson one
Just like the wilson one, this algorithm

• is currently only a 2-level method since I don't have correct implementations
  for Mdir and Mdiag in CoarsenedMatrix yet (needed for further coarsening)
• needs levelization and refactoring into a proper algorithm
2018-02-08 23:52:10 +01:00
Daniel Richtmann
48177f2f2d Add tests for all MR|GMRES solvers with wilson clover action 2018-02-08 23:52:09 +01:00
Daniel Richtmann
c4ce70a821 WilsonMG: Major cleanup 2018-02-08 23:52:08 +01:00
James Harrison
315f1146cd QedFVol: Fix output of VPCounterTerms module. 2018-02-08 20:40:45 +00:00
Daniel Richtmann
a3e009ba54 Add tests for CAGMRES solvers with staggered action 2018-02-08 17:46:28 +01:00
Daniel Richtmann
eb7cf239d9 Print warning messages in CAGMRES solvers
Currently, the implementation of these algorithms doesn't differ from their non
communication-avoiding versions.
2018-02-08 17:43:47 +01:00
Daniel Richtmann
13ae371ef8 Make solver parameters match in all MR|GMRES solver tests 2018-02-08 17:33:10 +01:00
Daniel Richtmann
9f79a87102 Fix bugs in Flexible GMRES solvers
Somehow I got the left and right-preconditioned versions of GMRES mixed up. As
of now this is right-preconditioned version, which is what we want.
2018-02-08 16:00:31 +01:00
Daniel Richtmann
4ded1ceeb0 Make GMRES solvers perform no more than MaxIterations steps
I noticed that it was possible to overrun this number.
2018-02-08 15:29:44 +01:00
James Harrison
9f202782c5 QedFVol: Change format of scalar VP output files, and save diagrams without charge factors for consistency with ChargedProp module. 2018-02-07 20:31:50 +00:00
Daniel Richtmann
8bc12e0ce1 Remove superfluous comments in MR solver 2018-02-07 18:09:09 +01:00
Daniel Richtmann
cc2f00f827 Remove test for MR solver with dwf action as it doesn't converge 2018-02-07 18:09:08 +01:00
Daniel Richtmann
cd61e2e6d6 Increase max iterations in test of MR solver with staggered action 2018-02-07 18:09:07 +01:00
Daniel Richtmann
323ed1a588 Add an overrelaxation parameter to the MR solver 2018-02-07 18:09:06 +01:00
Daniel Richtmann
68c66d2e4b Remove empty line in output of *Residual* solvers 2018-02-07 18:08:56 +01:00
Daniel Richtmann
1671adfd49 WilsonMG: Add some tests for linear operators 2018-02-07 17:15:22 +01:00
James Harrison
594a262dcc QedFVol: Remove redundant file Communicator_mpi.cc 2018-02-07 11:37:01 +00:00
James Harrison
7f8ca54285 Merge branch 'develop' into feature/qed-fvol 2018-02-07 10:11:00 +00:00
James Harrison
c5b23c367e QedFVol: Fix segmentation fault when multiple propagator modules are used. 2018-02-05 11:46:33 +00:00
Vera Guelpers
b6fe03eb26 BugFix: Now the stochatic EM potential weight is generated when calling for the first time 2018-02-02 15:29:38 +00:00
James Harrison
f37ed4958b Implement IR improvement, with coefficients set in input file. 2018-02-02 11:56:51 +00:00
paboyle
b9b5bdfc3a Proper offload (accelerator access) will require a mutable copy lambda. 2018-02-02 11:38:19 +00:00
paboyle
51eb2c5dfc Make referencign the stencil and all info required to evaluate the kernel
accelerator marked up
2018-02-02 11:37:13 +00:00
paboyle
ede0dff794 Mark up as an accelerator function 2018-02-02 11:36:44 +00:00
paboyle
aa6de818e2 Copy data needed by Kernels out of the grid object to avoid host reference 2018-02-02 11:36:11 +00:00
paboyle
dcf6517a93 Accelerator offload and copy Opt into the kernel for GPU host var safety 2018-02-02 11:35:35 +00:00
paboyle
a308dff410 accelerator loop, copy Opt into the GPU 2018-02-02 11:34:37 +00:00
paboyle
14ba20898a Accelerator loop the key kernel call 2018-02-02 11:30:07 +00:00
paboyle
a53d3ee19a Add Opt to the lambda capture to get it into the GPU 2018-02-02 11:28:39 +00:00
paboyle
5df435319d Use constexpr 2018-02-02 11:27:56 +00:00
paboyle
0da2d3e222 accelerator off load some more stuff 2018-02-02 11:27:35 +00:00
paboyle
9c9dfbfa78 Force accelerator 2018-02-02 11:25:09 +00:00
paboyle
e4df025d01 Accelerator related 2018-02-01 23:20:05 +00:00
paboyle
cfeda9d536 constexpr on const ints 2018-02-01 22:59:12 +00:00
paboyle
4450b1993a Offload 2018-02-01 22:45:47 +00:00
paboyle
d03ce5c2a4 Provide a way to get around std::vector for a known type on device.
Use template specialisation to access a private member in the Clang++ STL implementation
2018-02-01 22:44:25 +00:00
paboyle
7d6522c1ef Accelerator inline 2018-02-01 22:43:56 +00:00
paboyle
b96832a922 Accelerator inline 2018-02-01 22:43:26 +00:00
paboyle
5d7af47b05 accelerator_inline 2018-02-01 22:42:54 +00:00
paboyle
053ef25c90 constexpr makes GPU happy 2018-02-01 22:42:29 +00:00
paboyle
8ae77d3706 Small simplification of FermionOperatorImpl towards GPU but not there yet 2018-02-01 22:41:54 +00:00
Peter Boyle
896f3a8002 Fix to MPI for Hokusai system 2018-02-01 18:51:51 +00:00
James Harrison
5f85473d6b QedFVol: Move Projection class into Result class 2018-02-01 16:16:13 +00:00
Daniel Richtmann
871649238c WilsonMG: Stricter naming for linear operators 2018-02-01 14:43:08 +01:00
James Harrison
ac3b0ebc58 QedFVol: New structure for ChargedProp output files 2018-02-01 12:31:32 +00:00
Daniel Richtmann
7c86d2085b WilsonMG: Some minor cleanup 2018-02-01 12:24:16 +01:00
Daniel Richtmann
9292be0b69 WilsonMG: Add check for Mdiag + Σ Mdir == M
Need to test my implementations of CoarsenedMatrix::Mdiag &
CoarsenedMatrix::Mdir.
2018-01-31 14:03:30 +01:00
Guido Cossu
f0fcdf75b5 Update README.md 2018-01-30 12:44:20 +01:00
Guido Cossu
53bffb83d4 Updating README with new SKL target 2018-01-30 12:42:36 +01:00
Daniel Richtmann
10141f90c9 WilsonMG: Rename test file 2018-01-30 10:25:09 +01:00
Guido Cossu
cd44e851f1 Fixing compilation error in FundtoHirep 2018-01-30 06:04:30 +01:00
Daniel Richtmann
a414430817 Merge remote-tracking branch 'upstream/develop' into feature/ddalphaamg 2018-01-29 18:32:31 +01:00
Daniel Richtmann
f20728baa9 WilsonMG: Some further steps towards a three level method
Currently this is very "manual" as we are still testing stuff. Will refactor
and make it an algorithm once everything works.

What currently does work:

  - All tests in MultiGridPreconditioner::runChecks for the first coarse grid
  - The tests for the intergrid operators going from the first to the second
    coarse grid
    - (1 - P R) v   == 0
    - (1 - R P) v_c == 0
  - A full solve with VPGCR and a two-level MG preconditioner

What hinders the rest of the tests from passing with a three-level method is the
absence of implementations of CoarsenedMatrix::Mdir and CoarsenedMatrix::Mdiag.
2018-01-29 18:29:49 +01:00
Daniel Richtmann
d2e68c4355 WilsonMG: Perform some minor cleanup 2018-01-29 18:07:10 +01:00
Daniel Richtmann
1cb745c8dc Add function to return full type as std::string
Also works for nested templates. I find it useful for debugging.
Possible usage:

std::cout << "getTypename<AType>() = " << getTypename<Atype>() << std::endl;
std::cout << "getTypename<decltype(AnInstance)>() = " << getTypename<decltype(AnInstance)>() << std::endl;
2018-01-29 17:39:19 +01:00
Daniel Richtmann
faf4278019 Use 2 passes of GS in coarse operator construction 2018-01-29 17:21:42 +01:00
Daniel Richtmann
194e4b94bb Make MG checking function work level-wise 2018-01-29 17:18:20 +01:00
Daniel Richtmann
bfc1411c1f Use more iterations in subspace creation 2018-01-29 17:11:29 +01:00
Daniel Richtmann
161637e573 Turn on orthogonality checking temporarily 2018-01-29 17:10:05 +01:00
paboyle
79b50feacf fixme updates 2018-01-29 16:00:40 +00:00
Guido Cossu
fb24e3a7d2 Adding utilities for perf profiling 2018-01-29 11:11:45 +01:00
Guido Cossu
655a69259a Added support for GCC compilation for Skylake AVX512 2018-01-28 17:02:46 +01:00
paboyle
c67c1544cd abs no compile on travis fix attempt 2018-01-28 10:26:04 +00:00
paboyle
e657f9a344 OMP collapse changes to make NVCC happy 2018-01-28 01:21:53 +00:00
paboyle
b6ebf35af5 Intel compiler doesn't like Nvidia error disable pragmas 2018-01-28 01:03:10 +00:00
paboyle
604c05f4b8 parallel_for elimination -> thread_loop 2018-01-28 01:01:36 +00:00
paboyle
70e276e1ab parallel_for elimination -> thread_loop 2018-01-28 01:01:14 +00:00
paboyle
9472b02771 Parallel_for elimination -> thread_loop. 2018-01-28 01:00:55 +00:00
paboyle
9597ab94eb Zero changes, swap on lattice type. 2018-01-27 23:51:40 +00:00
paboyle
ce4da83bc2 Zero changes, literally 2018-01-27 23:51:10 +00:00
paboyle
d557f3ef77 Zero changes (literally) and also a warning elimination 2018-01-27 23:50:43 +00:00
paboyle
f574c20118 Zero changes, __VA_ARGS__ and swap 2018-01-27 23:50:17 +00:00
paboyle
f102897385 VA_ARGS to make comma safe automatic 2018-01-27 23:49:47 +00:00
paboyle
d6fce3e498 Zero changes, literally 2018-01-27 23:48:01 +00:00
paboyle
2d0bcc2606 Zero changes, acceleartor on kernels and some thread loop changes 2018-01-27 23:47:38 +00:00
paboyle
45df59720e Zero changes and VA_ARGS changes 2018-01-27 23:46:58 +00:00
paboyle
44ef5bc207 Zero changes (literally speaking). 2018-01-27 23:46:28 +00:00
paboyle
98af36217a Zero changes. (I mean literally) 2018-01-27 23:46:02 +00:00
James Harrison
4e0cf0cc28 QedFVol: Fix bug in ScalarVP.cc due to double use of temporary object. Still getting mpi3 errors when configured with enable-comms=mpi[-auto]. 2018-01-27 15:15:25 +00:00
Guido Cossu
507c4e9efc Correcting an missing semicolumn in avx512 2018-01-27 10:59:55 +01:00
paboyle
be7b37b9c9 Mistake on openmp 2018-01-27 00:05:11 +00:00
paboyle
c4f82e072b _grid becomes private ; use Grid()§ 2018-01-27 00:04:12 +00:00
paboyle
3f9654e397 Hiding internals 2018-01-26 23:09:03 +00:00
paboyle
912b50f6fa Hiding lattice internals 2018-01-26 23:08:45 +00:00
paboyle
2a4a0e43c1 Hide internals 2018-01-26 23:08:27 +00:00
paboyle
32523a229c Hide internals 2018-01-26 23:08:02 +00:00
paboyle
1ebd56c3fb Hide internal data 2018-01-26 23:07:34 +00:00
paboyle
8dccffdfd5 Hide internal data 2018-01-26 23:06:51 +00:00
paboyle
5642ea270f Hide internal data 2018-01-26 23:06:28 +00:00
paboyle
43cea62855 Hide internal data 2018-01-26 23:06:03 +00:00
paboyle
2b4067bb71 Hide internal data 2018-01-26 23:05:32 +00:00
paboyle
85771e97e9 Hide internal data 2018-01-26 23:04:46 +00:00
paboyle
8b371ffa94 Hide internal data 2018-01-26 23:03:54 +00:00
paboyle
bf659dfd92 Hide the ._odata 2018-01-26 22:27:47 +00:00
James Harrison
cdf550845f QedFVol: Fix bugs in StochEm.cc and ChargedProp.cc (still only works without MPI). 2018-01-26 21:25:20 +00:00
James Harrison
3db7a5387b BROKEN: Adapted scalarVP, UnitEm and VPCounterTerms modules to new Hadrons. Currently getting an assertion error from Communicator_mpi3.cc when I try to run. 2018-01-26 16:33:48 +00:00
paboyle
76a4dd36d9 Fix no compile of test serialisation 2018-01-26 00:13:21 +00:00
paboyle
f4010023ca Warning fixes 2018-01-25 23:46:47 +00:00
paboyle
24a4589def Changes to interface a little 2018-01-25 23:37:34 +00:00
paboyle
c904822e74 Warning removal 2018-01-25 23:37:15 +00:00
paboyle
40ee1e1957 Zero() 2018-01-25 23:36:58 +00:00
paboyle
461df78a3f Better to use Zero(), and not zero static data 2018-01-25 23:36:22 +00:00
paboyle
db9c9475d4 const 2018-01-25 23:36:06 +00:00
paboyle
214f7a6f13 Drop std::vector container for the lattice data 2018-01-25 23:35:04 +00:00
paboyle
c844cfcda8 Remove commAllocator; make more simple; option to switch off the pointer caceh 2018-01-25 23:33:57 +00:00
paboyle
a3e3034e6f Host compile 2018-01-25 23:33:00 +00:00
paboyle
e7cba358c2 Temporary update to reflect the new dropping of std::vector in Lattice
Will update again to hide the internals in an interface
2018-01-25 23:31:41 +00:00
Guido Cossu
f8a5194c70 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-01-25 13:46:37 +01:00
Guido Cossu
cff3bae155 Adding support for general Nc in the benchmark outputs 2018-01-25 13:46:31 +01:00
James Harrison
90dffc73c8 Merge branch 'feature/hadrons' into feature/qed-fvol
# Conflicts:
#	extras/Hadrons/Modules.hpp
#	extras/Hadrons/Modules/MGauge/StochEm.cc
#	extras/Hadrons/Modules/MScalar/ChargedProp.cc
#	extras/Hadrons/Modules/MScalar/ChargedProp.hpp
#	extras/Hadrons/modules.inc
#	lib/communicator/Communicator_mpi.cc
2018-01-24 16:41:44 +00:00
paboyle
99329197ee Rename header to .h 2018-01-24 14:10:09 +00:00
paboyle
421401af55 Remove IMCI as really don't support 2018-01-24 13:53:21 +00:00
paboyle
0626c1e39e Accelerator flaggina dn thrust complex for NVCC 2018-01-24 13:50:41 +00:00
paboyle
725f03e2e2 Accelerator markup and thrust complex on nvcc 2018-01-24 13:50:10 +00:00
paboyle
65f77112e0 Thread loops done properly 2018-01-24 13:49:39 +00:00
paboyle
408b868475 Generic for GPU needs accelerator markup of functions 2018-01-24 13:49:12 +00:00
paboyle
1c797deb04 Accelerator tweaks 2018-01-24 13:43:43 +00:00
paboyle
b9d5a42b57 Should be able to eliminate the COMMA_SAFE with VA_ARGS trick ; revisit this file 2018-01-24 13:42:06 +00:00
paboyle
e737591918 Accelerator loops 2018-01-24 13:41:12 +00:00
paboyle
ba5ea5830b Acceleartor loops 2018-01-24 13:40:56 +00:00
paboyle
43f244badf Thread loops for now; figure out what can be GPU accelerated later here 2018-01-24 13:40:30 +00:00
paboyle
e9c8ba5ef7 Accelerator loosp 2018-01-24 13:39:54 +00:00
paboyle
d70709a8e8 Thread construct changes 2018-01-24 13:39:06 +00:00
paboyle
733f8ff0b2 Still using parallel_for -- don't know how to implement reduction on GPU yet. Look at some sample code is best. 2018-01-24 13:38:13 +00:00
paboyle
0bfa5bb213 Accelerator loosp 2018-01-24 13:37:26 +00:00
paboyle
1f26a234f9 CPU loops explicit for peek poke 2018-01-24 13:36:31 +00:00
paboyle
13f0116425 Accelerator loops 2018-01-24 13:35:55 +00:00
paboyle
25f589b064 Accelerator loops 2018-01-24 13:35:36 +00:00
paboyle
210c50a278 Accelerator prep work 2018-01-24 13:35:13 +00:00
paboyle
549a143e78 Accelerator related 2018-01-24 13:34:46 +00:00
paboyle
277301486d Simple warning elimination 2018-01-24 13:34:15 +00:00
paboyle
c851b39a49 Nicer way of including aggregate 2018-01-24 13:33:34 +00:00
paboyle
15cc12eb6c Delete the old non ET file 2018-01-24 13:33:07 +00:00
paboyle
ae4f1f8c12 New file, split out two from Lattice_reduction 2018-01-24 13:32:43 +00:00
paboyle
5609624b44 Threading constructs replaced 2018-01-24 13:32:24 +00:00
paboyle
b5a947dd79 Change to make NVCC happy 2018-01-24 13:32:02 +00:00
paboyle
ee16f62322 stray semicolon elimination. NVCC is picky, but eventually picked up these diags
with a pragma to suppress
2018-01-24 13:31:17 +00:00
paboyle
3318de27d6 Thread macro changes 2018-01-24 13:30:23 +00:00
paboyle
ac56965306 GPU changes and threading macros replaced 2018-01-24 13:28:30 +00:00
paboyle
8e99264f40 Accelerator mark up of entire tensore space for offload 2018-01-24 13:27:30 +00:00
paboyle
69327db9a9 Improviements for NVCC. Eigen is not compat with CUDA 9 and must hack to disable device
compilation
2018-01-24 13:25:07 +00:00
paboyle
7331ee2d80 Warnings control to overpower the NVCC compiler 2018-01-24 13:24:36 +00:00
paboyle
918c105c57 NVCC warning elimination 2018-01-24 13:23:59 +00:00
paboyle
be1511d469 Remove old macros for threading 2018-01-24 13:23:24 +00:00
paboyle
f1c31df9d2 updated Eigen version. Still didn't fix CUDA 9 no compile.
Worked around by switching off __NVCC__ during the include of Eigen and switching it
back on after. No Eigen code can be offloaded, note as a rsult of this. No harm done.
2018-01-24 13:19:29 +00:00
paboyle
ff7b587fad Ugly... nvcc needs -x cu to compile .cc as cuda.
Since CXXFLAGS is Also passed to linker, and -x cu breaks link phase must replace
CXX and CXXLD with nvcc -x cu and nvcc -link respectively.
2018-01-24 13:18:19 +00:00
paboyle
4e1135b214 Updated pugixml to v1.8; still didn't fix no compile under nvcc.
Turns out nvcc was right; must to an explicit template instantiation that was missing
but left gcc, icpc and clang happy for some reason.
Fix this.
2018-01-24 13:17:10 +00:00
paboyle
acd4955a18 remove rdtsc on __NVCC__ as may be device called 2018-01-24 13:16:18 +00:00
paboyle
bd08dc4f45 Pragma use for nvcc, warning elimination. 2018-01-24 13:15:43 +00:00
paboyle
22d137d4e5 Namespace, nvcc warning elimination. 2018-01-24 13:14:43 +00:00
paboyle
87ee592176 Pragma changes and layout and warning elimination for nvcc 2018-01-24 13:14:09 +00:00
paboyle
063603b1ea Warning elimination 2018-01-24 13:12:14 +00:00
paboyle
f292106db6 Split out pragms from threads.h;
More work needed; renam threads directory to "parallelism" or something like that
2018-01-24 13:11:04 +00:00
paboyle
9d08aebea9 Compile through nvcc ; warning elimination fixes 2018-01-24 13:09:53 +00:00
paboyle
4e30739093 First compile OK through nvcc on host 2018-01-24 13:08:47 +00:00
a1151fc734 Hadrons: MPI-safe serial IO 2018-01-23 17:26:50 +00:00
James Harrison
ab3baeb38f Implement contractions and data output in functions; calculate diagrams S, X and 4C separately; output 2E and 2T instead of sunset_shifted, sunset_unshifted, tadpole_shifted, tadpole_unshifted; add comments. 2018-01-23 17:07:45 +00:00
Vera Guelpers
389731d373 changed SeqConservedSummed.hpp to work with new hadrons interface 2018-01-23 10:11:33 +00:00
6e3ce7423e Hadrons: don't display module list at startup (too long) 2018-01-22 20:04:05 +00:00
15f15a7cfd Merge branch 'develop' into feature/hadrons
# Conflicts:
#	extras/Hadrons/Modules.hpp
#	extras/Hadrons/modules.inc
2018-01-22 20:03:36 +00:00
0e5f626226 Hadrons: module for scalar operator divergence 2018-01-22 19:38:19 +00:00
Daniel Richtmann
04f92ccddf WilsonMG: Provide a fix for the previous commit; compiles and runs successfully now
I don't like the solution with the temporary very much though ...
2018-01-22 14:56:48 +01:00
Daniel Richtmann
3b2d805398 WilsonMG: Some first steps towards coarse spin dofs; not compiling yet
A failing conversion from the innermost type (Grid::Simd<...>) to a coarse
scalar (triple iScalar) in blockPromote prohibits this commit from working.
2018-01-22 12:45:51 +01:00
Azusa Yamaguchi
97b9c6f03d No option for interior/exterior split of asm kernels since different directions get interleaved 2018-01-22 11:04:19 +00:00
Azusa Yamaguchi
63982819c6 No option to overlap comms and compute for asm implementation since different directions are interleaved
in the kernels, introducing if else structure would be too painful
2018-01-22 11:03:39 +00:00
Vera Guelpers
6fec507bef merged new hadrons interface 2018-01-22 10:09:20 +00:00
James Harrison
219b3bd34f Remove freeVpTensor object 2018-01-19 17:14:11 +00:00
Daniel Richtmann
9dc885d297 Fix a bug in Wilson MG
The calculation of the lattice size of a second coarse level was incorrect.
2018-01-18 17:02:04 +01:00
Daniel Richtmann
a70c1feecc Remove some unnecessary stuff in Wilson MG 2018-01-18 15:48:28 +01:00
Daniel Richtmann
38328100c9 Implement correctness checks for Wilson MG 2018-01-18 15:43:15 +01:00
Daniel Richtmann
9732519c41 Apply clang-format to Wilson MG
I can provide the configuration file I used if people want that.
2018-01-18 15:14:37 +01:00
Daniel Richtmann
fa4eeb28c4 Save current state in Wilson MG test file 2018-01-17 17:56:34 +01:00
Guido Cossu
b00d2d2c39 Correction of Representations compilation and small compilation error for Intel 17 2018-01-17 13:46:12 +00:00
Guido Cossu
f1b3e21830 Merge branch 'feature/clover' into develop 2018-01-17 10:07:42 +00:00
paboyle
90ea472411 Auto emacs format C++ no namespace indent 2018-01-15 11:44:54 +00:00
paboyle
56999474e2 Indent 2018-01-15 11:44:45 +00:00
paboyle
d74c21a386 GLobal edit for QCD namespace removal & NAMESPACE macros 2018-01-15 09:37:58 +00:00
paboyle
ca6bdd7302 Useful drive to emacs C++ mode 2018-01-15 00:24:41 +00:00
paboyle
6f20f1d224 Namespace 2018-01-15 00:24:20 +00:00
paboyle
d0e357ef89 CLeanup and no QCD namespace 2018-01-15 00:23:51 +00:00
paboyle
21251f2e1b Namespace and formatting changes 2018-01-15 00:21:27 +00:00
paboyle
fcf1ccf669 Namespace, indent, badly formatted 2018-01-15 00:17:58 +00:00
paboyle
49cce514f1 Namespace 2018-01-15 00:17:11 +00:00
paboyle
695af98a1d Namespace, indent, tidy 2018-01-15 00:16:13 +00:00
paboyle
f8cb46d360 Namspace, indent, badly formatted code fixed 2018-01-15 00:14:47 +00:00
paboyle
0da64dea90 Namespace, indent 2018-01-15 00:13:32 +00:00
paboyle
2cceebbf12 Namespace, indent 2018-01-15 00:12:20 +00:00
paboyle
40232dcefe Namespce 2018-01-15 00:11:19 +00:00
paboyle
dbd86bb95b CLeanup, namespace, indent 2018-01-15 00:10:11 +00:00
paboyle
b8fd2c161f Indent, namespace 2018-01-15 00:09:33 +00:00
paboyle
df9b979583 Indent, namespace 2018-01-15 00:08:40 +00:00
paboyle
23ef0e3e19 Namespace and indentation 2018-01-15 00:07:46 +00:00
paboyle
ae9175735a Indentation, Namespace 2018-01-15 00:07:10 +00:00
paboyle
2d13ea1a22 Namespace and indentation emacs choices 2018-01-15 00:05:55 +00:00
paboyle
8c675064bd Namespace and indentation 2018-01-15 00:04:43 +00:00
paboyle
550b905bb8 Namespace nd indentation 2018-01-15 00:03:49 +00:00
paboyle
edb79dc088 Namespce,and indent 2018-01-15 00:02:33 +00:00
paboyle
88e635c5d1 Namepscae, format 2018-01-15 00:02:01 +00:00
paboyle
ecb4a24de8 Namespace 2018-01-15 00:01:25 +00:00
paboyle
c8c1d36710 Namespace, indent 2018-01-15 00:00:52 +00:00
paboyle
b4bb428d9b Namespace, indent 2018-01-14 23:59:57 +00:00
paboyle
e9ef7e3852 Namespace, indent 2018-01-14 23:59:23 +00:00
paboyle
31cbbfc07e Namespace, indent 2018-01-14 23:58:44 +00:00
paboyle
4eb0552d1d Namespace, indnet 2018-01-14 23:58:03 +00:00
paboyle
08f2a4564f Namespace, formatting 2018-01-14 23:56:33 +00:00
paboyle
7e00f643f8 Namespace indent 2018-01-14 23:55:44 +00:00
paboyle
c19ccdad7c Namespace, indent 2018-01-14 23:55:07 +00:00
paboyle
8aed4181e1 Namespace, indent 2018-01-14 23:54:25 +00:00
paboyle
06ab7f5661 Namespace 2018-01-14 23:53:31 +00:00
paboyle
645ec8eba0 Namespace 2018-01-14 23:52:26 +00:00
paboyle
72ffa8a88e Namespace 2018-01-14 23:51:38 +00:00
paboyle
4c829b410e Namespace 2018-01-14 23:50:20 +00:00
paboyle
eda4fd9912 Namespace 2018-01-14 23:49:11 +00:00
paboyle
041d9137c0 Namespace 2018-01-14 23:48:27 +00:00
paboyle
eeacdfe031 Namespace 2018-01-14 23:47:37 +00:00
paboyle
e5535f4d72 Namespace, indent 2018-01-14 23:46:51 +00:00
paboyle
044a292281 Namespace, indnet 2018-01-14 23:46:07 +00:00
paboyle
fe0467df1e Namespace, indenting 2018-01-14 23:45:19 +00:00
paboyle
19234fb40e Namespace, format 2018-01-14 23:44:16 +00:00
paboyle
f445257d28 Namespace, indenting 2018-01-14 23:43:36 +00:00
paboyle
bdc2a987aa Namespace, indent 2018-01-14 23:42:47 +00:00
paboyle
72acb0e48f Namespace, indent 2018-01-14 23:41:59 +00:00
paboyle
b4e9211df7 Namespace, indent 2018-01-14 23:40:38 +00:00
paboyle
97019d2997 Namespace, format 2018-01-14 23:39:57 +00:00
paboyle
83c5f05094 Namespace, indent 2018-01-14 23:39:13 +00:00
paboyle
1619e42d90 Indent and Namespace changes 2018-01-14 23:38:25 +00:00
paboyle
9f6cebe5ff Namespace and format changes 2018-01-14 23:37:40 +00:00
paboyle
a84ebe5624 Namespace, format change 2018-01-14 23:36:45 +00:00
paboyle
c527e39881 Namespace, format indent change 2018-01-14 23:36:07 +00:00
paboyle
a0f4687887 Namespace, formatting indent changes 2018-01-14 23:35:16 +00:00
paboyle
3ef7b2389e Format eamcs style after NAMESPCCE change 2018-01-14 23:34:08 +00:00
paboyle
7dfa3d0b50 Namespace, format 2018-01-14 23:33:16 +00:00
paboyle
bf629dddce Namespace, format improved 2018-01-14 23:32:19 +00:00
paboyle
7747b95430 Namespace, formatting emacs style 2018-01-14 23:31:28 +00:00
paboyle
ccd75c039a Namespace, fmt 2018-01-14 23:30:34 +00:00
paboyle
493ea80208 Namespace 2018-01-14 23:29:53 +00:00
paboyle
229baf3aba Namespace, emacs fmt 2018-01-14 23:29:02 +00:00
paboyle
0ce4ecfc84 Emacs format indent 2018-01-14 23:28:12 +00:00
paboyle
ddfaae8ea6 Namespace 2018-01-14 23:27:49 +00:00
paboyle
70c5b781e5 Namespace, clean up 2018-01-14 23:26:41 +00:00
paboyle
901e359d28 Namespace changes; need to simplify the EOFA as too many cases and duplicated from Mobius 2018-01-14 23:25:51 +00:00
paboyle
e857d4d4c8 Namespace, indent 2018-01-14 23:24:51 +00:00
paboyle
e5b77c7fd8 Namespace, indent 2018-01-14 23:24:06 +00:00
paboyle
3b5d629048 Namespace, format 2018-01-14 23:23:26 +00:00
paboyle
08772d5e0c Namespace, indent 2018-01-14 23:22:42 +00:00
paboyle
017dcd69a6 Namespace, indent 2018-01-14 23:21:40 +00:00
paboyle
8178a17b88 Namespace, indent 2018-01-14 23:20:55 +00:00
paboyle
c5c1b53e54 Namespace, indent 2018-01-14 23:20:08 +00:00
paboyle
440f9e2013 Namespace, indent 2018-01-14 23:19:22 +00:00
paboyle
c98657d588 Namespace 2018-01-14 23:18:46 +00:00
paboyle
f450857716 Namespce, indent 2018-01-14 23:17:33 +00:00
paboyle
9ec238df9e Namespace, indent 2018-01-14 23:16:49 +00:00
paboyle
3ba8eb1500 Namespace, indent 2018-01-14 23:16:08 +00:00
paboyle
8da49c5a34 Namespace 2018-01-14 23:15:26 +00:00
paboyle
e04f61b1fa Namespace 2018-01-14 23:14:46 +00:00
paboyle
115e13b227 Namespace 2018-01-14 23:13:49 +00:00
paboyle
75f3062a80 Think this should move to the algorithms directory 2018-01-14 23:12:14 +00:00
paboyle
b460cd3ef1 Namespace, format 2018-01-14 23:11:24 +00:00
paboyle
0e6727a33b Namespace, format; possibly some conflict with Azusa beware 2018-01-14 23:10:21 +00:00
paboyle
4c6745cb4c Namespace 2018-01-14 23:09:44 +00:00
paboyle
efdd0e572c Namespace 2018-01-14 23:09:10 +00:00
paboyle
ca60a218ac Namespace 2018-01-14 23:08:35 +00:00
paboyle
03633d709e Namespace 2018-01-14 23:07:36 +00:00
paboyle
4de58c4aab Namespace 2018-01-14 23:06:47 +00:00
paboyle
4f8b1c1940 Namespace 2018-01-14 23:05:23 +00:00
paboyle
dec39b313d Namespace and format 2018-01-14 23:04:37 +00:00
paboyle
dc835ad1cb Namespace 2018-01-14 23:03:49 +00:00
paboyle
71c8c9e4fb Pretty 2018-01-14 23:03:01 +00:00
paboyle
a935ef7b39 Namespace 2018-01-14 23:01:07 +00:00
paboyle
a97ad1a51d Namespce 2018-01-14 23:01:01 +00:00
paboyle
5ab9129db3 Namespace 2018-01-14 22:58:42 +00:00
paboyle
634943c11f Namepsace 2018-01-14 22:57:59 +00:00
paboyle
e598e65f69 Namespace 2018-01-14 22:57:10 +00:00
paboyle
291407dc7f Namespace 2018-01-14 22:54:42 +00:00
paboyle
641a28aa1d Namespace 2018-01-14 22:53:50 +00:00
paboyle
75207fa010 FOrmat 2018-01-14 22:53:13 +00:00
paboyle
c2b0e0269a Namespace 2018-01-14 22:52:22 +00:00
paboyle
7828887604 Namespace, indent 2018-01-14 22:51:18 +00:00
paboyle
e6efc93a7c Namespace 2018-01-14 22:50:35 +00:00
paboyle
ff7e773d5e Namesapce 2018-01-14 22:49:48 +00:00
paboyle
a0380fad72 Namespace 2018-01-14 22:48:57 +00:00
paboyle
61e9a33777 Namesapce 2018-01-14 22:48:08 +00:00
paboyle
3e139b52d3 Namespace 2018-01-14 22:47:24 +00:00
paboyle
fd6031b005 Namespace 2018-01-14 22:46:17 +00:00
paboyle
fe44fc50d9 Namespace 2018-01-14 22:45:29 +00:00
paboyle
2dd88cf3f8 Namespace 2018-01-14 22:44:41 +00:00
paboyle
6b7e82f1a9 Namespace, indentation 2018-01-14 22:44:06 +00:00
paboyle
be612b3931 Namespace, indentation 2018-01-14 22:43:27 +00:00
paboyle
f5e74033f9 Namespace 2018-01-14 22:42:31 +00:00
paboyle
8d52e0a349 Namespace 2018-01-14 22:41:23 +00:00
paboyle
a60f6d353e Namespace 2018-01-14 22:40:29 +00:00
paboyle
5d3b574325 Missing banner; should recreate globally 2018-01-14 22:39:24 +00:00
paboyle
6ee5ea6b32 Namespace QCD gone 2018-01-14 22:38:22 +00:00
paboyle
cc349c6512 Namespace 2018-01-14 22:36:59 +00:00
paboyle
fde2e07bf4 Namespace 2018-01-14 22:36:15 +00:00
paboyle
2f38fe8d45 Namespace 2018-01-14 22:35:24 +00:00
paboyle
813af84ae8 Format emacs C++ mode 2018-01-14 22:34:12 +00:00
paboyle
cfe6c6838f Namespace 2018-01-14 22:33:18 +00:00
paboyle
12a7216dfe Namespace 2018-01-14 22:32:29 +00:00
paboyle
71ebd61327 Namespace 2018-01-14 22:31:39 +00:00
paboyle
2c2da60cc2 Namespace 2018-01-14 22:30:54 +00:00
paboyle
7631ed9c56 Namespace 2018-01-14 22:30:09 +00:00
paboyle
65669b116e Namespace 2018-01-14 22:29:18 +00:00
paboyle
ae2a6cfc6e Namespace 2018-01-14 22:27:32 +00:00
paboyle
c36223055e Namespace 2018-01-14 22:26:55 +00:00
paboyle
e42de105c5 Namespace 2018-01-14 22:26:11 +00:00
paboyle
b08dae0809 Namespace 2018-01-14 22:25:29 +00:00
paboyle
3bf8fddbb5 Namespace 2018-01-14 22:24:47 +00:00
paboyle
d29fa23ebc Namespace 2018-01-14 22:23:49 +00:00
paboyle
c978c88521 Namespace 2018-01-14 22:22:27 +00:00
paboyle
93f09818da Namespace 2018-01-14 22:21:40 +00:00
paboyle
54a8ea93ec Namespace QCD gone 2018-01-14 22:20:42 +00:00
paboyle
56e87d6e55 Namespace 2018-01-14 22:19:25 +00:00
paboyle
df29cc19ab Namespace 2018-01-14 22:18:27 +00:00
paboyle
e61189db3f Namespace 2018-01-14 22:17:43 +00:00
paboyle
361ce948c3 Namespace 2018-01-14 22:16:33 +00:00
paboyle
049b4a4631 Namespace 2018-01-14 22:15:55 +00:00
paboyle
9f2f294a27 Namespace 2018-01-14 22:14:58 +00:00
paboyle
81dcd0e6ea Namespace 2018-01-14 22:13:46 +00:00
paboyle
34a788331f Namespace 2018-01-14 22:13:02 +00:00
paboyle
e2c39945b3 Namespace 2018-01-14 22:11:03 +00:00
paboyle
1591d391b9 Namespace 2018-01-14 22:09:42 +00:00
paboyle
f4c06ed8c0 Namespace 2018-01-14 22:08:25 +00:00
paboyle
1f49f781bf Namespace 2018-01-14 22:07:27 +00:00
paboyle
3a9f746421 Namespace 2018-01-14 22:06:01 +00:00
paboyle
4491d87766 Namespace 2018-01-14 22:04:21 +00:00
paboyle
0e080a7abc Namespace 2018-01-14 22:03:14 +00:00
paboyle
8bf78846ee Namespace 2018-01-14 22:02:09 +00:00
paboyle
9aa34dc803 Namespace 2018-01-14 22:01:17 +00:00
paboyle
fdcbe0a0d1 Namespace 2018-01-14 22:00:29 +00:00
paboyle
6a62a9c6a5 Namespace 2018-01-14 21:59:48 +00:00
paboyle
b331ecea78 Namespace 2018-01-14 21:58:47 +00:00
paboyle
66f8a2f082 Namespace 2018-01-14 21:57:46 +00:00
paboyle
d58b7cf9b9 Namespace changes 2018-01-14 21:56:55 +00:00
paboyle
0d749becff Namespace 2018-01-14 21:55:47 +00:00
paboyle
1dbea9aa69 Namespace 2018-01-14 21:54:28 +00:00
paboyle
c1438cbbe3 Namespace 2018-01-14 21:53:39 +00:00
paboyle
f4623fd551 Namespace 2018-01-14 21:53:05 +00:00
paboyle
59ba9ff3bb NAMESPACE & format 2018-01-14 21:52:27 +00:00
paboyle
1fbab4032b Namespace changes 2018-01-14 21:51:19 +00:00
paboyle
c037244874 Tensor reformatted with NAMESPACE too 2018-01-13 00:31:02 +00:00
paboyle
f4272aa6fd Clean up 2018-01-13 00:19:19 +00:00
paboyle
8cb7a1a887 Format 2018-01-13 00:17:16 +00:00
paboyle
b45bd8e097 NAMESPACE 2018-01-13 00:16:34 +00:00
paboyle
5e48b701ec FOrmatting 2018-01-13 00:11:53 +00:00
paboyle
7f6bffe5ad NAMESPACE 2018-01-13 00:11:30 +00:00
paboyle
6bf5fb1924 Clean up and format NAMESPACE 2018-01-13 00:08:25 +00:00
paboyle
086db7bd19 NAMESPACE and reformat 2018-01-13 00:05:33 +00:00
paboyle
c0a9b38c02 C++ NAMESPACE format emacs happy 2018-01-13 00:03:57 +00:00
paboyle
6d7bdfb5f5 Emacs happy 2018-01-13 00:02:53 +00:00
paboyle
be5d70ae6e C++ happy 2018-01-13 00:02:10 +00:00
paboyle
ab1068044e C++ emacs happy 2018-01-13 00:01:58 +00:00
paboyle
dda151250f Emacs format 2018-01-12 23:59:58 +00:00
paboyle
18daf85069 Emacs format 2018-01-12 23:58:23 +00:00
paboyle
81cc28f6ca Format 2018-01-12 23:57:22 +00:00
paboyle
c01a1e02fe Namespace, format 2018-01-12 23:55:38 +00:00
paboyle
7e70f4ed9c Format, NAMESPACE 2018-01-12 23:55:03 +00:00
paboyle
1056e36f11 Format, NAMESPACE 2018-01-12 23:49:46 +00:00
paboyle
0b8a88978b Format, NAMESPACE 2018-01-12 23:47:24 +00:00
paboyle
59b31b6bb8 Format, NAMESPACE 2018-01-12 23:43:44 +00:00
paboyle
69496482fc Format, NAMESPACE 2018-01-12 23:42:22 +00:00
paboyle
4be31ad1f6 C++ indentation 2018-01-12 23:39:49 +00:00
paboyle
176a021ce9 Formatting, NAMESPACE§ 2018-01-12 23:38:15 +00:00
paboyle
b673174b71 FOrmat, NAMESPACE 2018-01-12 23:29:22 +00:00
paboyle
e6f7a5a818 Namespace 2018-01-12 23:28:01 +00:00
paboyle
68b69a2ac0 Namespace management 2018-01-12 23:26:14 +00:00
paboyle
bd15c38ae8 Formatting emacs compliant 2018-01-12 23:25:02 +00:00
paboyle
b815f5f764 Formatting 2018-01-12 23:23:21 +00:00
paboyle
4da437431e Reformat 2018-01-12 23:22:46 +00:00
paboyle
3c7bf211a9 Reformat 2018-01-12 23:22:18 +00:00
paboyle
347d5404dd format 2018-01-12 23:21:25 +00:00
paboyle
5e2cd0d07c Format 2018-01-12 23:18:22 +00:00
paboyle
62fcee72c5 Format, NAMESPACE 2018-01-12 23:16:37 +00:00
paboyle
0a6168eef0 Format emacs style 2018-01-12 23:11:22 +00:00
paboyle
63865e4232 format 2018-01-12 23:10:48 +00:00
paboyle
c64deedf74 Format 2018-01-12 23:09:35 +00:00
paboyle
3281559ec3 Format 2018-01-12 23:09:01 +00:00
paboyle
6a2eca2ec2 NAMESAPCE 2018-01-12 23:00:03 +00:00
paboyle
d8ff895e74 NAMESPACE and format 2018-01-12 18:27:22 +00:00
paboyle
00c49d4c17 Format 2018-01-12 18:25:39 +00:00
paboyle
ec89714cce NAMESPACE 2018-01-12 18:24:16 +00:00
paboyle
6ab744c720 NAMESPACE and formatting 2018-01-12 18:11:04 +00:00
paboyle
bbb657da5c NAMESPACE and formatting 2018-01-12 18:10:11 +00:00
paboyle
fbc2380cb8 NAMESPACE & format 2018-01-12 18:05:36 +00:00
paboyle
08682c5461 NAMESPACE and format to my liking 2018-01-12 18:03:57 +00:00
paboyle
13bce2a6bf NAMESPACE 2018-01-12 17:58:53 +00:00
paboyle
70e689900b NAMESPACE 2018-01-12 17:58:13 +00:00
Guido Cossu
b7f8c5b823 Modify test to merge with the new Lanczos interface 2018-01-12 14:38:27 +00:00
Guido Cossu
3923683e9b Updating the feature/clover branch with the newest Hadron package 2018-01-12 13:35:51 +00:00
Guido Cossu
e199fda9dc Merge pull request #136 from pretidav/feature/clover
Feature/clover
2018-01-12 11:57:08 +00:00
7bb405e790 Merge branch 'develop' into feature/hadrons
# Conflicts:
#	lib/communicator/Communicator_mpi3_leader.cc
#	lib/communicator/Communicator_shmem.cc
2018-01-11 18:50:15 +00:00
Daniel Richtmann
10f7a17ae4 Make timing in VPGCR more detailed 2018-01-11 13:42:18 +01:00
Daniel Richtmann
26f14d7dd7 Adapt output format of non-herm solvers to the one of VPGCR 2018-01-11 13:36:30 +01:00
ec16eacc6a Hadrons: scalar SU(N) 2-pt function 2018-01-10 22:12:21 +00:00
pretidav
cf858deb16 Lanczos with 2 reps fixed (tobe tested) 2018-01-10 18:43:02 +01:00
David Preti
a3affac963 SU3 restored + output filename for mesons and baryons fixed. 2018-01-10 14:56:54 +01:00
d9d1f43ba2 Hadrons: code cleaning 2018-01-10 11:31:24 +00:00
b7cd721308 Hadrons: scalar SU(N) tr(mag^n) 2018-01-10 11:25:59 +00:00
29f026c375 Hadrons: scalar SU(N) tr(phi^n) 1-pt function 2018-01-10 11:01:03 +00:00
58c7a13d54 Hadrons: result file macro with trajectory number 2018-01-10 10:59:58 +00:00
Azusa Yamaguchi
24162c9ead Staggered overlap comms comput 2018-01-09 13:02:52 +00:00
Daniel Richtmann
73434db636 Merge remote-tracking branch 'upstream/develop' into feature/ddalphaamg 2018-01-09 10:43:33 +01:00
paboyle
e564d11687 Allow resize of the shared memory buffers 2018-01-08 15:20:26 +00:00
paboyle
0b2162f375 Clean up 2018-01-08 14:06:53 +00:00
paboyle
5610570182 Synthetic test of lanczos 2018-01-08 11:36:39 +00:00
paboyle
44f65526e0 Simplify communicators 2018-01-08 11:35:43 +00:00
paboyle
43e48542ab Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2018-01-08 11:34:45 +00:00
paboyle
0b85f1bfc8 Simplify the communicator proliferation: mpi and none. 2018-01-08 11:33:47 +00:00
paboyle
9947cfbf14 Simplify number of communicator cases 2018-01-08 11:33:01 +00:00
paboyle
357badce5e Simplify communicator case proliferation 2018-01-08 11:32:16 +00:00
paboyle
0091eec23a Simplify communicator cases 2018-01-08 11:31:32 +00:00
paboyle
9e9c2962df Simplify comms layer proliferation 2018-01-08 11:30:22 +00:00
paboyle
bda97212a9 Simplify proliferation of comms layers 2018-01-08 11:29:20 +00:00
paboyle
b91282ad46 Simplify comms layer proliferation 2018-01-08 11:28:52 +00:00
paboyle
0a68470f9a Simplify comms layers 2018-01-08 11:28:30 +00:00
paboyle
6ecf280723 Simplify comms layer proliferation 2018-01-08 11:28:04 +00:00
paboyle
7eeab7f995 Simplify comms layers 2018-01-08 11:27:43 +00:00
paboyle
9b32d51cd1 Simplify comms layer proliferatoin 2018-01-08 11:27:14 +00:00
paboyle
7b3ed160aa Rationalise MPI options 2018-01-08 11:26:48 +00:00
paboyle
1a0163f45c Updated to do list 2018-01-08 11:26:11 +00:00
Daniel Richtmann
c6411f8514 Merge remote-tracking branch 'upstream/develop' into feature/ddalphaamg 2018-01-08 10:37:10 +01:00
David Preti
9028e278e4 Trying to fix a bug with SU4 mesons (still under investigation) 2018-01-06 15:57:38 +01:00
dd62f2f371 Hadrons: log message fix 2017-12-29 16:58:44 +01:00
0d612039ed Hadrons: prettier Grid logging (non-intrusive) 2017-12-29 16:58:23 +01:00
e8ac75055c Hadrons: binary configuration loader 2017-12-27 14:24:29 +01:00
8b30c5956c Hadrons: copyright update 2017-12-26 14:16:47 +01:00
185da83454 Hadrons: new MIO module namespace, NERSC loader moved there 2017-12-26 14:05:17 +01:00
6718fa8c4f Merge branch 'feature/scalar_adjointFT' into feature/hadrons 2017-12-26 12:59:33 +01:00
pretidav
4ce63af7d5 Working on Hadrons with Hirep. (QCD is set for SU4) 2017-12-22 19:02:07 +01:00
Daniel Richtmann
6cf635d61c Remove some old code in Wilson MG 2017-12-22 13:20:09 +01:00
Daniel Richtmann
39558cce52 Multiply TVs in Wilson MG with G5 instead of G5R5 2017-12-22 13:07:56 +01:00
Vera Guelpers
935cd1e173 conserved current insertion summed over Lorentzindex 2017-12-22 11:38:45 +00:00
Vera Guelpers
55e39df30f tadpole insertion for DWF 2017-12-22 11:36:31 +00:00
67c3fa0f5f Hadrons: all modules are now ported, more tests need to be done 2017-12-21 11:39:07 +00:00
65d4f17976 Hadrons: no errors when trying to recreate a cache 2017-12-19 20:28:32 +00:00
e2fe97277b Hadrons: getReference use is rare, empty by default 2017-12-19 20:28:04 +00:00
Guido Cossu
84f9c37ed4 Merge branch 'feature/scalar_adjointFT' of https://github.com/paboyle/Grid into feature/scalar_adjointFT 2017-12-19 15:43:55 +00:00
bcf6f3890c Hadrons: more fixes after test 2017-12-14 21:14:10 +00:00
591a38c487 Hadrons: VM fixes 2017-12-14 19:42:16 +00:00
James Harrison
581be32ed2 Implement infrared improvement for v=0 on-shell self-energy 2017-12-14 13:42:41 +00:00
842754bea9 Hadrons: most modules ported to the new interface, compiles but untested 2017-12-13 19:41:41 +00:00
James Harrison
6bc136b1d0 Add module for calculating diagrams required for HVP counter-terms 2017-12-13 17:31:01 +00:00
0887566134 Hadrons: scheduler back! 2017-12-13 16:36:15 +00:00
61fc50d616 Hadrons: better organisation of the VM 2017-12-13 13:44:23 +00:00
a9c8d7dad0 Hadrons: code cleaning 2017-12-13 12:13:40 +00:00
259d504ef0 Hadrons: first full implementation of the module memory profiler 2017-12-12 19:32:58 +00:00
f3a77f4b7f Merge branch 'feature/hadrons' into feature/hadrons-new-memory-model 2017-12-12 14:05:23 +00:00
26d7b829a0 Hadrons: error managed through expections 2017-12-12 14:04:28 +00:00
64161a8743 Hadrons: much simpler reference dependency 2017-12-12 13:08:01 +00:00
2401360784 Merge pull request #138 from guelpers/feature/hadrons
bug fix in sequential insertion of conserved vector current
2017-12-11 18:53:41 +01:00
Vera Guelpers
2cfb50cbe5 bug fix in sequential insertion of conserved vector current 2017-12-08 11:13:39 +00:00
f9aa39e1c4 global memory debug through command line flag 2017-12-07 14:40:58 +01:00
Daniel Richtmann
df152648d6 Fix error in MR code when compiling for single precision 2017-12-06 18:00:58 +01:00
0fbf445edd Hadrons: object creation that get properly captured by the memory profiler 2017-12-06 16:51:48 +01:00
e78794688a memory profiler improvement 2017-12-06 16:50:25 +01:00
9e31307963 Merge branch 'feature/hadrons' into feature/hadrons-new-memory-model 2017-12-06 16:49:32 +01:00
29e2eddea8 Merge branch 'develop' into feature/hadrons-new-memory-model 2017-12-06 16:49:21 +01:00
0a038ea15a Merge branch 'develop' into feature/hadrons 2017-12-06 16:49:10 +01:00
62eb1f0e59 FermionOperator virtual destructor needed for polymorphism 2017-12-06 16:48:17 +01:00
5422251959 Hadrons: execution part moved in a new virtual machine class 2017-12-05 15:31:59 +01:00
paboyle
9579c9c327 Threading improvement 2017-12-05 14:12:22 +00:00
paboyle
3729c7a7a6 Clean up of test 2017-12-05 13:07:31 +00:00
paboyle
c24d4c8d0e Improved parallel RNG init 2017-12-05 13:01:10 +00:00
paboyle
a14038051f Improved AllToAll asserts 2017-12-05 11:43:25 +00:00
paboyle
3e560b9462 Faster RNG init 2017-12-05 11:42:05 +00:00
paboyle
d93c6760ec Faster code for split unsplit 2017-12-05 11:39:26 +00:00
paboyle
ae3b7713a9 Cold start doesnt need RNG 2017-12-05 11:36:31 +00:00
cbd8fbe771 Merge branch 'feature/hadrons' into feature/hadrons-new-memory-model 2017-12-03 19:48:56 +01:00
d391f05cb7 Merge branch 'develop' into feature/hadrons 2017-12-03 19:48:46 +01:00
3127b52c90 bootstrap script does not destroy Eigen is working offline 2017-12-03 19:48:34 +01:00
01f00385a4 Hadrons: genetic pair selection based on exponential probability 2017-12-03 19:47:40 +01:00
59aae5f5ec Hadrons: garbage collector clean temporaries 2017-12-03 19:47:11 +01:00
624246409c Hadrons: module setup/execute protected to forbid user to bypass execution control 2017-12-03 19:46:18 +01:00
2a9ebddad5 Hadrons: scheduler offline, minimal code working again 2017-12-03 19:45:15 +01:00
ff7afe6e17 Merge branch 'feature/hadrons' into feature/hadrons-new-memory-model 2017-12-01 19:45:44 +00:00
33cb509d4b Merge branch 'develop' into feature/hadrons 2017-12-01 19:45:32 +00:00
456c78c233 Merge branch 'develop' into feature/hadrons-new-memory-model 2017-12-01 19:45:12 +00:00
2fd4989029 Merge branch 'develop' of github.com:paboyle/Grid into develop 2017-12-01 19:44:31 +00:00
2427a21428 minor serial IO fixes, XML now issues warning when trying to read absent nodes, these becomes 2017-12-01 19:44:07 +00:00
514993ed17 Hadrons: progress on the interface, genetic algorithm freezing 2017-12-01 19:38:23 +00:00
David Murphy
7a0c9e84f8 Fix HDF5 src 2017-11-29 18:03:03 -05:00
David Murphy
caf1a3c85d Also add HDF5 src 2017-11-29 17:58:02 -05:00
David Murphy
21ca730a49 Also import some dependcies 2017-11-29 17:34:40 -05:00
David Murphy
c6cd27e9b2 Also import Eigen.inc 2017-11-29 17:26:20 -05:00
David Murphy
6068411d61 Remove Eigen from gitignore 2017-11-29 17:24:40 -05:00
Daniel Richtmann
4e965c168e Implement analogon to test vector analysis in WMG codebase 2017-11-29 15:05:27 +01:00
Daniel Richtmann
f260af546e Save current state 2017-11-28 15:03:02 +01:00
paboyle
28ceacec45 Split/Unsplit working 2017-11-27 15:13:29 +00:00
paboyle
e6a3e375cf Debug 2017-11-27 15:10:22 +00:00
paboyle
4987edbd44 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-11-27 12:34:56 +00:00
paboyle
ad140bb6e7 Clean on multinode target after split 1 1 2 4 -> 1 1 2 2 2017-11-27 12:34:25 +00:00
paboyle
1f04e56038 Believe split/unsplit works, but need to make pretty 2017-11-27 12:33:08 +00:00
paboyle
4bfc8c85c3 Clean up verbose communicator create 2017-11-27 12:32:37 +00:00
azusayamaguchi
e55397bc13 Staggerd cg 2017-11-24 14:18:30 +00:00
Daniel Richtmann
649b8c9aca Save current state 2017-11-24 10:46:20 +01:00
Daniel Richtmann
0afa22747d Merge remote-tracking branch 'upstream/develop' into feature/new-solver-algorithms 2017-11-24 10:11:42 +01:00
a3fe874a5b Hadrons: everything is broken, repairing while implementing the new memory model 2017-11-22 23:27:19 +00:00
f403ab0133 gitignore update 2017-11-22 17:13:09 +00:00
paboyle
94b8fb5686 Debug in progress 2017-11-19 01:39:04 +00:00
Guido Cossu
1f1d77b01a Performance metrics for the Scalar Action force term 2017-11-14 10:01:48 +00:00
pretidav
6a15e2e8ef Added WilsonTwoIndexAntiSymmImpl instantiation in WilsonKernelsHand.cc (shoud not be necessary) 2017-11-12 14:16:19 +01:00
074d17429f Merge branch 'develop' into feature/scalar_adjointFT
# Conflicts:
#	lib/communicator/Communicator_mpi3.cc
2017-11-11 18:09:55 +00:00
Daniel Richtmann
fa43206c79 Remove some empty lines 2017-11-10 13:48:38 +01:00
Peter Boyle
25f73018f4 Merge pull request #135 from fionnoh/develop
Declaring virtual functions as pure virtual functions.
2017-11-09 23:19:08 +00:00
fionnoh
1d7ccc6b2c Declaring virtual functions as pure virtual functions. 2017-11-09 19:46:57 +00:00
Daniel Richtmann
a367835bf2 Set everything up for the implementation of FCAGMRES
The current implementation is the exact same code as normal FGMRES. This commit
only sets up the "framework" for the implementation of FCAGMRES, i.e., a test
and an include in the algorithms header file.
2017-11-09 17:30:41 +01:00
Daniel Richtmann
d7743591ea Fix some minor formatting errors 2017-11-09 17:28:19 +01:00
Daniel Richtmann
c6cbe533ea Set everything up for the implementation of CAGMRES
The current implementation is the exact same code as normal GMRES. This commit
only sets up the "framework" for the implementation of CAGMRES, i.e., a test and
an include in the algorithms header file.
2017-11-09 17:14:44 +01:00
Daniel Richtmann
8402ab6cf9 Some minor formatting improvements 2017-11-09 12:52:04 +01:00
Daniel Richtmann
c63095345e Remove some superfluous comments 2017-11-09 12:47:20 +01:00
pretidav
59d9ccf70c restored WilsonKernelsHand.cc and added Qtop to production codes 2017-11-08 22:02:32 +01:00
Daniel Richtmann
a7ae46b61e Remove some comments 2017-11-08 16:58:20 +01:00
Daniel Richtmann
cd63052205 Remove everything preconditioner-related in GMRES code 2017-11-08 16:57:40 +01:00
Daniel Richtmann
699d537cd6 Add FGMRES test with staggered fermions 2017-11-08 16:56:42 +01:00
Daniel Richtmann
9031f0ed95 Fix a filename in a file header 2017-11-08 16:42:26 +01:00
Daniel Richtmann
26b3d441bb Check in forgotten FGMRES test with wilson Fermions 2017-11-08 16:39:11 +01:00
Daniel Richtmann
99bc4cde56 Fix an implementation error in FGMRES 2017-11-08 16:38:34 +01:00
Daniel Richtmann
e843d83d9d Make z in FGMRES a single Field 2017-11-08 16:38:16 +01:00
Daniel Richtmann
0f75ea52b7 First version of FGMRES; not working yet 2017-11-08 16:17:18 +01:00
Daniel Richtmann
8107b785cc Rename misunderstood "rsd_sq" to "rsq" in GMRES code 2017-11-08 14:40:03 +01:00
Daniel Richtmann
37b777d801 Add test for GMRES solver with staggered fermions 2017-11-08 14:28:48 +01:00
Daniel Richtmann
7382787856 Some minor changes 2017-11-08 14:23:55 +01:00
Daniel Richtmann
781c611ca0 Perform minor code style fix 2017-11-08 14:22:38 +01:00
Daniel Richtmann
b069090b52 Remove a superfluous comment 2017-11-08 13:58:02 +01:00
Daniel Richtmann
0c1c1d9900 Set precision and formatting only once in MR code 2017-11-08 13:57:06 +01:00
Daniel Richtmann
7f4ed6c2e5 First working version of GMRES + a test for Wilson fermions 2017-11-08 13:56:41 +01:00
Daniel Richtmann
56d32a4afb Rename misunderstood "rsd_sq" to "rsq" in MR code 2017-11-08 13:51:08 +01:00
Daniel Richtmann
b8ee496ed6 Print some info at start of GMRES 2017-11-08 13:23:41 +01:00
Azusa Yamaguchi
1860b1698c Fixed the bag on MPI_T at Cam 2017-11-08 09:03:01 +00:00
Azusa Yamaguchi
9b8d1cc3da Staggered Schur decomposed matrix norm changed to not be the Schur anymore :(
Carleton wanted this for multimass / multishift
2017-11-07 14:48:45 +00:00
James Harrison
0c668bf46a QedFVol: Write to output files from one process only. 2017-11-07 14:46:39 +00:00
Guido Cossu
149c3f9e9c Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-11-07 14:01:13 +00:00
Daniel Richtmann
b87416dac4 Fix error with conformable 2017-11-07 15:00:08 +01:00
Daniel Richtmann
176bf37372 Remove some commented stuff 2017-11-07 14:57:36 +01:00
Guido Cossu
c519aab19d Fixing the MPI memory leak in the communicators 2017-11-07 13:55:37 +00:00
Daniel Richtmann
b3d342ca22 Remove old implementation of GMRES operator 2017-11-07 10:24:49 +01:00
Daniel Richtmann
e1f928398d Save current state 2017-11-07 10:22:41 +01:00
paboyle
69929f20bb Destructor fix. Split Grid and MPI3 will not yet work without more effort from me. 2017-11-06 23:45:00 +00:00
Daniel Richtmann
8c579d2d4a Save current state 2017-11-06 18:09:48 +01:00
James Harrison
840814c776 QedFVol: Patch to fix MPI communicators error 2017-11-06 16:34:55 +00:00
Daniel Richtmann
fc7d07ade0 Correct function signature of body of GMRES outer loop 2017-11-06 17:12:38 +01:00
Daniel Richtmann
b3be9195b4 Save one lattice fermion in GMRES code 2017-11-06 17:12:23 +01:00
Daniel Richtmann
9e3c187a4d Save current state 2017-11-06 17:05:25 +01:00
Daniel Richtmann
8363edfcdb Perform some minor changes to GMRES code 2017-11-06 16:17:44 +01:00
Daniel Richtmann
74af31564f Adapt style of wilson GMRES test to style of wilson MR test 2017-11-06 14:06:45 +01:00
Daniel Richtmann
e0819d395f Merge remote-tracking branch 'upstream/develop' into feature/new-solver-algorithms 2017-11-06 13:09:36 +01:00
pretidav
a493429218 added Production tests for MixedRep, Adj, 2S, 2AS. Still missing QObs. The HMC is not printing correctly all the actions and forces. 2017-11-04 18:16:54 +01:00
pretidav
915f610da0 clover 2indexSymm hmc production test created. clover 2indexAsymm and clover mixed to be filled. 2017-11-04 01:17:06 +01:00
pretidav
c79606a5dc Test production code wilson clover. Still missing QObs measurement on-the-fly. 2017-11-03 22:46:32 +01:00
James Harrison
95af55128e QedFVol: Redo optimisation of scalar VP (extra memory requirements were not the problem), and undo optimisation of charged propagator (which seemed to be causing HDF5 errors, although I don’t know why). 2017-11-03 18:46:16 +00:00
James Harrison
9f2a57e334 QedFVol: Undo optimisation of scalar VP, to reduce memory requirements 2017-11-03 13:10:11 +00:00
James Harrison
c645d33db5 QedFVol: Redo optimisation of charged propagator, and fix I/O bug 2017-11-03 10:59:26 +00:00
James Harrison
e0f1349524 QedFVol: Undo optimisation of charged propagator 2017-11-03 09:22:41 +00:00
paboyle
360efd0088 Improved treatment of reverse asked for by chris.
Truncate the basis.
Power method renormalises
2017-11-02 22:05:31 +00:00
pretidav
7b42ac9982 added polyakov loop observable to the hmc 2017-11-02 21:58:16 +01:00
paboyle
c5c647e35e Merge branch 'feature/lanczos-reorg' into develop 2017-11-02 15:23:11 +00:00
a4e5fd1000 Merge branch 'feature/hadrons' into feature/hadrons-new-memory-model 2017-11-01 19:24:51 +00:00
682e7d7839 Merge branch 'develop' into feature/hadrons 2017-11-01 19:24:38 +00:00
Guido Cossu
8e057721a9 Anisotropic Clover term written and tested 2017-11-01 12:50:54 +00:00
Guido Cossu
fa5e4add47 Added support for anisotropy to the WilsonFermion class 2017-10-31 18:20:38 +00:00
Daniel Richtmann
6f81906b00 Add test for the MR solver with staggered fermions; does not converge atm
TODO: Is this a property of staggered or did I do something wrong?
2017-10-30 16:57:55 +01:00
James Harrison
79b761f923 Merge branch 'develop' into feature/qed-fvol
# Conflicts:
#	lib/communicator/Communicator_base.cc
2017-10-30 15:53:18 +00:00
James Harrison
0d4e31ca58 QedFVol: Calculate phase factors for momentum projections once per configuration only. 2017-10-30 15:46:50 +00:00
Daniel Richtmann
a2d83d4f3d Add test for the MR solver with DW fermions; does not converge atm
TODO: Is this a property of DWF or did I do something wrong?
2017-10-30 16:39:30 +01:00
Daniel Richtmann
89bacb0470 Fix path in MR solver header commentary 2017-10-30 16:16:55 +01:00
James Harrison
b07a354a33 QedFVol: output scalar propagator before FFT in spatial directions. 2017-10-30 14:20:44 +00:00
Daniel Richtmann
19010ff66a Merge remote-tracking branch 'upstream/develop' into feature/new-solver-algorithms 2017-10-30 13:16:46 +01:00
paboyle
27ea2afe86 No compile on comms == none fix 2017-10-30 01:14:11 +00:00
paboyle
78e8704eac Shaking out 2017-10-30 00:25:31 +00:00
paboyle
67131d82f2 Get subrank info from communicator constructor 2017-10-30 00:24:11 +00:00
paboyle
615a9448b9 Extended sub comm supported 2017-10-30 00:23:34 +00:00
paboyle
00164f5ce5 : 2017-10-30 00:22:52 +00:00
paboyle
a7f72eb994 SHaking out 2017-10-30 00:22:06 +00:00
paboyle
501fa1614a Communicator updates for split grid 2017-10-30 00:16:12 +00:00
paboyle
5bf42e1e15 Update 2017-10-30 00:05:21 +00:00
paboyle
fe4d9b003c More digits 2017-10-30 00:04:47 +00:00
paboyle
4a699b4da3 New rank can be found out 2017-10-30 00:04:14 +00:00
paboyle
689323f4ee Reverse dim ordering lexico support 2017-10-30 00:03:15 +00:00
Guido Cossu
749189fd72 Full clover force correct 2017-10-29 12:03:08 +00:00
Guido Cossu
f941c4ee18 Clover term force ok 2017-10-29 11:43:33 +00:00
paboyle
84b441800f Merge branch 'develop' into feature/lanczos-reorg 2017-10-27 14:21:38 +01:00
paboyle
1ef424b139 Split grid Y2K bug fix attempt 2017-10-27 14:20:35 +01:00
Daniel Richtmann
5a477ed29e Perform minor style correction 2017-10-27 14:46:18 +02:00
Daniel Richtmann
54128d579a Make MR a bit more verbose 2017-10-27 14:45:29 +02:00
Daniel Richtmann
e7b1933e88 Add a test for the MR solver 2017-10-27 14:38:57 +02:00
Daniel Richtmann
1bad64ac6a Some formatting 2017-10-27 14:35:04 +02:00
Daniel Richtmann
15dfa9f663 Change stopping criterion implementation in MR solver + some cleanup 2017-10-27 14:33:25 +02:00
Daniel Richtmann
2185b0d651 Correct author in the file 2017-10-27 14:32:38 +02:00
Daniel Richtmann
f61c0b5d03 Very early version of MR solver 2017-10-27 14:09:02 +02:00
Daniel Richtmann
074db32e54 Fix build of gmres test 2017-10-27 14:08:48 +02:00
paboyle
aa66f41c69 Bug fix in the coarse restore...
Think this is nearly there
2017-10-27 10:29:34 +01:00
paboyle
f96c800d25 Passes reload of coarse basis 2017-10-27 09:43:22 +01:00
paboyle
32a52d7583 Move the local coherence lanczos into algorithms.
Keep the I/O in the tester. Other people can copy this method to write other I/O formats.
2017-10-27 09:04:31 +01:00
paboyle
fa04b6d3c2 Finished ? Verifying coarse evec restore 2017-10-27 08:18:29 +01:00
paboyle
7fab183c0e Better read test 2017-10-27 08:17:49 +01:00
paboyle
9ec9850bdb 64bit ftello update 2017-10-26 23:34:31 +01:00
paboyle
0c4ddaea0b Cleaning up 2017-10-26 23:31:46 +01:00
paboyle
00ebc150ad Mistake in string parse; interface is ambiguous and must fix. Is char * a file, or a XML buffer ? 2017-10-26 23:30:37 +01:00
paboyle
0f3e9ae57d Gsites error. Only appeared (so far) in I/O code for even odd fields 2017-10-26 23:29:59 +01:00
Azusa Yamaguchi
034de160bf Staggered updates : Schur fixed and added a unit test for Test_staggered_cg_schur.cc giving stronger check 2017-10-26 20:58:46 +01:00
Guido Cossu
76bcf6cd8c Deleting vscode settings file 2017-10-26 18:45:41 +01:00
Guido Cossu
91b8bf0613 Debugging force term 2017-10-26 18:23:55 +01:00
paboyle
14507fd6e4 Final? candidate for push back on the lanczos reorg feature 2017-10-26 16:25:01 +01:00
paboyle
2db05ac214 Test for split/unsplit in isolation 2017-10-26 07:48:03 +01:00
paboyle
31f99574fa Moving these out of algorithms 2017-10-26 07:47:42 +01:00
paboyle
a34c8a2961 Update to IRL; getting close to the structure I would like. 2017-10-26 07:45:56 +01:00
paboyle
ccd20df827 Better IRL interface 2017-10-26 01:59:59 +01:00
paboyle
e9be293444 Better messaging 2017-10-26 01:59:30 +01:00
paboyle
d577211cc3 Relax stoppign condition 2017-10-25 23:57:54 +01:00
paboyle
f4336e480a Faster converge time 2017-10-25 23:53:44 +01:00
paboyle
e4d461cb03 Messagign 2017-10-25 23:53:19 +01:00
paboyle
3d63b4894e Use existing functionality where possible 2017-10-25 23:52:47 +01:00
paboyle
08583afaff Red black friendly coarsening 2017-10-25 23:51:18 +01:00
paboyle
b395a312af Better error messaging 2017-10-25 23:50:37 +01:00
paboyle
66295b99aa Bit less verbose SciDAC IO 2017-10-25 23:50:05 +01:00
paboyle
b8654be0ef 64 bit safe offsets 2017-10-25 23:49:23 +01:00
paboyle
a479325349 Rewrite of local coherence lanczos 2017-10-25 23:48:47 +01:00
paboyle
f6c3f6bf2d XML serialisation of parms and initialise from parms object 2017-10-25 23:47:59 +01:00
paboyle
d83868fdbb Identity linear op added -- useful in circumstances where a linear op may or may not be needed.
Supply a trivial one if not needed
2017-10-25 23:47:10 +01:00
paboyle
303e0b927d Improvements for coarse grid compressed lanczos 2017-10-25 23:46:33 +01:00
paboyle
28ba8a0f48 Force spacing more nicely 2017-10-25 23:45:57 +01:00
Azusa Yamaguchi
f9e28577f3 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-25 21:07:56 +01:00
Guido Cossu
e0cae833da Merge branch 'develop' into feature/scalar_adjointFT 2017-10-25 10:49:50 +01:00
Guido Cossu
8a3aae98f6 Solving minor bug in compilation 2017-10-25 10:34:49 +01:00
Guido Cossu
8309f2364b Solving again the MPI comm bug with FFTs 2017-10-25 10:24:14 +01:00
Daniel Richtmann
d5f661ba70 Save intermediate state 2017-10-25 10:38:26 +02:00
Azusa Yamaguchi
cac1750078 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-24 23:30:36 +01:00
Guido Cossu
e17cd35151 Merge branch 'develop' into feature/scalar_adjointFT 2017-10-24 17:31:22 +01:00
Guido Cossu
ccdec7a7ab Merge branch 'develop' into feature/clover 2017-10-24 16:51:14 +01:00
Guido Cossu
93642d813d Merging 2017-10-24 16:48:05 +01:00
Daniel Richtmann
1ab8d5cc13 Save two more files 2017-10-24 16:58:05 +02:00
Daniel Richtmann
789e892865 Save current state 2017-10-24 16:58:04 +02:00
Daniel Richtmann
53cfa44d7a Save current state 2017-10-24 16:58:03 +02:00
Guido Cossu
0bc381f982 Merge pull request #133 from pretidav/feature/clover
Feature/clover
2017-10-24 15:15:21 +01:00
Guido Cossu
2986aa76f8 Restoring Perfcounts 2017-10-24 13:32:02 +01:00
Guido Cossu
657779374b Adding vscode to gitignore 2017-10-24 13:27:17 +01:00
Guido Cossu
ec8cd11c1f Cleanup and prepare for pull request 2017-10-24 13:21:17 +01:00
Guido Cossu
cbda4f66e0 Debug of the field strength 2017-10-24 10:20:13 +01:00
Guido Cossu
6579dd30ff More debug test 2017-10-23 18:47:00 +01:00
Guido Cossu
031c94e02e Debugging process for the clover term 2017-10-23 18:27:34 +01:00
Guido Cossu
6391b2a1d0 Added test for Wilson and Clover fermions 2017-10-23 14:42:35 +01:00
Guido Cossu
2e50b55ae4 Changes in the Makefile to compile against Chroma on Linux 2017-10-23 13:32:26 +01:00
James Harrison
c433939795 QedFVol: Temporarily remove incomplete implementation of infinite-volume photon 2017-10-20 16:27:58 +01:00
James Harrison
b6a4c31b48 Merge branch 'feature/qed-fvol' of https://github.com/jch1g10/Grid into feature/qed-fvol 2017-10-20 16:25:07 +01:00
James Harrison
98b1439ff9 QedFVol: pass arbitrary input values to photon constructor in UnitEm 2017-10-20 16:24:09 +01:00
Guido Cossu
27936900e6 Putting the FG verbosity in the Integrator level 2017-10-18 13:08:09 +01:00
James Harrison
564738b1ff Add module for unit EM field 2017-10-17 14:03:57 +01:00
Guido Cossu
cd3e810d25 Merge branch 'develop' into feature/scalar_adjointFT 2017-10-17 11:31:14 +01:00
pretidav
317ddfedee updated test clover + first attempt derivative clove term (still missing spin part) 2017-10-16 02:47:33 +02:00
paboyle
e325929851 ALl codes compile against the new Lanczos call signature 2017-10-13 14:02:43 +01:00
paboyle
47af3565f4 Logging improvement; reunified the Lanczos codes 2017-10-13 13:23:07 +01:00
paboyle
4b4d187935 Reunified the Lanczos implementations 2017-10-13 13:22:44 +01:00
paboyle
9aff354ab5 Final version prior to reunification 2017-10-13 13:22:26 +01:00
paboyle
cb9ff20249 Approx tests and lanczos improvement 2017-10-13 11:30:50 +01:00
James Harrison
a80e43dbcf Added infinite-volume photon in Photon.h (not checked yet) 2017-10-11 16:44:51 -04:00
paboyle
9fe6ac71ea Starting reorg of Blocked lanczos 2017-10-11 10:12:07 +01:00
5c392a6ecc Merge commit 'bf58557fb1ec710c766e19c9a8809b0a352de239' into feature/scalar_adjointFT 2017-10-10 17:14:56 +01:00
Azusa Yamaguchi
f1fa00b71b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-10 14:26:44 +01:00
paboyle
bf58557fb1 Block compressed Lanczos 2017-10-10 14:15:11 +01:00
paboyle
10cb37f504 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-10 14:09:44 +01:00
Azusa Yamaguchi
1374c943d4 Correct Schur operator called 2017-10-10 13:59:50 +01:00
paboyle
a1d80282ec cb factorise 2017-10-10 13:49:31 +01:00
paboyle
4eb8bbbebe Christop mods 2017-10-10 13:48:51 +01:00
paboyle
d1c6288c5f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-10-10 13:38:40 +01:00
Azusa Yamaguchi
dd949bc428 Merge branch 'feature/staggering' into develop 2017-10-10 13:02:51 +01:00
Azusa Yamaguchi
bb7378cfc3 Schur for staggered 2017-10-10 12:02:18 +01:00
Azusa Yamaguchi
f0e084a88c Schur staggered 2017-10-10 10:00:43 +01:00
paboyle
153672d8ec Split CG testing 2017-10-09 23:20:58 +01:00
paboyle
08ca338875 Split grid communication 2017-10-09 23:19:45 +01:00
paboyle
f7cbf82c04 Better stdout/err debug 2017-10-09 23:18:48 +01:00
paboyle
07009c569a Comms splitting improvements 2017-10-09 23:16:51 +01:00
Guido Cossu
15d690e9b9 Adding the cartesian communicator destructor 2017-10-09 09:59:58 +01:00
63b2bc1936 Merge branch 'develop' into feature/hadrons
# Conflicts:
#	lib/qcd/action/fermion/FermionOperatorImpl.h
2017-10-05 14:16:23 +01:00
David Preti
d810e8c8fb first attempt to write C terms in clover derivative. Some shifts to be fixed 2017-10-05 10:13:53 +02:00
Azusa Yamaguchi
09f4cdb11e Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2017-10-04 10:51:16 +01:00
Azusa Yamaguchi
1e54882f71 Stagger 2017-10-04 10:51:06 +01:00
Guido Cossu
27caff92c6 Merge branch 'feature/scalar_adjointFT' of https://github.com/paboyle/Grid into feature/scalar_adjointFT 2017-10-04 09:44:27 +01:00
d38cee73bf Scalar: easier Fourier acceleration parametrisation through -D flags 2017-10-03 17:29:34 +01:00
8784f2a88d post-merge fix 2017-10-03 14:38:10 +01:00
c497864b5d Merge commit 'd54807b8c0cd1a7658ff8563bb00d1137b987e3e' into feature/scalar_adjointFT
# Conflicts:
#	lib/communicator/Communicator_base.h
#	lib/communicator/Communicator_mpi.cc
#	lib/communicator/Communicator_mpit.cc
2017-10-03 14:27:54 +01:00
05c1c88440 Scalar: more action generalisation 2017-10-03 14:26:20 +01:00
paboyle
d54807b8c0 MPIT works with split grid now 2017-10-02 23:14:56 +01:00
Guido Cossu
f6ba2b95ce Merge branch 'develop' into feature/scalar_adjointFT 2017-10-02 15:19:20 +01:00
paboyle
5625b47c7d Merge branch 'feature/dwf-multirhs' into develop 2017-10-02 12:42:32 +01:00
paboyle
1edcf902b7 Macos ANON 2017-10-02 12:41:02 +01:00
paboyle
e5c19e1fd7 RB constructor change 2017-10-02 12:25:52 +01:00
paboyle
a11d0a33d1 Merge branch 'feature/dwf-multirhs' of https://github.com/paboyle/Grid into feature/dwf-multirhs 2017-10-02 11:42:07 +01:00
paboyle
4f8b6f26b4 Merge branch 'develop' into feature/dwf-multirhs 2017-10-02 11:41:49 +01:00
paboyle
073525c5b3 Small patch from cori 2017-10-02 03:38:21 -07:00
Azusa Yamaguchi
eb6153080a Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2017-10-02 08:56:33 +01:00
Guido Cossu
f7072d1ac2 Solving an annoying compilation error in json 2017-10-02 07:13:40 +01:00
a021933002 Scalar: SU(N) action change to t'Hooft scaling 2017-09-29 16:09:34 +01:00
James Harrison
b99622d9fb QedFVol: fix problem with JSON wanting gcc 4.9 2017-09-28 13:34:33 -04:00
937c77ead2 Merge branch 'develop' into feature/qed-fvol 2017-09-28 16:25:20 +01:00
95e5a2ade3 Merge pull request #116 from jch1g10/feature/qed-fvol
Feature/qed fvol
2017-09-25 15:08:33 +01:00
David Preti
56478d63a5 clover + test (valence) 2017-09-24 19:32:15 +02:00
df21668f2c memory profiler update 2017-09-22 14:21:18 +01:00
Guido Cossu
482368e9de Merge branch 'develop' into feature/scalar_adjointFT 2017-09-21 13:44:08 +01:00
paboyle
fddeb29d6b Bug fix with spreadout FFT 2017-09-21 11:10:08 +01:00
paboyle
a9ec5cf564 Christoph bug report integrate 2017-09-21 10:32:41 +01:00
Peter Boyle
946a8671b9 Merge pull request #129 from djm2131/feature/eofa
Add support for DWF with the exact one flavor algorithm
2017-09-21 10:15:21 +01:00
Azusa Yamaguchi
a6eeea777b Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-09-21 10:12:41 +01:00
Peter Boyle
771a1b8e79 Merge pull request #128 from paboyle/feature/CG-reliable-update
Feature/cg reliable update
2017-09-21 10:12:03 +01:00
Peter Boyle
bfb68e6f02 Merge pull request #130 from giltirn/gparity-handunroll
Gparity handunroll
2017-09-21 10:11:00 +01:00
Azusa Yamaguchi
77f7737ccc Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-09-19 14:28:01 +01:00
Guido Cossu
9a827d0242 Fixing a compilation error 2017-09-18 14:55:51 +01:00
Guido Cossu
999c623590 Solving a memory leak in Communicator_mpi 2017-09-18 14:39:04 +01:00
paboyle
18c335198a Merge branch 'hotfix/dirac-ITT-fix1' into develop 2017-09-16 18:19:02 +01:00
paboyle
f9df685cde Merge branch 'hotfix/dirac-ITT-fix1' 2017-09-16 18:18:48 +01:00
paboyle
17c5b0f152 Patching comparison point 2017-09-16 18:18:07 +01:00
paboyle
5918769f97 Subtle Naik term bug updated in Stencil; less on logical && with a function call on right 2017-09-16 12:51:26 +01:00
Guido Cossu
b542d349b8 Minor cosmetic changes 2017-09-15 11:48:36 +01:00
Guido Cossu
91eaace19d Added support for FFT accelerated updates 2017-09-15 11:33:45 +01:00
Guido Cossu
bbaf1ada91 Merge branch 'feature/json-fix' into develop 2017-09-08 16:02:08 +01:00
Guido Cossu
1950ac9294 Fixed the Intel compiler problem with the JSON classes 2017-09-08 15:18:59 +01:00
Guido Cossu
13fa70ac1a Merge branch 'develop' into feature/json-fix 2017-09-08 13:42:20 +01:00
Guido Cossu
7cb2b11f26 Fixing Intel compiler error for the JSON parser 2017-09-08 13:41:53 +01:00
Guido Cossu
1184ed29ae Merge pull request #124 from nmeyer-ur/feature/arm-neon
Added integer reduce functionality
2017-09-08 10:54:35 +02:00
paboyle
203c7bf6fa Merge branch 'hotfix/dirac-ITT-fix' into develop 2017-09-05 15:08:51 +01:00
paboyle
c709883f3f Merge branch 'hotfix/dirac-ITT-fix' 2017-09-05 15:08:16 +01:00
paboyle
aed5de4d50 Patching macos compile 2017-09-05 15:07:07 +01:00
paboyle
ba27cc6571 Mac os happiness 2017-09-05 15:00:16 +01:00
paboyle
d856327250 Merge branch 'release/dirac-ITT' into develop 2017-09-05 14:56:12 +01:00
paboyle
d75369cb56 Merge branch 'release/dirac-ITT' 2017-09-05 14:55:54 +01:00
Peter Boyle
bf973d0d56 SHM complete 2017-09-05 14:30:29 +01:00
Peter Boyle
837bf8a5be Updating to control the SHM allocation scheme under configure time options 2017-09-05 12:51:02 +01:00
Peter Boyle
c05b2199f6 Improvements to huge memory 2017-09-04 10:41:21 -04:00
Azusa Yamaguchi
a5fe07c077 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-09-04 14:10:15 +01:00
Azusa Yamaguchi
b83b2b1415 Stability improvement to BCG. Force m_rr hermitian beyond rounding. 2017-09-04 14:09:47 +01:00
James Harrison
91676d1dda Fix “MAP_ANONYMOUS undefined” error on OSX. 2017-09-01 15:48:30 +01:00
Peter Boyle
b331be9101 Better reporting 2017-08-31 11:32:57 +01:00
Peter Boyle
49c20a9fa8 Patch to reporting 2017-08-31 11:32:21 +01:00
paboyle
7359df3501 Full reporting for benchmark; save robustness factor 2017-08-31 10:42:35 +01:00
Christopher Kelly
59bd1fe21b Fix for 'perm' and 'local' not being set for hand-unrolled external-site Dslash, which caused incorrect behavior of G-parity kernel 2017-08-29 13:07:37 -07:00
a56e3b40c4 Merge branch 'develop' into feature/hadrons 2017-08-29 11:03:53 -06:00
Nils Meyer
4e907fef2c Merge remote-tracking branch 'grid/develop' into feature/arm-neon 2017-08-29 17:47:36 +02:00
Christopher Kelly
67888b657f Merge branch 'gparity-handunroll' of https://github.com/giltirn/Grid into gparity-handunroll 2017-08-29 09:52:05 -04:00
Christopher Kelly
74af885d4e Removed some no-longer-needed associated with G-parity hand unrolled kernel 2017-08-29 09:50:37 -04:00
James Harrison
ac3611bb19 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/qed-fvol 2017-08-29 11:53:37 +01:00
Christopher Kelly
d36d2fb40d Added ability to override default Ls in Benchmark_dwf 2017-08-28 06:53:56 -07:00
Peter Boyle
5b9267e88d Cleaner comms benchmark treatment for one node runs 2017-08-27 18:24:48 -04:00
paboyle
15fd4003ef Improving presentation of results 2017-08-27 13:46:02 +01:00
paboyle
4b4c2a715b fcntl.h needed 2017-08-26 11:38:04 +01:00
paboyle
54a5e6c1d0 Check if we get huge pages on linux. Larry Meadows piece of magic. 2017-08-25 22:36:08 +01:00
paboyle
73aeca7dea Merge branch 'feature/multi-communicator' into develop 2017-08-25 21:55:09 +01:00
paboyle
ad89abb018 Fix 2017-08-25 20:43:37 +01:00
paboyle
80c5bce5bb Merge branch 'develop' into feature/multi-communicator 2017-08-25 20:21:26 +01:00
paboyle
f68b5de9c8 No compile fix on Clang 2017-08-25 19:35:21 +01:00
Peter Boyle
d0f3d525d5 Optimal block size for KNL 2017-08-25 19:33:54 +01:00
Christopher Kelly
f365a83fae In G-parity unrolled kernel, replaced calls to permute and exchange with run-time-evaluated permute type with explicit calls to appropriate underlying functions 2017-08-25 14:24:11 -04:00
Peter Boyle
3a58217405 Updated 2017-08-25 14:29:53 +01:00
Peter Boyle
c289699d9a updated from cambridge mpi3 shakeout 2017-08-25 11:41:01 +01:00
Peter Boyle
c3b1263e75 Benchmark prep 2017-08-25 09:25:54 +01:00
Christopher Kelly
34a9aeb331 Reduced number of if-statement evaluations in G-parity unrolled kernel 2017-08-24 13:53:50 -07:00
5846566728 Merge branch 'develop' into feature/hadrons 2017-08-24 18:20:52 +01:00
102ea9ae66 CI update 2017-08-24 18:17:09 +01:00
James Harrison
cc4afb978d Fix bug in non-zero momentum projection 2017-08-24 17:31:44 +01:00
21b02760c3 Merge branch 'develop' into feature/hadrons 2017-08-24 17:05:45 +01:00
Peter Boyle
2bcb704af2 Merge pull request #121 from Lanny91/feature/hadrons
Feature/hadrons
2017-08-24 12:59:08 +01:00
paboyle
5fa386ddc9 FFT test compile fixed 2017-08-24 10:17:52 +01:00
Christopher Kelly
edabb3577f Imported Benchmark_gparity 2017-08-23 16:54:06 -04:00
Christopher Kelly
ce5df177ee Removed superfluous implementation of G-parity twist for hand-unrolled kernel from GparityWilsonImpl 2017-08-23 15:05:22 -04:00
Christopher Kelly
a0bb8e5b46 Added hand-unrolled kernel implementations of all the other dslash precision / comms precision combinations with G-parity 2017-08-23 14:44:40 -04:00
Christopher Kelly
46f88e6d72 G-parity hand-unrolled intrinsics twist now uses one less permute and one less temporary 2017-08-23 13:21:10 -04:00
David Murphy
dd8f1ea189 Vectorized Mobius EOFA Dperp + shift operation 2017-08-23 13:17:26 -04:00
Christopher Kelly
b61835c1a5 Added inplace version of intrinsic G-parity twist to hand-unrolled kernel 2017-08-23 12:33:48 -04:00
Azusa Yamaguchi
d9cd4f0273 Staggered multinode block cg debugged. Missing global sum.
Code stalls and resumes on KNL at cambridge. Curious.

CG iterations 23ms each, then 3200 ms pauses. Mean bandwidth reports
as 200MB/s. Comms dominant in the report. However, the time behaviour suggests it
is *bursty*.... Could be swap to disk?
2017-08-23 15:07:18 +01:00
David Murphy
459f70e8d4 Check-in of working Mobius EOFA class and tests 2017-08-22 22:38:30 -04:00
Christopher Kelly
061e48fd73 Replaced slow unpack-repack in G-parity BC twist with intrinsics version 2017-08-22 18:12:12 -04:00
Christopher Kelly
ab50145001 Implemented first, unoptimized version of hand-unrolled G-parity kernels
Improved Test_gparity
2017-08-22 17:12:25 -04:00
paboyle
b49bec0cec MAP_HUGETLB portability fix 2017-08-20 03:08:54 +01:00
paboyle
ae56e556c6 finalise issue on new OPA revert 2017-08-20 02:53:12 +01:00
paboyle
1cdf999668 Moving multicommunicator into mpi3 also for threading 2017-08-20 02:39:10 +01:00
paboyle
11062fb686 Comms none fail fix 2017-08-20 01:37:07 +01:00
paboyle
383ca7d392 Switch off comms for now until feature/multi-communicator is merged 2017-08-20 01:27:48 +01:00
paboyle
a446d95c33 Trying to pass TeamCity and Travis 2017-08-20 01:10:50 +01:00
paboyle
be66e7dd95 Merge branch 'develop' into feature/multi-communicator 2017-08-19 23:12:38 +01:00
paboyle
6d0d064a6c Update TODO 2017-08-19 23:11:30 +01:00
paboyle
bfef525ed2 New benchmark prep 2017-08-19 23:10:12 +01:00
Peter Boyle
0b0cf62193 Fix mpi 3 interface change 2017-08-19 13:18:50 -04:00
Peter Boyle
7d88198387 Merge branch 'develop' into feature/multi-communicator 2017-08-19 13:03:35 -04:00
Peter Boyle
2f619482b8 Enable blocking stencil send 2017-08-19 12:53:59 -04:00
Peter Boyle
d6472eda8d Use mmap 2017-08-19 12:53:18 -04:00
Peter Boyle
9e658de238 Use Vector 2017-08-19 12:52:44 -04:00
Peter Boyle
bcefdd7c4e Align both allocator calls to 2MB 2017-08-19 12:49:02 -04:00
David Murphy
9d45fca8bc Implement MobiusEOFAFermioncache.cc 2017-08-17 23:45:36 -04:00
David Murphy
ac9e6b63c0 More re-import of Mobius EOFA 2017-08-17 19:28:53 -04:00
David Murphy
e140b3f802 Beginning to re-import Mobius EOFA 2017-08-16 23:36:23 -04:00
David Murphy
d9d3d30cc7 Minor clean-up 2017-08-16 20:57:51 -04:00
David Murphy
47a12ec7b5 Implement EOFA pseudofermion force and Shamir tests for G-parity and non G-parity cases 2017-08-16 19:50:08 -04:00
David Murphy
ec1e2f7a40 Add (mostly implemented) ExactOneFlavourRatio pseudofermion class and tests of Shamir heatbath and action 2017-08-16 12:38:59 -04:00
David Murphy
41f73ec083 Add ChronoForecast class for forecasting solutions across poles in the EOFA heatbath 2017-08-16 12:37:38 -04:00
Guido Cossu
fd367d8bfd Debugging the PointerCache 2017-08-16 09:42:57 +01:00
David Murphy
6d0786ff9d Typo fixes and check-in of G-parity action test for DWF 2017-08-15 22:47:00 -04:00
David Murphy
b7f93aeb4d Change CayleyFermion5D::SetCoefficientsInternal to virtual to allow overriding in derived EOFA classes 2017-08-15 14:18:51 -04:00
David Murphy
202a7fe900 Re-import DWF and abstract base EOFA fermion classes and tests 2017-08-15 13:36:08 -04:00
Guido Cossu
8d168ded4a Correction of the dagger version of the Clover 2017-08-15 10:50:44 +01:00
Guido Cossu
8a3fe60a27 Added more asserts at grid creation time 2017-08-08 11:36:20 +01:00
Guido Cossu
44051aecd1 Checking for integer divisions in cartesian full 2017-08-08 10:31:12 +01:00
Guido Cossu
06e6f8de00 Check that the reduced dim is an integer 2017-08-08 10:22:12 +01:00
Guido Cossu
dbe4d7850c Make a test file compatible with all architectures 2017-08-06 10:49:45 +01:00
Guido Cossu
4fe182e5a7 Added high level HMC support for overriding default SIMD lane decomposition 2017-08-06 10:46:19 +01:00
Guido Cossu
75ee6cfc86 Debugging the Clover term 2017-08-04 16:08:07 +01:00
Guido Cossu
fde71c3c52 Merge branch 'develop' into feature/clover 2017-08-04 12:19:57 +01:00
Guido Cossu
175f393f9d Binary IO error checking 2017-08-04 12:14:10 +01:00
Christopher Kelly
7d867a8134 Merge branch 'develop' into feature/CG-reliable-update 2017-08-02 09:48:04 -04:00
Christopher Kelly
9939b267d2 Added switching to fallback linear operator in reliable update CG, and added recalculation of b parameter on update. 2017-07-31 13:39:44 -04:00
Lanny91
323e9c439a Hadrons: Legal banner fixes 2017-07-31 12:26:34 +01:00
Lanny91
28396f1048 Merge branch 'feature/rare_kaon' of https://github.com/Lanny91/Grid into feature/hadrons 2017-07-31 12:19:54 +01:00
Lanny91
67b34e5789 Modified conserved current 5th dimension loop for compatibility with 5D vectorisation. 2017-07-31 11:35:01 +01:00
Peter Boyle
14d53e1c9e Threaded MPI calls patches 2017-07-29 13:08:10 -04:00
Guido Cossu
8bd869da37 Correcting a bug in the IO routines 2017-07-27 15:12:50 +01:00
Guido Cossu
c7036f6717 Adding checks for libm and libstdc++ 2017-07-27 11:15:09 +01:00
Guido Cossu
c0485d799d Explicit parameter declaration in the WilsonGauge test 2017-07-26 16:26:04 +01:00
Guido Cossu
7abc5613bd Added smearing to the topological charge observable 2017-07-26 16:21:17 +01:00
Guido Cossu
237cfd11ab Solving the spurious O2 flags 2017-07-26 12:08:51 +01:00
Guido Cossu
a4b7dddb67 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-07-26 12:07:38 +01:00
Guido Cossu
5696781862 Debug error in Tensor mult 2017-07-26 12:07:34 +01:00
Christopher Kelly
8f4b3049cd Merge branch 'feature/CG-reliable-update' into ckelly_develop 2017-07-25 11:55:26 -04:00
Christopher Kelly
2a6e673a91 Merge branch 'develop' into feature/CG-reliable-update 2017-07-25 11:54:43 -04:00
Christopher Kelly
9b6cde173f Merge branch 'feature/CG-reliable-update' into ckelly_develop 2017-07-25 11:51:08 -04:00
Christopher Kelly
9f280b82c4 Added mixed-precision CG with reliable updates 2017-07-25 11:30:41 -04:00
c3f0889eda Merge pull request #123 from giltirn/develop
Fix for 'using namespace' in lib/qcd/utils/GaugeFix.h
2017-07-25 11:32:02 -03:00
Nils Meyer
7a53dc3715 Added integer reduce functionality 2017-07-24 11:12:59 +02:00
Christopher Kelly
0f214ad427 Moved FourierAcceleratedGaugeFixer into Grid::QCD namespace and removed 'using namespace' directives from header 2017-07-21 11:13:51 -04:00
Peter Boyle
fe4912880d Update README.md 2017-07-17 09:53:07 +01:00
Lanny91
875e1a841f Hadrons: updated Quark -> MFermion/GaugeProp module name in test. 2017-07-16 13:47:00 +01:00
Lanny91
0366288b1c Hadrons: added tests for 3pt contractions. 2017-07-16 13:45:55 +01:00
Lanny91
6293d438cd Hadrons: sink smearing compatibility for 3pt contraction modules. 2017-07-16 13:43:25 +01:00
Lanny91
852ade029a Hadrons: Added module to sink a propagator 2017-07-16 13:41:47 +01:00
Peter Boyle
f038c6babe Update README.md 2017-07-14 22:59:16 +01:00
Peter Boyle
169f4b2711 Update README.md 2017-07-14 22:56:06 +01:00
Peter Boyle
2d8aff36fe Update README.md 2017-07-14 22:52:16 +01:00
Guido Cossu
9fa07eecde Merge branch 'develop' into feature/json-fix 2017-07-12 15:47:22 +01:00
azusayamaguchi
659d7d1a40 For test/solver
Fixed
2017-07-12 15:01:48 +01:00
Guido Cossu
f64fb7bd77 Fix gcc error on JSON compilation 2017-07-12 14:55:42 +01:00
Guido Cossu
2a35449b91 Merge branch 'develop' into feature/json-fix 2017-07-12 14:47:00 +01:00
Guido Cossu
184af5bd05 Added support for std::pair in the JSON serialiser 2017-07-12 14:44:53 +01:00
Guido Cossu
097c9637ee Fixed the JSON parsing error 2017-07-11 14:31:57 +01:00
azusayamaguchi
dc6f078246 fixed the header file for mpi3 2017-07-11 14:15:08 +01:00
Peter Boyle
8a4714a4a6 Update README.md 2017-07-09 00:11:54 +01:00
Peter Boyle
40e119c61c NUMA improvements worth preserving from AMD EPYC tests 2017-07-08 22:27:11 -04:00
Guido Cossu
d9593c4b81 Merge branch 'develop' into feature/json-fix 2017-07-07 14:17:50 +01:00
paboyle
ac740f73ce Works on Cori 2017-07-02 16:47:58 -07:00
paboyle
75dc7794b9 Working on Cori 2017-07-02 16:47:42 -07:00
paboyle
dee68fc728 IO working multiple nodes again. Strategy of all nodes writing metadata is unsafe.
Only one rank should do this. must identify this rank. Means pass communicator to the
Objects.
2017-07-02 23:33:48 +01:00
paboyle
a2d3643634 Merge branch 'feature/dwf-multirhs' of https://github.com/paboyle/Grid into feature/dwf-multirhs 2017-07-02 14:59:22 -07:00
paboyle
57002924bc NERSC shakeout of this 2017-07-02 14:58:30 -07:00
Peter Boyle
7b0237b081 Update README.md 2017-07-01 10:24:41 +01:00
Peter Boyle
b68ad0cc0b Update README.md 2017-07-01 10:20:07 +01:00
Peter Boyle
37263fd9b1 Update README.md 2017-07-01 10:06:24 +01:00
Peter Boyle
3d09e3e9e0 Update README.md 2017-07-01 10:05:46 +01:00
Peter Boyle
1354b46338 Update README.md 2017-07-01 10:04:32 +01:00
Peter Boyle
251a97fe1b Update README.md 2017-07-01 09:55:36 +01:00
Peter Boyle
e18929eaa0 Update README.md 2017-07-01 09:53:15 +01:00
Peter Boyle
f3b0a92e71 Update README.md 2017-07-01 09:48:00 +01:00
Peter Boyle
a0be3f7330 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-06-30 10:53:50 +01:00
Peter Boyle
b5a6e4f1fd Best option for Xeon cache blocking set 2017-06-30 10:53:22 +01:00
Peter Boyle
7a788db3dc Guard first touch 2017-06-30 10:49:08 +01:00
Peter Boyle
f20eceb6cd First touch once per page in a threaded loop 2017-06-30 10:48:27 +01:00
Peter Boyle
38325ebbc6 Interleave code path; not enabled 2017-06-30 10:23:51 +01:00
Peter Boyle
b73bd151bb Switch off counters by default 2017-06-30 10:16:35 +01:00
Peter Boyle
694b305cab Update to reporting 2017-06-30 10:16:13 +01:00
Peter Boyle
2d3737a133 O3, KNL 2017-06-30 10:15:59 +01:00
Peter Boyle
ac1f1838bc KNL only 2017-06-30 10:15:32 +01:00
Guido Cossu
09d09d0fe5 Update README.md 2017-06-29 11:48:11 +01:00
Guido Cossu
bf630a6821 README file update 2017-06-29 11:42:25 +01:00
Guido Cossu
8859a151cc Small corrections to the NEON port 2017-06-29 11:30:29 +01:00
Guido Cossu
688a39cfd9 Merge pull request #114 from nmeyer-ur/feature/arm-neon
ARM neon intrinsics support
Guido: checked and approved
2017-06-29 09:57:17 +01:00
paboyle
6f5a5cd9b3 Improved threaded comms benchmark 2017-06-28 23:27:02 +01:00
Nils Meyer
0933aeefd4 corrected Grid_neon.h 2017-06-28 20:22:22 +02:00
Peter Boyle
322f61acee Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-06-28 15:30:35 +01:00
Peter Boyle
08e04b9676 Better benchmarks 2017-06-28 15:30:06 +01:00
feaa2ac947 Merge branch 'feature/scalar-hmc-update' into develop 2017-06-28 12:46:18 +01:00
07de925127 minor scalar action fixes 2017-06-28 12:45:44 +01:00
Nils Meyer
a9c816a268 moved file to correct folder 2017-06-27 21:39:15 +02:00
Nils Meyer
e43a8b6b8a removed comments 2017-06-27 20:58:48 +02:00
Nils Meyer
bf729766dd removed collision with QPX implementation 2017-06-27 20:32:24 +02:00
Guido Cossu
dafb351d38 Merge pull request #120 from paboyle/feature/scalar-hmc-update
Scalar HMC update. 
I agree with the changes.
2017-06-27 16:23:14 +01:00
0b707b861c Merge branch 'develop' into feature/scalar-hmc-update 2017-06-27 14:40:05 +01:00
15e87a4607 HDF5 IO fix 2017-06-27 14:39:27 +01:00
7d7220cbd7 scalar: lambda/4! convention 2017-06-27 14:38:45 +01:00
Lanny91
7d2d5e8d3d Merge branch 'develop' of https://github.com/paboyle/Grid into feature/hadrons 2017-06-26 15:19:46 +01:00
paboyle
54e94360ad Experimental: Multiple communicators to see if we can avoid thread locks in --enable-comms=mpit 2017-06-24 23:10:24 +01:00
0af740dc15 minor scalar HMC code improvement 2017-06-24 23:04:05 +01:00
d2e8372df3 SU(N) algebra fix (was not working) 2017-06-24 23:03:39 +01:00
paboyle
869b99ec1e Threaded calls to multiple communicators 2017-06-24 10:55:54 +01:00
paboyle
4a29ab0d0a Merge branch 'feature/dwf-multirhs' of https://github.com/paboyle/Grid into feature/dwf-multirhs 2017-06-23 23:10:43 +01:00
paboyle
0165bcb58e Added an update to TODO list 2017-06-23 23:10:24 +01:00
Lanny91
deca1ecc50 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/rare_kaon 2017-06-23 19:35:19 +02:00
4372d04ad4 Merge pull request #118 from Lanny91/hotfix/bgq
Hotfix/bgq
2017-06-23 16:59:27 +01:00
paboyle
349d75e483 Precision fix 2017-06-23 02:57:59 -07:00
Lanny91
56abbdf4c2 AVX512 integer reduce fix (for non-intel compiler) 2017-06-23 11:09:14 +02:00
Lanny91
af71c63f4c AVX2 fix 2017-06-23 11:03:12 +02:00
paboyle
e51475703a Ticking off lots on the TODO list 2017-06-23 09:42:21 +01:00
paboyle
1feddf4ba6 const fixes 2017-06-22 19:32:41 +01:00
paboyle
600d7ddc2e Proof of concept : Multi RHS solver, running independent solves on different ranks 2017-06-22 18:54:34 +01:00
paboyle
e504260f3d Able to run a test job splitting into multiple MPI subdomains. 2017-06-22 18:53:11 +01:00
Lanny91
0440d4ce66 Merge branch 'develop' of https://github.com/paboyle/Grid into hotfix/bgq 2017-06-22 17:09:42 +02:00
Lanny91
08b0e472aa Fixed hadrons tests after merge 2017-06-22 16:34:33 +02:00
Lanny91
c11d69787e Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/rare_kaon
# Conflicts:
#	extras/Hadrons/Modules.hpp
#	extras/Hadrons/Modules/MFermion/GaugeProp.hpp
#	extras/Hadrons/modules.inc
#	tests/hadrons/Test_hadrons.hpp
#	tests/hadrons/Test_hadrons_meson_3pt.cc
2017-06-22 16:26:31 +02:00
Lanny91
dc6b2d30d2 Documentation fix 2017-06-22 16:09:45 +02:00
Lanny91
7a3bd5c66c Hadrons: new conserved current contraction test (for regression testing) 2017-06-22 16:06:15 +02:00
Lanny91
18211eb5b1 Hadrons: Fixed test to use new implementation of meson module. 2017-06-22 16:03:59 +02:00
Lanny91
863bb2ad10 Moving overly-specialised code out of Grid 2017-06-22 16:02:15 +02:00
paboyle
5e4bea8f20 Benchmark DWF works 2017-06-22 08:38:54 +01:00
paboyle
6ebf9f15b7 Splitting communicators first cut 2017-06-22 08:14:34 +01:00
paboyle
1d7aa673a4 Include BlockCG by default 2017-06-21 21:08:53 +01:00
paboyle
b9104f3072 Block CG 2017-06-21 21:08:03 +01:00
b22eab8c8b Merge commit 'a7d56523abee6c9030fdd9303c79954897b1086f' into feature/hadrons 2017-06-21 18:32:48 +01:00
paboyle
a7d56523ab Merge branch 'feature/lanczos-simplify' into develop 2017-06-21 14:03:20 +01:00
paboyle
9e56c65730 Updated TODO list 2017-06-21 14:02:58 +01:00
paboyle
ef4f2b8c41 todo update 2017-06-21 09:22:20 +01:00
paboyle
e8b95bd35b Clean up finished. Could shrink Lanczos to around 400 lines at a push 2017-06-21 02:50:09 +01:00
paboyle
7e35286860 Simplified lanczos, added Eigen diagonalisation.
Curious if we can deprecate dependencly on BLAS.
Will see when we get 48^3 running on our BG/Q port
2017-06-21 02:26:03 +01:00
paboyle
0486ff8e79 Improved the lancos 2017-06-20 18:46:01 +01:00
1e8a2e1621 various compatibility fixes after merge 2017-06-20 17:24:55 +01:00
7587df831a Merge branch 'develop' into feature/hadrons
# Conflicts:
#	lib/qcd/action/scalar/ScalarImpl.h
2017-06-20 15:50:39 +01:00
Azusa Yamaguchi
e9cc21900f Block solver complete for staggered. Now stable on mass 0.003 and
gives 8x (!) speed up on Haswell laptop vs. standard CG for 8 RHS solves.

166 iterations vs. 537 iterations so algorithmic gain + 2x in flop rate gain.

Better than a slap in the face with a wet kipper.
2017-06-20 12:37:41 +01:00
Azusa Yamaguchi
0a8faac271 Fix make tests compile 2017-06-19 22:54:18 +01:00
Azusa Yamaguchi
abc4de0fd2 No compile make tests fix 2017-06-19 22:03:03 +01:00
b672717096 Test_serialiation update for JSON 2017-06-19 14:38:39 +01:00
284ee194b1 JSON update 2017-06-19 14:38:15 +01:00
Azusa Yamaguchi
cfe3cd76d1 Block solver improvements 2017-06-19 14:04:21 +01:00
Azusa Yamaguchi
3fa5e3109f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-06-19 14:01:44 +01:00
paboyle
8b7049f737 Improved detectino of usqcdInfo for plaq/linktr 2017-06-19 08:46:07 +01:00
paboyle
c85024683e Merge branch 'feature/parallelio' into develop 2017-06-19 01:39:48 +01:00
81b18f843a Merge branch 'feature/scalar_adjointFT' into feature/hadrons
# Conflicts:
#	lib/qcd/action/scalar/ScalarImpl.h
2017-06-16 17:59:55 +01:00
Lanny91
1bd311ba9c Faster sequential conserved current implementation, now compatible with 5D vectorisation & G-parity. 2017-06-16 16:43:15 +01:00
Lanny91
41af8c12d7 Code cleaning for conserved current contractions. Will now be easier to implement mobius conserved current. 2017-06-16 16:38:59 +01:00
Lanny91
a833f88c32 Added missing SIMD integer reduction implementation for AVX, AVX-512, SSE4, IMCI 2017-06-16 15:58:47 +01:00
Lanny91
07b2c1b253 Placeholder precision change functions to allow Grid to compile with QPX (warning: no actual functionality) 2017-06-16 15:04:26 +01:00
Lanny91
735cbdb983 QPX Integer reduction (+ integer reduction test) 2017-06-14 10:55:10 +01:00
Lanny91
2ad54c5a02 QPX exchange support 2017-06-14 10:53:39 +01:00
Nils Meyer
3d04dc33c6 ARM neon intrinsics support 2017-06-13 13:26:59 +02:00
James Harrison
20e92a7009 QedVFol: Allow output of scalar propagator and vacuum polarisation projected to arbitrary lattice momentum, not just zero-momentum. 2017-06-12 18:27:32 +01:00
Lanny91
5633a2db20 Faster implementation of conserved current site contraction. Added 5D vectorised support, but not G-parity. 2017-06-12 10:41:02 +01:00
Lanny91
2d433ba307 Changed header include guards to match new convention 2017-06-12 10:32:14 +01:00
James Harrison
42f0afcbfa QedFVol: Output all scalar VP diagrams separately 2017-06-09 18:08:40 +01:00
Azusa Yamaguchi
70ab598c96 Move gfix into utils 2017-06-08 22:22:23 +01:00
Azusa Yamaguchi
1d0ca65e28 Move Gfix into utils 2017-06-08 22:21:50 +01:00
Azusa Yamaguchi
2bc4d0a20e Move code into utils 2017-06-08 22:21:25 +01:00
James Harrison
20ac13fdf3 QedFVol: add ChargedProp as an input to ScalarVP module, instead of calculating scalar propagator within ScalarVP. 2017-06-08 17:43:39 +01:00
2490816297 Hadrons: rare kaon program removed 2017-06-07 20:11:02 -05:00
5f55bca378 Hadrons: Quark module renamed MFermion::GaugeProp 2017-06-07 20:10:48 -05:00
James Harrison
e38612e6fa QedFVol: Update ScalarVP module for compatibility with new scalar action 2017-06-07 17:42:00 +01:00
James Harrison
c2b2b71c5d Merge branch 'feature/qed-fvol' of https://github.com/paboyle/Grid into feature/qed-fvol
# Conflicts:
#	extras/Hadrons/Modules.hpp
#	extras/Hadrons/modules.inc
2017-06-07 16:59:47 +01:00
James Harrison
009f48a904 QedFVol: Add missing factor of 2 in free vacuum polarisation 2017-06-07 16:34:09 +01:00
Lanny91
b8e45ae490 Fixed remaining fermion type aliases after merge. 2017-06-07 16:26:22 +01:00
Lanny91
b35fc4e7f9 Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/rare_kaon
# Conflicts:
#	extras/Hadrons/Global.hpp
#	tests/hadrons/Test_hadrons_rarekaon.cc
2017-06-07 14:38:51 +01:00
Lanny91
60f11bfd72 Removed redundant test module 2017-06-07 12:34:47 +01:00
f6aa82b7f2 Merge branch 'develop' into feature/hadrons 2017-06-06 11:46:33 -05:00
22749699a3 Fixes after merge and point sink module 2017-06-06 11:45:30 -05:00
Lanny91
8d442b502d Sequential current fix for spacial indices. 2017-06-06 17:06:40 +01:00
Lanny91
e5c8b7369e Boundary condition option in quark actions for hadrons tests. 2017-06-06 14:19:10 +01:00
0503c028be Merge branch 'feature/qed-fvol' into feature/hadrons (non-trivial conflicts on scalar Impl)
# Conflicts:
#	configure.ac
#	lib/qcd/action/scalar/Scalar.h
2017-06-05 16:37:47 -05:00
Lanny91
c504b4dbad Code cleaning 2017-06-05 15:56:43 +01:00
Lanny91
622a21bec6 Improvements to sequential conserved current test and small bugfix. 2017-06-05 15:55:32 +01:00
Lanny91
eec79e0a1e Ward Identity test improvements and conserved current bug fixes 2017-06-05 11:55:41 +01:00
Guido Cossu
4a8c4ccfba Test wilson flow, added maxTau for adaptive flow 2017-06-02 17:02:29 +01:00
Guido Cossu
9b44189d5a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-06-02 16:56:00 +01:00
Guido Cossu
7da4856e8e Wilson flow with adaptive steps 2017-06-02 16:55:53 +01:00
Guido Cossu
aaf1e33a77 Adding adaptive integration in the WilsonFlow 2017-06-02 16:32:35 +01:00
d8648307ff Merge branch 'develop' into feature/hadrons 2017-05-29 12:58:08 +01:00
064315c00b Hadrons: mesons gamma list fix 2017-05-29 12:57:33 +01:00
Guido Cossu
7c6cc85df6 Updating WilsonFlow test 2017-05-27 18:03:49 +01:00
Guido Cossu
a6691ef87c Merge pull request #110 from Lanny91/feature/hadrons
Hadrons: Fermion boundary conditions can now be set in measurement code.
2017-05-26 16:43:22 +01:00
Lanny91
23135aa58a Merge branch 'feature/hadrons' of https://github.com/paboyle/Grid into feature/rare_kaon 2017-05-26 16:00:50 +01:00
Lanny91
8e0ced627a Hadrons: Fermion boundary conditions can now be set in measurement code. 2017-05-26 15:59:15 +01:00
Guido Cossu
0de314870d Faster derivative for WilsonGauge 2017-05-26 14:31:49 +01:00
Guido Cossu
ffb91e53d2 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2017-05-26 12:46:02 +01:00
Guido Cossu
f4e8bf2858 Fixing the topological charge. Wilson Flow tested, ok 2017-05-26 12:45:59 +01:00
a74c34315c Bootstrap script fix 2017-05-25 14:27:49 +01:00
Guido Cossu
75856f2945 Compilation fix in the Tensor_exp 2017-05-25 12:44:56 +01:00
Guido Cossu
3c112a7a25 Small correction to the general exp definition 2017-05-25 12:09:00 +01:00
Guido Cossu
ab3596d4d3 Using Cayley-Hamilton form for the exponential of SU(3) matrices 2017-05-25 12:07:47 +01:00
Lanny91
08b314fd0f Hadrons: conserved current test fixes. Axial current tests now also optional. 2017-05-18 13:16:14 +01:00
22f4feee7b Merge branch 'develop' into feature/scalar_adjointFT 2017-05-17 13:27:13 +02:00
3f858d6755 Scalar: phi^2 observable 2017-05-17 13:25:14 +02:00
Lanny91
34332fe393 Improvement to sequential conserved current insertion tests 2017-05-12 16:30:43 +01:00
Lanny91
c2010f21ab Added sequential propagator test for gamma matrix insertion 2017-05-12 16:23:01 +01:00
Lanny91
98f610ce53 Reduced code duplication in hadron tests 2017-05-12 16:15:26 +01:00
Lanny91
d44cc204d1 Added test module for sequential gamma matrix insertion 2017-05-12 14:58:17 +01:00
35fa3d1dfd Merge branch 'master' into feature/scalar_adjointFT 2017-05-12 10:41:39 +01:00
paboyle
c4435e6beb Merge branch 'release/v0.7.0' 2017-05-12 01:15:59 +01:00
d1ece74137 HMC scalar test: magnetisation measurement 2017-05-11 11:40:44 +01:00
43c817cc67 Scalar action: const fix 2017-05-11 00:07:17 +01:00
James Harrison
5cfc0180aa QedFVol: Output free VP along with charged VP. 2017-05-09 12:46:57 +01:00
James Harrison
914f180fa3 QedFVol: Implement exact O(alpha) vacuum polarisation. 2017-05-09 11:46:25 +01:00
paboyle
51bf1501fc Merge branch 'release/v0.7.0' 2017-05-06 18:42:50 +01:00
Guido Cossu
741bc836f6 Exposing support for Ncolours and Ndimensions and JSON input file for the ScalarAction 2017-05-05 17:36:43 +01:00
James Harrison
6cb563a40c QedFVol: Access HVP tensor using a vector<vector<ScalarField>> instead of vector<vector<ScalarField*>> 2017-05-05 17:12:41 +01:00
Guido Cossu
8546d01a4c Merge branch 'develop' into feature/scalar_adjointFT 2017-05-05 15:47:33 +01:00
Lanny91
77e0af9c2e Compilation fix after merge - conserved current code not yet operational for vectorised 5D or Gparity Impl. 2017-05-05 12:27:50 +01:00
Lanny91
ca1077c560 Merge branch 'develop' of https://github.com/paboyle/Grid into feature/rare_kaon
# Conflicts:
#	lib/qcd/action/fermion/WilsonFermion5D.cc
#	tests/hadrons/Test_hadrons_rarekaon.cc
2017-05-04 16:22:33 +01:00
James Harrison
db3837be22 QedFVol: Change “double” to “Real” in ScalarVP.cc 2017-05-03 13:26:49 +01:00
James Harrison
2f0dd83016 Calculate HVP using a single contraction of O(alpha) charged propagators. 2017-05-03 12:53:41 +01:00
Guido Cossu
62a64d9108 EO support, wip 2017-05-01 11:06:21 +01:00
Lanny91
49331a3e72 Minor improvements to Ward Identity checks 2017-04-28 16:50:17 +01:00
Lanny91
51d84ec057 Bugfixes in Wilson 5D sequential conserved current insertion 2017-04-28 16:49:14 +01:00
Lanny91
db14fb30df Hadrons: overhaul of conserved current test 2017-04-28 16:48:00 +01:00
Lanny91
b9356d3866 Added more complete test of sequential insertion of conserved current. 2017-04-28 16:46:40 +01:00
Guido Cossu
99a73f4287 Correcting the M and Mdag in the clover term 2017-04-28 15:51:05 +01:00
Lanny91
f302eea91e SitePropagator redefined to be a scalar object in TYPE_ALIASES. 2017-04-28 15:27:49 +01:00
Guido Cossu
5553b8d2b8 Clover term compiles, not tested 2017-04-28 15:23:34 +01:00
Lanny91
a6ccbbe108 Conserved current sequential source now registered properly and fixed module inputs. 2017-04-28 10:43:47 +01:00
James Harrison
3ac27e5596 QedFVol: remove unnecessary copies of free propagator from shifted sources in ScalarVP 2017-04-27 14:17:50 +01:00
Lanny91
d2003f24f4 Corrected incorrect usage of ExtractSlice for conserved current code. 2017-04-26 17:25:28 +01:00
Lanny91
6299dd35f5 Hadrons: Added test of conserved current code. Tests Ward identities for conserved vector and partially conserved axial currents. 2017-04-26 12:41:39 +01:00
Lanny91
a39daecb62 Removed make_5D const declaration to avoid compilation error 2017-04-26 12:39:07 +01:00
Lanny91
159770e21b Legal Banners added 2017-04-26 09:32:57 +01:00
Lanny91
dc5a6404ea Hadrons: modules for testing conserved current contractions and sequential insertion. 2017-04-25 22:08:33 +01:00
Lanny91
44260643f6 First conserved current implementation for Wilson fermions only. Not implemented for Gparity or 5D-vectorised Wilson fermions. 2017-04-25 18:00:24 +01:00
Lanny91
1425afc72f Rare Kaon test fix 2017-04-25 17:26:56 +01:00
James Harrison
bd466a55a8 QedFVol: remove charge dependence in chargedProp function of ScalarVP 2017-04-25 10:04:03 +01:00
Guido Cossu
752048f410 Merge branch 'develop' into feature/clover 2017-04-24 14:41:20 +01:00
Guido Cossu
b694996302 adding comments 2017-04-14 13:30:14 +01:00
James Harrison
c8e6f58e24 Fix typos in ScalarVP 2017-04-13 17:04:37 +01:00
James Harrison
888988ad37 Merge branch 'feature/qed-fvol' of https://github.com/paboyle/Grid into feature/qed-fvol
# Conflicts:
#	lib/qcd/action/fermion/Fermion.h
2017-04-13 15:54:40 +01:00
1407418755 Old qed-fvol program build disabled 2017-04-13 15:32:30 +01:00
a6a0da873f Merge branch 'feature/hadrons' into feature/qed-fvol 2017-04-13 15:31:06 +01:00
Lanny91
c382c351a5 Quark test output correction. 2017-04-12 14:36:15 +01:00
Lanny91
af2d6ce2e0 Encapsulated 4D->5D and 5D->4D conversions in separate functions & added corresponding tests. 2017-04-12 14:36:02 +01:00
Lanny91
ac1253bb76 Corrected solver in rare kaon test 2017-04-10 17:42:55 +01:00
James Harrison
e4a105a30b Merge branch 'feature/qed-fvol' of https://github.com/paboyle/Grid into feature/qed-fvol 2017-04-10 16:35:01 +01:00
James Harrison
26ebe41fef QedFVol: Implement charged propagator calculation within ScalarVP module 2017-04-10 16:33:54 +01:00
Guido Cossu
363611ae21 Merge branch 'develop' into feature/clover 2017-04-05 16:26:04 +01:00
Guido Cossu
3b8a791e28 Merge branch 'develop' into feature/clover 2017-04-05 16:20:28 +01:00
Guido Cossu
7b03d8d087 Fixing the remaining merge conflicts 2017-04-05 16:17:46 +01:00
Guido Cossu
4b759b8f2a Merge branch 'feature/hmc_generalise' into feature/scalar_adjointFT 2017-04-05 14:50:28 +01:00
Guido Cossu
6fd82228bf Working on the derivative 2017-04-05 10:51:44 +01:00
Guido Cossu
ca6efc685e Merge branch 'develop' into feature/clover 2017-04-04 10:19:02 +01:00
1e496fee74 Merge branch 'develop' into feature/qed-fvol
# Conflicts:
#	lib/qcd/action/fermion/Fermion.h
2017-04-03 19:02:57 +01:00
Guido Cossu
b8ae787b5e Correcting a simple typo 2017-03-30 11:33:15 +01:00
Guido Cossu
fbe2c3b5f9 ]Merge branch 'develop' into feature/clover 2017-03-30 11:18:31 +01:00
Guido Cossu
1ed69816b9 First steps for the force term 2017-03-30 11:14:27 +01:00
James Harrison
9f755e0379 Add functions momD1 and momD2 to ScalarVP 2017-03-27 16:49:18 +01:00
James Harrison
4512dbdf58 Rename module ScalarFV to ScalarVP 2017-03-27 15:02:16 +01:00
James Harrison
483fd3cfa1 Add propagator expansion terms as inputs to ScalarFV 2017-03-27 13:24:51 +01:00
Guido Cossu
3750b9ffee Deleting MPI test for OSX in Travis 2017-03-27 16:53:32 +09:00
Guido Cossu
5e549ebd8b Adding force terms 2017-03-27 16:43:15 +09:00
Guido Cossu
fff484eca5 Populating Clover fermions methods 2017-03-27 15:12:57 +09:00
Guido Cossu
5fdc05782b More in the clover fermion class 2017-03-27 10:54:16 +09:00
James Harrison
85516e9c7c Output all terms of scalar propagator separately 2017-03-24 17:13:55 +00:00
James Harrison
0c006fbfaa Add ScalarFV inputs to ScalarFV.hpp 2017-03-24 11:59:09 +00:00
James Harrison
54c10a42cc Add source and emField inputs to ScalarFV module 2017-03-24 11:42:32 +00:00
Guido Cossu
a04eb7df5d Starting Clover term 2017-03-24 12:43:28 +09:00
James Harrison
ef0fe2bcc1 Added empty ScalarFV module 2017-03-21 11:39:46 +00:00
Guido Cossu
038b6ee9cd Fixing JSON compilation error 2017-03-16 01:09:24 +09:00
Guido Cossu
38806343a8 Improving efficiency of the force term 2017-03-15 15:16:16 +09:00
Guido Cossu
831ca4e3bf Added Scalar action for fields in the adjoint representation 2017-03-14 14:55:18 +09:00
eedcaf6470 Merge branch 'feature/hadrons' into feature/qed-fvol 2017-02-01 15:53:10 -08:00
b39f0d1fb6 Hadrons: default I/O to HDF5 if possible, XML otherwise 2017-01-27 18:12:35 -08:00
9f1267dfe6 Merge branch 'feature/qed-fvol' of github.com:paboyle/Grid into feature/qed-fvol 2017-01-27 17:06:34 -08:00
2e90285232 Merge pull request #80 from jch1g10/feature/qed-fvol
ChargedProp: remove ScalarField fs
2017-01-27 17:06:13 -08:00
e254de982e Merge branch 'feature/qed-fvol' of github.com:paboyle/Grid into feature/qed-fvol 2017-01-27 17:02:35 -08:00
28d99b5297 Merge branch 'develop' into feature/qed-fvol 2017-01-27 16:59:53 -08:00
James Harrison
ee93f0218b ChargedProp: remove ScalarField fs 2017-01-27 12:22:48 +00:00
161ed102a5 Merge pull request #79 from jch1g10/feature/qed-fvol
Fixed bug in ChargedProp
2017-01-26 19:49:14 -08:00
James Harrison
f65a585236 ChargedProp: Switch to HDF5 output 2017-01-26 15:02:30 +00:00
James Harrison
ae99e99da2 Fixed bug in ChargedProp 2017-01-23 17:27:50 +00:00
f3ca29af6c Merge branch 'feature/hadrons' into feature/qed-fvol 2017-01-21 13:41:05 -08:00
37988221a8 Merge branch 'feature/serialisation-hdf5' into feature/qed-fvol 2017-01-20 14:04:20 -08:00
7a327a3f28 Merge branch 'develop' into feature/qed-fvol 2017-01-19 14:22:36 -08:00
92f8950a56 Charged scalar prop: cleaning and output 2017-01-13 13:30:56 +00:00
65987a8a58 First implementation of the scalar QED propagator, runs but absolutely not checked 2017-01-12 20:44:23 +00:00
889d828bc2 Code cleaning 2017-01-12 18:17:44 +00:00
ad98b6193d creating the necessary caches for the FFT EM scalar propagator 2017-01-11 18:40:43 +00:00
fc760016b3 More uniform cache name for scalar momentum propagators 2017-01-11 18:39:58 +00:00
2da86f7dae Merge branch 'feature/hadrons' into feature/qed-fvol 2017-01-11 18:38:05 +00:00
97843e2b58 Hadrons: free scalar buffer fix and output 2017-01-05 14:58:55 +00:00
82b3f54697 scalar free propagator fix 2017-01-05 14:58:07 +00:00
673994b281 Hadrons: modules for scalar propagators 2016-12-29 22:44:58 +01:00
bbc0eff078 Hadrons: scalar sources 2016-12-29 22:44:22 +01:00
4c60e31070 Hadrons: code cleaning 2016-12-29 22:44:08 +01:00
afbf7d4c37 QED Gimpl moved in Photon.h 2016-12-29 22:43:38 +01:00
8c3cc32364 Scalar action 2016-12-29 22:42:58 +01:00
4c3fd9fa3f stochastic QED field module in Hadrons 2016-12-22 00:29:41 +01:00
17b3a10d46 stochastic QED: function to cache 1/sqrt(khat^2) 2016-12-22 00:29:19 +01:00
149a46b92c Merge branch 'feature/hadrons' into feature/qed-fvol 2016-12-22 00:26:43 +01:00
db9c28a773 qed-fvol: Photon parameter name fix 2016-12-20 12:41:39 +01:00
9ac3ac41df serialisable Photon parameters 2016-12-20 12:41:01 +01:00
2af9ab9034 old Makefile cleaning 2016-12-20 12:40:26 +01:00
6f1ea96293 Merge branch 'develop' into feature/qed-fvol 2016-12-20 12:33:02 +01:00
2e3c5890b6 qed-fvol: build fix 2016-12-15 20:06:46 +00:00
bc6678732f Merge branch 'feature/hadrons' into feature/qed-fvol
# Conflicts:
#	Makefile.am
#	configure.ac
#	lib/qcd/action/gauge/Photon.h
2016-12-15 19:53:00 +00:00
b10ae00c8a Merge commit '6ad73145bc9754a5f26093eee5a34473ba0cff82' into feature/qed-fvol 2016-12-15 19:48:58 +00:00
Azusa Yamaguchi
0cd6b1858c Merge branch 'develop' of https://github.com/paboyle/Grid into feature/staggering 2016-12-14 09:23:22 +00:00
James Harrison
6ad73145bc Calculate Wilson loop average over multiple configurations. 2016-11-30 15:17:22 +00:00
f7293f2ddb Merge pull request #69 from jch1g10/feature/qed-fvol 2016-11-26 07:04:04 +09:00
James Harrison
6b8ee7bae0 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-11-15 13:08:08 +00:00
James Harrison
739c2308b5 Set imaginary part of stochastic QED field to zero using real() instead of conjugate(). 2016-11-15 13:07:52 +00:00
James Harrison
a71b69389b QedFVol: calculate square Wilson loops up to 10x10 2016-11-14 18:23:04 +00:00
James Harrison
d49e502f53 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-11-14 18:00:33 +00:00
James Harrison
92ec3404f8 Set imaginary part of stochastic QED field to zero after FFT into position space 2016-11-14 17:59:02 +00:00
James Harrison
f4ebea3381 QedFVol: add functions for computing spatial and timelike Wilson loops 2016-11-14 17:51:53 +00:00
James Harrison
cf167d0cd1 QedFVol: implement exponentiation of photon field 2016-11-14 17:02:29 +00:00
paboyle
c363bdd784 Merge branch 'release/v0.6.0' 2016-11-09 12:43:14 +00:00
James Harrison
c30d96ea50 QedFVol: x86intrin.h namespace fix 2016-11-09 11:06:20 +00:00
James Harrison
7ffe17ada1 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-11-08 14:52:43 +00:00
330a9b3f4c Merge pull request #65 from jch1g10/feature/qed-fvol 2016-11-01 19:53:25 +00:00
James Harrison
28ff66a381 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-11-01 16:07:46 +00:00
James Harrison
78c7bcee36 QedFVol: Change variables of type "double" to type "Real". 2016-11-01 16:06:05 +00:00
00a7b95631 Merge remote-tracking branch 'gh-james/feature/qed-fvol' into feature/qed-fvol 2016-10-31 18:46:23 +00:00
94d8321d01 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-10-31 18:41:30 +00:00
James Harrison
ac24cc9f99 Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-10-29 11:05:26 +01:00
James Harrison
3ab4c8c0bb QedFVol: calculate plaquette and 2x2 Wilson loop of stochastic QED field 2016-10-25 13:32:02 +01:00
26d124283e Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-10-21 15:23:31 +01:00
0d889b7041 QedFVol: first attempt at generating a QED field 2016-10-21 15:21:32 +01:00
ab31ad006a Merge branch 'feature/feynman-rules' into feature/qed-fvol 2016-10-21 14:42:18 +01:00
6e4a06e180 qed-fvol: initial commit 2016-10-20 15:04:00 +01:00
paboyle
446c768cd3 Merge branch 'hotfix/v0.5.1'
Double precision compile fix
2016-07-01 16:33:59 +01:00
1125 changed files with 131351 additions and 76582 deletions

22
.gitignore vendored
View File

@@ -83,6 +83,7 @@ ltmain.sh
.Trashes
ehthumbs.db
Thumbs.db
.dirstamp
# build directory #
###################
@@ -93,14 +94,12 @@ build*/*
*.xcodeproj/*
build.sh
.vscode
*.code-workspace
# Eigen source #
################
lib/Eigen/*
# FFTW source #
################
lib/fftw/*
Grid/Eigen
Eigen/*
# libtool macros #
##################
@@ -111,15 +110,8 @@ m4/libtool.m4
################
gh-pages/
# Buck files #
##############
.buck*
buck-out
BUCK
make-bin-BUCK.sh
# generated sources #
#####################
lib/qcd/spin/gamma-gen/*.h
lib/qcd/spin/gamma-gen/*.cc
Grid/qcd/spin/gamma-gen/*.h
Grid/qcd/spin/gamma-gen/*.cc
Grid/util/Version.h

View File

@@ -9,68 +9,11 @@ matrix:
- os: osx
osx_image: xcode8.3
compiler: clang
- compiler: gcc
dist: trusty
sudo: required
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.9
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-4.9
- compiler: gcc
dist: trusty
sudo: required
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-5
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-5
- compiler: clang
dist: trusty
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- compiler: clang
dist: trusty
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
env: PREC=single
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=double
before_install:
- export GRIDDIR=`pwd`
@@ -78,9 +21,11 @@ before_install:
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
install:
- export CWD=`pwd`
- echo $CWD
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
@@ -93,22 +38,24 @@ install:
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
script:
- ./bootstrap.sh
- mkdir build
- cd build
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
- mkdir lime
- cd lime
- mkdir build
- cd build
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
- tar xf lime-1.3.2.tar.gz
- cd lime-1.3.2
- ./configure --prefix=$CWD/build/lime/install
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- echo make clean
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
- make install
- cd $CWD/build
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check
- echo make clean
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi

View File

@@ -0,0 +1,5 @@
Version : 0.8.0
- Clang 3.5 and above, ICPC v16 and above, GCC 6.3 and above recommended
- MPI and MPI3 comms optimisations for KNL and OPA finished
- Half precision comms

63
Grid/DisableWarnings.h Normal file
View File

@@ -0,0 +1,63 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/DisableWarnings.h
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
//disables and intel compiler specific warning (in json.hpp)
#pragma warning disable 488
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon
//Eigen only
#endif
// Disable vectorisation in Eigen on the Power8/9 and PowerPC
#ifdef __ALTIVEC__
#define EIGEN_DONT_VECTORIZE
#endif
#ifdef __VSX__
#define EIGEN_DONT_VECTORIZE
#endif
#endif

View File

@@ -41,6 +41,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/Action.h>
#include <Grid/qcd/utils/GaugeFix.h>
#include <Grid/qcd/utils/CovariantSmearing.h>
#include <Grid/qcd/smearing/Smearing.h>
#include <Grid/parallelIO/MetaData.h>
#include <Grid/qcd/hmc/HMC_aggregate.h>

View File

@@ -38,16 +38,20 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_BASE_H
#define GRID_BASE_H
#include <Grid/GridStd.h>
#include <Grid/DisableWarnings.h>
#include <Grid/Namespace.h>
#include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h>
#include <Grid/allocator/AlignedAllocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/threads/Threads.h>
#include <Grid/util/Util.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h>
#include <Grid/cartesian/Cartesian.h>
#include <Grid/tensors/Tensors.h>
@@ -56,5 +60,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/stencil/Stencil.h>
#include <Grid/parallelIO/BinaryIO.h>
#include <Grid/algorithms/Algorithms.h>
NAMESPACE_CHECK(GridCore)
#endif

View File

@@ -38,5 +38,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore);
#endif

View File

@@ -7,6 +7,7 @@
#include <cassert>
#include <complex>
#include <vector>
#include <array>
#include <string>
#include <iostream>
#include <iomanip>

41
Grid/Grid_Eigen_Dense.h Normal file
View File

@@ -0,0 +1,41 @@
#include <Grid/GridCore.h>
#pragma once
// Force Eigen to use MKL if Grid has been configured with --enable-mkl
#ifdef USE_MKL
#define EIGEN_USE_MKL_ALL
#endif
#if defined __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#pragma diag_suppress code_is_unreachable
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __NVCC__
#undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__
#endif
#include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
/* NVCC restore */
#ifdef __NVCC__REDEFINE__
#pragma pop_macro("__CUDACC__")
#pragma pop_macro("__NVCC__")
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop
#endif
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif

1
Grid/Grid_Eigen_Tensor.h Normal file
View File

@@ -0,0 +1 @@
#include <Grid/Grid_Eigen_Dense.h>

63
Grid/Makefile.am Normal file
View File

@@ -0,0 +1,63 @@
extra_sources=
extra_headers=
if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_mpi3.cc
extra_sources+=communicator/Communicator_base.cc
extra_sources+=communicator/SharedMemoryMPI.cc
extra_sources+=communicator/SharedMemory.cc
endif
if BUILD_COMMS_NONE
extra_sources+=communicator/Communicator_none.cc
extra_sources+=communicator/Communicator_base.cc
extra_sources+=communicator/SharedMemoryNone.cc
extra_sources+=communicator/SharedMemory.cc
endif
if BUILD_HDF5
extra_sources+=serialisation/Hdf5IO.cc
extra_headers+=serialisation/Hdf5IO.h
extra_headers+=serialisation/Hdf5Type.h
endif
all: version-cache
version-cache:
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
a="uncommited changes";\
else\
a="clean";\
fi;\
echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d $$a\\"%n" HEAD`" > vertmp;\
if [ -e version-cache ]; then\
d=`diff vertmp version-cache`;\
if [ "$${d}" != "" ]; then\
mv vertmp version-cache;\
rm -f Version.h;\
fi;\
else\
mv vertmp version-cache;\
rm -f Version.h;\
fi;\
rm -f vertmp
Version.h:
cp version-cache Version.h
.PHONY: version-cache
#
# Libraries
#
include Make.inc
include Eigen.inc
lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources)
HFILES += $(extra_headers) Config.h Version.h
libGrid_a_SOURCES = $(CCFILES)
libGrid_adir = $(includedir)/Grid
nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) $(eigen_unsupp_files)

38
Grid/Namespace.h Normal file
View File

@@ -0,0 +1,38 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Namespace.h
Copyright (C) 2016
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <type_traits>
#include <cassert>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }
#define GRID_NAMESPACE_BEGIN NAMESPACE_BEGIN(Grid)
#define GRID_NAMESPACE_END NAMESPACE_END(Grid)
#define NAMESPACE_CHECK(x) struct namespaceTEST##x {}; static_assert(std::is_same<namespaceTEST##x, ::namespaceTEST##x>::value,"Not in :: at" );

View File

@@ -37,37 +37,27 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/approx/Chebyshev.h>
#include <Grid/algorithms/approx/Remez.h>
#include <Grid/algorithms/approx/MultiShiftFunction.h>
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
// Lanczos support
//#include <Grid/algorithms/iterative/MatrixUtils.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
#include <Grid/algorithms/iterative/MinimalResidual.h>
#include <Grid/algorithms/iterative/GeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
#include <Grid/algorithms/CoarsenedMatrix.h>
#include <Grid/algorithms/FFT.h>
// Eigen/lanczos
// EigCg
// MCR
// Pcg
// Multishift CG
// Hdcg
// GCR
// etc..
// integrator/Leapfrog
// integrator/Omelyan
// integrator/ForceGradient
// montecarlo/hmc
// montecarlo/rhmc
// montecarlo/metropolis
// etc...
#endif

View File

@@ -0,0 +1,512 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/CoarsenedMatrix.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
#define GRID_ALGORITHM_COARSENED_MATRIX_H
NAMESPACE_BEGIN(Grid);
class Geometry {
// int dimension;
public:
int npoint;
std::vector<int> directions ;
std::vector<int> displacements;
Geometry(int _d) {
int base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
for(int d=0;d<_d;d++){
directions[2*d ] = d+base;
directions[2*d+1] = d+base;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
}
directions [2*_d]=0;
displacements[2*_d]=0;
//// report back
std::cout<<GridLogMessage<<"directions :";
for(int d=0;d<npoint;d++) std::cout<< directions[d]<< " ";
std::cout <<std::endl;
std::cout<<GridLogMessage<<"displacements :";
for(int d=0;d<npoint;d++) std::cout<< displacements[d]<< " ";
std::cout<<std::endl;
}
/*
// Original cleaner code
Geometry(int _d) : dimension(_d), npoint(2*_d+1), directions(npoint), displacements(npoint) {
for(int d=0;d<dimension;d++){
directions[2*d ] = d;
directions[2*d+1] = d;
displacements[2*d ] = +1;
displacements[2*d+1] = -1;
}
directions [2*dimension]=0;
displacements[2*dimension]=0;
}
std::vector<int> GetDelta(int point) {
std::vector<int> delta(dimension,0);
delta[directions[point]] = displacements[point];
return delta;
};
*/
};
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
// std::cout << GridLogMessage <<" Gramm-Schmidt checking orthogonality"<<std::endl;
// CheckOrthogonal();
}
void CheckOrthogonal(void){
CoarseVector iProj(CoarseGrid);
CoarseVector eProj(CoarseGrid);
for(int i=0;i<nbasis;i++){
blockProject(iProj,subspace[i],subspace);
eProj=Zero();
thread_for(ss, CoarseGrid->oSites(),{
eProj[ss](i)=CComplex(1.0);
});
eProj=eProj - iProj;
std::cout<<GridLogMessage<<"Orthog check error "<<i<<" " << norm2(eProj)<<std::endl;
}
std::cout<<GridLogMessage <<"CheckOrthog done"<<std::endl;
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
void CreateSubspaceRandom(GridParallelRNG &RNG){
for(int i=0;i<nbasis;i++){
random(RNG,subspace[i]);
std::cout<<GridLogMessage<<" norm subspace["<<i<<"] "<<norm2(subspace[i])<<std::endl;
}
Orthogonalise();
}
/*
virtual void CreateSubspaceLanczos(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
// Run a Lanczos with sloppy convergence
const int Nstop = nn;
const int Nk = nn+20;
const int Np = nn+20;
const int Nm = Nk+Np;
const int MaxIt= 10000;
RealD resid = 1.0e-3;
Chebyshev<FineField> Cheb(0.5,64.0,21);
ImplicitlyRestartedLanczos<FineField> IRL(hermop,Cheb,Nstop,Nk,Nm,resid,MaxIt);
// IRL.lock = 1;
FineField noise(FineGrid); gaussian(RNG,noise);
FineField tmp(FineGrid);
std::vector<RealD> eval(Nm);
std::vector<FineField> evec(Nm,FineGrid);
int Nconv;
IRL.calc(eval,evec,
noise,
Nconv);
// pull back nn vectors
for(int b=0;b<nn;b++){
subspace[b] = evec[b];
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
hermop.Op(subspace[b],tmp);
std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(tmp)<<std::endl;
noise = tmp - sqrt(eval[b])*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
noise = tmp + eval[b]*subspace[b] ;
std::cout<<GridLogMessage << " lambda_"<<b<<" = "<< eval[b] <<" ; [ M - Lambda ]_"<<b<<" vec_"<<b<<" = " <<norm2(noise)<<std::endl;
}
Orthogonalise();
for(int b=0;b<nn;b++){
std::cout << GridLogMessage <<"subspace["<<b<<"] = "<<norm2(subspace[b])<<std::endl;
}
}
*/
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,10000);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
Orthogonalise();
}
};
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
////////////////////
// Data members
////////////////////
Geometry geom;
GridBase * _grid;
CartesianStencil<siteVector,siteVector,int> Stencil;
std::vector<CoarseMatrix> A;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
RealD M (const CoarseVector &in, CoarseVector &out){
conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid());
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
auto in_v = in.View();
auto out_v = in.View();
thread_for(ss,Grid()->oSites(),{
siteVector res = Zero();
siteVector nbr;
int ptype;
StencilEntry *SE;
for(int point=0;point<geom.npoint;point++){
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) {
permute(nbr,in_v[SE->_offset],ptype);
} else if(SE->_is_local) {
nbr = in_v[SE->_offset];
} else {
nbr = Stencil.CommBuf()[SE->_offset];
}
auto A_point = A[point].View();
res = res + A_point[ss]*nbr;
}
vstream(out_v[ss],res);
});
return norm2(out);
};
RealD Mdag (const CoarseVector &in, CoarseVector &out){
// // corresponds to Petrov-Galerkin coarsening
// return M(in,out);
// corresponds to Galerkin coarsening
CoarseVector tmp(Grid());
G5C(tmp, in);
M(tmp, out);
G5C(out, out);
return norm2(out);
};
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){
conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid());
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
auto point = [dir, disp](){
if(dir == 0 and disp == 0)
return 8;
else
return (4 * dir + 1 - disp) / 2;
}();
auto out_v = out.View();
auto in_v = in.View();
thread_for(ss,Grid()->oSites(),{
siteVector res = Zero();
siteVector nbr;
int ptype;
StencilEntry *SE;
SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local&&SE->_permute) {
permute(nbr,in_v[SE->_offset],ptype);
} else if(SE->_is_local) {
nbr = in_v[SE->_offset];
} else {
nbr = Stencil.CommBuf()[SE->_offset];
}
auto A_point = A[point].View();
res = res + A_point[ss]*nbr;
vstream(out_v[ss],res);
});
};
void Mdiag(const CoarseVector &in, CoarseVector &out){
Mdir(in, out, 0, 0); // use the self coupling (= last) point of the stencil
};
CoarsenedMatrix(GridCartesian &CoarseGrid) :
_grid(&CoarseGrid),
geom(CoarseGrid._ndimension),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid)
{
};
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace){
FineField iblock(FineGrid); // contributions from within this block
FineField oblock(FineGrid); // contributions from outwith this block
FineField phi(FineGrid);
FineField tmp(FineGrid);
FineField zz(FineGrid); zz=Zero();
FineField Mphi(FineGrid);
Lattice<iScalar<vInteger> > coor(FineGrid);
CoarseVector iProj(Grid());
CoarseVector oProj(Grid());
CoarseScalar InnerProd(Grid());
// Orthogonalise the subblocks over the basis
blockOrthogonalise(InnerProd,Subspace.subspace);
// Compute the matrix elements of linop between this orthonormal
// set of vectors.
int self_stencil=-1;
for(int p=0;p<geom.npoint;p++){
A[p]=Zero();
if( geom.displacements[p]==0){
self_stencil=p;
}
}
assert(self_stencil!=-1);
for(int i=0;i<nbasis;i++){
phi=Subspace.subspace[i];
std::cout<<GridLogMessage<<"("<<i<<").."<<std::endl;
for(int p=0;p<geom.npoint;p++){
int dir = geom.directions[p];
int disp = geom.displacements[p];
Integer block=(FineGrid->_rdimensions[dir])/(Grid()->_rdimensions[dir]);
LatticeCoordinate(coor,dir);
if ( disp==0 ){
linop.OpDiag(phi,Mphi);
}
else {
linop.OpDir(phi,Mphi,dir,disp);
}
////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell
////////////////////////////////////////////////////////////////////////
if ( disp==0 ) {
iblock = Mphi;
oblock = Zero();
} else if ( disp==1 ) {
oblock = where(mod(coor,block)==(block-1),Mphi,zz);
iblock = where(mod(coor,block)!=(block-1),Mphi,zz);
} else if ( disp==-1 ) {
oblock = where(mod(coor,block)==(Integer)0,Mphi,zz);
iblock = where(mod(coor,block)!=(Integer)0,Mphi,zz);
} else {
assert(0);
}
Subspace.ProjectToSubspace(iProj,iblock);
Subspace.ProjectToSubspace(oProj,oblock);
// blockProject(iProj,iblock,Subspace.subspace);
// blockProject(oProj,oblock,Subspace.subspace);
auto iProj_v = iProj.View() ;
auto oProj_v = oProj.View() ;
auto A_p = A[p].View();
auto A_self = A[self_stencil].View();
thread_for(ss, Grid()->oSites(),{
for(int j=0;j<nbasis;j++){
if( disp!= 0 ) {
A_p[ss](j,i) = oProj_v[ss](j);
}
A_self[ss](j,i) = A_self[ss](j,i) + iProj_v[ss](j);
}
});
}
}
#if 0
///////////////////////////
// test code worth preserving in if block
///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< A[p] << std::endl;
}
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
// ForceHermitian();
// AssertHermitian();
// ForceDiagonal();
}
void ForceHermitian(void) {
for(int d=0;d<4;d++){
int dd=d+1;
A[2*d] = adj(Cshift(A[2*d+1],dd,1));
}
// A[8] = 0.5*(A[8] + adj(A[8]));
}
void AssertHermitian(void) {
CoarseMatrix AA (Grid());
CoarseMatrix AAc (Grid());
CoarseMatrix Diff (Grid());
for(int d=0;d<4;d++){
int dd=d+1;
AAc = Cshift(A[2*d+1],dd,1);
AA = A[2*d];
Diff = AA - adj(AAc);
std::cout<<GridLogMessage<<"Norm diff dim "<<d<<" "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm dim "<<d<<" "<< norm2(AA)<<std::endl;
}
Diff = A[8] - adj(A[8]);
std::cout<<GridLogMessage<<"Norm diff local "<< norm2(Diff)<<std::endl;
std::cout<<GridLogMessage<<"Norm local "<< norm2(A[8])<<std::endl;
}
};
NAMESPACE_END(Grid);
#endif

291
Grid/algorithms/FFT.h Normal file
View File

@@ -0,0 +1,291 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/Cshift.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_FFT_H_
#define _GRID_FFT_H_
#ifdef HAVE_FFTW
#ifdef USE_MKL
#include <fftw/fftw3.h>
#else
#include <fftw3.h>
#endif
#endif
NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { };
#ifdef HAVE_FFTW
template<> struct FFTW<ComplexD> {
public:
typedef fftw_complex FFTW_scalar;
typedef fftw_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftw_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftw_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftw_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftw_destroy_plan(p);
}
};
template<> struct FFTW<ComplexF> {
public:
typedef fftwf_complex FFTW_scalar;
typedef fftwf_plan FFTW_plan;
static FFTW_plan fftw_plan_many_dft(int rank, const int *n,int howmany,
FFTW_scalar *in, const int *inembed,
int istride, int idist,
FFTW_scalar *out, const int *onembed,
int ostride, int odist,
int sign, unsigned flags) {
return ::fftwf_plan_many_dft(rank,n,howmany,in,inembed,istride,idist,out,onembed,ostride,odist,sign,flags);
}
static void fftw_flops(const FFTW_plan p,double *add, double *mul, double *fmas){
::fftwf_flops(p,add,mul,fmas);
}
inline static void fftw_execute_dft(const FFTW_plan p,FFTW_scalar *in,FFTW_scalar *out) {
::fftwf_execute_dft(p,in,out);
}
inline static void fftw_destroy_plan(const FFTW_plan p) {
::fftwf_destroy_plan(p);
}
};
#endif
#ifndef FFTW_FORWARD
#define FFTW_FORWARD (-1)
#define FFTW_BACKWARD (+1)
#endif
class FFT {
private:
GridCartesian *vgrid;
GridCartesian *sgrid;
int Nd;
double flops;
double flops_call;
uint64_t usec;
Coordinate dimensions;
Coordinate processors;
Coordinate processor_coor;
public:
static const int forward=FFTW_FORWARD;
static const int backward=FFTW_BACKWARD;
double Flops(void) {return flops;}
double MFlops(void) {return flops/usec;}
double USec(void) {return (double)usec;}
FFT ( GridCartesian * grid ) :
vgrid(grid),
Nd(grid->_ndimension),
dimensions(grid->_fdimensions),
processors(grid->_processors),
processor_coor(grid->_processor_coor)
{
flops=0;
usec =0;
Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors);
};
~FFT ( void) {
delete sgrid;
}
template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
Lattice<vobj> tmp(vgrid);
tmp = source;
for(int d=0;d<Nd;d++){
if( mask[d] ) {
FFT_dim(result,tmp,d,sign);
tmp=result;
}
}
}
template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
Coordinate mask(Nd,1);
FFT_dim_mask(result,source,mask,sign);
}
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
assert(0);
#else
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim];
Coordinate layout(Nd,1);
Coordinate pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
auto pgbuf_v = pgbuf.View();
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
int Ncomp = sizeof(sobj)/sizeof(scalar);
int Nlow = 1;
for(int d=0;d<dim;d++){
Nlow*=vgrid->_ldimensions[d];
}
int rank = 1; /* 1d transforms */
int n[] = {G}; /* 1d transforms of length G */
int howmany = Ncomp;
int odist,idist,istride,ostride;
idist = odist = 1; /* Distance between consecutive FT's */
istride = ostride = Ncomp*Nlow; /* distance between two elements in the same FT */
int *inembed = n, *onembed = n;
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else assert(0);
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[0];
p = FFTW<scalar>::fftw_plan_many_dft(rank,n,howmany,
in,inembed,
istride,idist,
out,onembed,
ostride, odist,
sign,FFTW_ESTIMATE);
}
// Barrel shift and collect global pencil
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
});
if (p != processors[dim] - 1) {
result = Cshift(result,dim,L);
}
}
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
timer.Start();
thread_for( idx,NN,{
Coordinate cbuf(Nd);
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[idx];
FFTW_scalar *out= (FFTW_scalar *)&pgbuf_v[idx];
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
});
timer.Stop();
// performance counting
double add,mul,fma;
FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
flops_call = add+mul+2.0*fma;
usec += timer.useconds();
flops+= flops_call*NN;
// writing out result
thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf);
});
result = result*div;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,495 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/LinearOperator.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////
// LinearOperators Take a something and return a something.
/////////////////////////////////////////////////////////////////////////////////////////////
//
// Hopefully linearity is satisfied and the AdjOp is indeed the Hermitian Conjugateugate (transpose if real):
//SBase
// i) F(a x + b y) = aF(x) + b F(y).
// ii) <x|Op|y> = <y|AdjOp|x>^\ast
//
// Would be fun to have a test linearity & Herm Conj function!
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class LinearOperatorBase {
public:
// Support for coarsening to a multigrid
virtual void OpDiag (const Field &in, Field &out) = 0; // Abstract base
virtual void OpDir (const Field &in, Field &out,int dir,int disp) = 0; // Abstract base
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
// By sharing the class for Sparse Matrix across multiple operator wrappers, we can share code
// between RB and non-RB variants. Sparse matrix is like the fermion action def, and then
// the wrappers implement the specialisation of "Op" and "AdjOp" to the cases minimising
// replication of code.
//
// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
// do it, but I fear it required multiple inheritance and mixed in abstract base classes
/////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
// Construct herm op from non-herm matrix
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class MdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2);
}
void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
};
////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _shift;
public:
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
assert(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
assert(0);
}
void Op (const Field &in, Field &out){
_Mat.M(in,out);
assert(0);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
assert(0);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MdagM(in,out,n1,n2);
out = out + _shift*in;
ComplexD dot;
dot= innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
};
////////////////////////////////////////////////////////////////////
// Wrap an already herm matrix
////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class HermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.M(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.M(in,out);
ComplexD dot= innerProduct(in,out); n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.M(in,out);
}
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several
// ways to introduce the even odd checkerboarding
//////////////////////////////////////////////////////////
template<class Field>
class SchurOperatorBase : public LinearOperatorBase<Field> {
public:
virtual RealD Mpc (const Field &in, Field &out) =0;
virtual RealD MpcDag (const Field &in, Field &out) =0;
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp(in.Grid());
tmp.Checkerboard() = in.Checkerboard();
ni=Mpc(in,tmp);
no=MpcDag(tmp,out);
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
out.Checkerboard() = in.Checkerboard();
MpcDagMpc(in,out,n1,n2);
}
virtual void HermOp(const Field &in, Field &out){
RealD n1,n2;
HermOpAndNorm(in,out,n1,n2);
}
void Op (const Field &in, Field &out){
Mpc(in,out);
}
void AdjOp (const Field &in, Field &out){
MpcDag(in,out);
}
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0); // must coarsen the unpreconditioned system
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
}
};
template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
public:
Matrix &_Mat;
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in.Grid());
tmp.Checkerboard() = !in.Checkerboard();
//std::cout <<"grid pointers: in._grid="<< in._grid << " out._grid=" << out._grid << " _Mat.Grid=" << _Mat.Grid() << " _Mat.RedBlackGrid=" << _Mat.RedBlackGrid() << std::endl;
_Mat.Meooe(in,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
//std::cout << "cb in " << in.Checkerboard() << " cb out " << out.Checkerboard() << std::endl;
_Mat.Mooee(in,out);
return axpy_norm(out,-1.0,tmp,out);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in.Grid());
_Mat.MeooeDag(in,tmp);
_Mat.MooeeInvDag(tmp,out);
_Mat.MeooeDag(out,tmp);
_Mat.MooeeDag(in,out);
return axpy_norm(out,-1.0,tmp,out);
}
};
template<class Matrix,class Field>
class SchurDiagOneOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
public:
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in.Grid());
_Mat.Meooe(in,out);
_Mat.MooeeInv(out,tmp);
_Mat.Meooe(tmp,out);
_Mat.MooeeInv(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in.Grid());
_Mat.MooeeInvDag(in,out);
_Mat.MeooeDag(out,tmp);
_Mat.MooeeInvDag(tmp,out);
_Mat.MeooeDag(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
};
template<class Matrix,class Field>
class SchurDiagTwoOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
public:
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in.Grid());
_Mat.MooeeInv(in,out);
_Mat.Meooe(out,tmp);
_Mat.MooeeInv(tmp,out);
_Mat.Meooe(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in.Grid());
_Mat.MeooeDag(in,out);
_Mat.MooeeInvDag(out,tmp);
_Mat.MeooeDag(tmp,out);
_Mat.MooeeInvDag(out,tmp);
return axpy_norm(out,-1.0,tmp,in);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Left handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) psi = eta --> ( 1 - Moo^-1 Moe Mee^-1 Meo ) psi = Moo^-1 eta
// Right handed Moo^-1 ; (Moo - Moe Mee^-1 Meo) Moo^-1 Moo psi = eta --> ( 1 - Moe Mee^-1 Meo ) Moo^-1 phi=eta ; psi = Moo^-1 phi
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field> using SchurDiagOneRH = SchurDiagTwoOperator<Matrix,Field> ;
template<class Matrix,class Field> using SchurDiagOneLH = SchurDiagOneOperator<Matrix,Field> ;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Staggered use
///////////////////////////////////////////////////////////////////////////////////////////////////
template<class Matrix,class Field>
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
protected:
Matrix &_Mat;
Field tmp;
RealD mass;
double tMpc;
double tIP;
double tMeo;
double taxpby_norm;
uint64_t ncall;
public:
void Report(void)
{
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
}
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
assert( _Mat.isTrivialEE() );
mass = _Mat.Mass();
tMpc=0;
tIP =0;
tMeo=0;
taxpby_norm=0;
ncall=0;
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
ncall++;
tMpc-=usecond();
n2 = Mpc(in,out);
tMpc+=usecond();
tIP-=usecond();
ComplexD dot= innerProduct(in,out);
tIP+=usecond();
n1 = real(dot);
}
virtual void HermOp(const Field &in, Field &out){
ncall++;
tMpc-=usecond();
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp);
tMpc+=usecond();
taxpby_norm-=usecond();
axpby(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
}
virtual RealD Mpc (const Field &in, Field &out)
{
Field tmp(in.Grid());
Field tmp2(in.Grid());
// std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
_Mat.Mooee(in,out);
_Mat.Mooee(out,tmp);
// std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
tMeo-=usecond();
_Mat.Meooe(in,out);
_Mat.Meooe(out,tmp);
tMeo+=usecond();
taxpby_norm-=usecond();
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
taxpby_norm+=usecond();
return nn;
}
virtual RealD MpcDag (const Field &in, Field &out){
return Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered
}
};
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
/////////////////////////////////////////////////////////////
// Base classes for functions of operators
/////////////////////////////////////////////////////////////
template<class Field> class OperatorFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
assert(in.size()==out.size());
for(int k=0;k<in.size();k++){
(*this)(Linop,in[k],out[k]);
}
};
};
template<class Field> class LinearFunction {
public:
virtual void operator() (const Field &in, Field &out) = 0;
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
public:
void operator() (const Field &in, Field &out){
out = in;
};
};
/////////////////////////////////////////////////////////////
// Base classes for Multishift solvers for operators
/////////////////////////////////////////////////////////////
template<class Field> class OperatorMultiFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, std::vector<Field> &out) = 0;
};
// FIXME : To think about
// Chroma functionality list defining LinearOperator
/*
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign) const = 0;
virtual void operator() (T& chi, const T& psi, enum PlusMinus isign, Real epsilon) const
virtual const Subset& subset() const = 0;
virtual unsigned long nFlops() const { return 0; }
virtual void deriv(P& ds_u, const T& chi, const T& psi, enum PlusMinus isign) const
class UnprecLinearOperator : public DiffLinearOperator<T,P,Q>
const Subset& subset() const {return all;}
};
*/
////////////////////////////////////////////////////////////////////////////////////////////
// Hermitian operator Linear function and operator function
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class HermOpOperatorFunction : public OperatorFunction<Field> {
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Linop.HermOp(in,out);
};
};
template<typename Field>
class PlainHermOp : public LinearFunction<Field> {
public:
LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
{}
void operator()(const Field& in, Field& out) {
_Linop.HermOp(in,out);
}
};
template<typename Field>
class FunctionHermOp : public LinearFunction<Field> {
public:
OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
FunctionHermOp(OperatorFunction<Field> & poly,LinearOperatorBase<Field>& linop)
: _poly(poly), _Linop(linop) {};
void operator()(const Field& in, Field& out) {
_poly(_Linop,in,out);
}
};
template<class Field>
class Polynomial : public OperatorFunction<Field> {
private:
std::vector<RealD> Coeffs;
public:
using OperatorFunction<Field>::operator();
Polynomial(std::vector<RealD> &_Coeffs) : Coeffs(_Coeffs) { };
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
Field AtoN(in.Grid());
Field Mtmp(in.Grid());
AtoN = in;
out = AtoN*Coeffs[0];
for(int n=1;n<Coeffs.size();n++){
Mtmp = AtoN;
Linop.HermOp(Mtmp,AtoN);
out=out+AtoN*Coeffs[n];
}
};
};
NAMESPACE_END(Grid);

View File

@@ -28,7 +28,7 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
#ifndef GRID_PRECONDITIONER_H
#define GRID_PRECONDITIONER_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class Field> class Preconditioner : public LinearFunction<Field> {
virtual void operator()(const Field &src, Field & psi)=0;
@@ -42,5 +42,5 @@ namespace Grid {
TrivialPrecon(void){};
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,79 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/SparseMatrix.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALGORITHM_SPARSE_MATRIX_H
#define GRID_ALGORITHM_SPARSE_MATRIX_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface defining what I expect of a general sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SparseMatrixBase {
public:
virtual GridBase *Grid(void) =0;
// Full checkerboar operations
virtual RealD M (const Field &in, Field &out)=0;
virtual RealD Mdag (const Field &in, Field &out)=0;
virtual void MdagM(const Field &in, Field &out,RealD &ni,RealD &no) {
Field tmp (in.Grid());
ni=M(in,tmp);
no=Mdag(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
};
/////////////////////////////////////////////////////////////////////////////////////////////
// Interface augmented by a red black sparse matrix, such as a Fermion action
/////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class CheckerBoardedSparseMatrixBase : public SparseMatrixBase<Field> {
public:
virtual GridBase *RedBlackGrid(void)=0;
//////////////////////////////////////////////////////////////////////
// Query the even even properties to make algorithmic decisions
//////////////////////////////////////////////////////////////////////
virtual RealD Mass(void) { return 0.0; };
virtual int ConstEE(void) { return 1; }; // Disable assumptions unless overridden
virtual int isTrivialEE(void) { return 0; }; // by a derived class that knows better
// half checkerboard operaions
virtual void Meooe (const Field &in, Field &out)=0;
virtual void Mooee (const Field &in, Field &out)=0;
virtual void MooeeInv (const Field &in, Field &out)=0;
virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0;
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,379 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/Chebyshev.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <clehner@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CHEBYSHEV_H
#define GRID_CHEBYSHEV_H
#include <Grid/algorithms/LinearOperator.h>
NAMESPACE_BEGIN(Grid);
struct ChebyParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(ChebyParams,
RealD, alpha,
RealD, beta,
int, Npoly);
};
////////////////////////////////////////////////////////////////////////////////////////////
// Generic Chebyshev approximations
////////////////////////////////////////////////////////////////////////////////////////////
template<class Field>
class Chebyshev : public OperatorFunction<Field> {
private:
using OperatorFunction<Field>::operator();
std::vector<RealD> Coeffs;
int order;
RealD hi;
RealD lo;
public:
void csv(std::ostream &out){
RealD diff = hi-lo;
RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
// Convenience for plotting the approximation
void PlotApprox(std::ostream &out) {
out<<"Polynomial approx ["<<lo<<","<<hi<<"]"<<std::endl;
for(RealD x=lo;x<hi;x+=(hi-lo)/50.0){
out <<x<<"\t"<<approx(x)<<std::endl;
}
};
Chebyshev(){};
Chebyshev(ChebyParams p){ Init(p.alpha,p.beta,p.Npoly);};
Chebyshev(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD) ) {Init(_lo,_hi,_order,func);};
Chebyshev(RealD _lo,RealD _hi,int _order) {Init(_lo,_hi,_order);};
////////////////////////////////////////////////////////////////////////////////////////////////////
// c.f. numerical recipes "chebft"/"chebev". This is sec 5.8 "Chebyshev approximation".
////////////////////////////////////////////////////////////////////////////////////////////////////
// CJ: the one we need for Lanczos
void Init(RealD _lo,RealD _hi,int _order)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
};
void Init(RealD _lo,RealD _hi,int _order, RealD (* func)(RealD))
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
RealD M=order;
RealD alpha = M_PI/(M+2);
RealD lmax = std::cos(alpha);
RealD sumUsq =0;
std::vector<RealD> U(M);
std::vector<RealD> a(M);
std::vector<RealD> g(M);
for(int n=0;n<=M;n++){
U[n] = std::sin((n+1)*std::acos(lmax))/std::sin(std::acos(lmax));
sumUsq += U[n]*U[n];
}
sumUsq = std::sqrt(sumUsq);
for(int i=1;i<=M;i++){
a[i] = U[i]/sumUsq;
}
g[0] = 1.0;
for(int m=1;m<=M;m++){
g[m] = 0;
for(int i=0;i<=M-m;i++){
g[m]+= a[i]*a[m+i];
}
}
for(int m=1;m<=M;m++){
Coeffs[m]*=g[m];
}
}
RealD approx(RealD x) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
RealD approxD(RealD x)
{
RealD Un;
RealD Unm;
RealD Unp;
RealD y=( x-0.5*(hi+lo))/(0.5*(hi-lo));
RealD U0=1;
RealD U1=2*y;
RealD sum;
sum = Coeffs[1]*U0;
sum+= Coeffs[2]*U1*2.0;
Un =U1;
Unm=U0;
for(int i=2;i<order-1;i++){
Unp=2*y*Un-Unm;
Unm=Un;
Un =Unp;
sum+= Un*Coeffs[i+1]*(i+1.0);
}
return sum/(0.5*(hi-lo));
};
RealD approxInv(RealD z, RealD x0, int maxiter, RealD resid) {
RealD x = x0;
RealD eps;
int i;
for (i=0;i<maxiter;i++) {
eps = approx(x) - z;
if (fabs(eps / z) < resid)
return x;
x = x - eps / approxD(x);
}
return std::numeric_limits<double>::quiet_NaN();
}
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
// std::cout << "Chevyshef(): in.Grid()="<<in.Grid()<<std::endl;
//std::cout <<" Linop.Grid()="<<Linop.Grid()<<"Linop.RedBlackGrid()="<<Linop.RedBlackGrid()<<std::endl;
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
T1=y*xscale+in*mscale;
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y);
y=xscale*y+mscale*(*Tn);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;
RealD beta;
RealD mu;
public:
ChebyshevLanczos(RealD _alpha,RealD _beta,RealD _mu,int _order) :
alpha(_alpha),
beta(_beta),
mu(_mu)
{
order=_order;
Coeffs.resize(order);
for(int i=0;i<_order;i++){
Coeffs[i] = 0.0;
}
Coeffs[order-1]=1.0;
};
void csv(std::ostream &out){
for (RealD x=-1.2*alpha; x<1.2*alpha; x+=(2.0*alpha)/10000) {
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
return;
}
RealD approx(RealD xx) // Convenience for plotting the approximation
{
RealD Tn;
RealD Tnm;
RealD Tnp;
Real aa = alpha * alpha;
Real bb = beta * beta;
RealD x = ( 2.0 * (xx-mu)*(xx-mu) - (aa+bb) ) / (aa-bb);
RealD y= x;
RealD T0=1;
RealD T1=y;
RealD sum;
sum = 0.5*Coeffs[0]*T0;
sum+= Coeffs[1]*T1;
Tn =T1;
Tnm=T0;
for(int i=2;i<order;i++){
Tnp=2*y*Tn-Tnm;
Tnm=Tn;
Tn =Tnp;
sum+= Tn*Coeffs[i];
}
return sum;
};
// shift_Multiply in Rudy's code
void AminusMuSq(LinearOperatorBase<Field> &Linop, const Field &in, Field &out)
{
GridBase *grid=in.Grid();
Field tmp(grid);
RealD aa= alpha*alpha;
RealD bb= beta * beta;
Linop.HermOp(in,out);
out = out - mu*in;
Linop.HermOp(out,tmp);
tmp = tmp - mu * out;
out = (2.0/ (aa-bb) ) * tmp - ((aa+bb)/(aa-bb))*in;
};
// Implement the required interface
void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) {
GridBase *grid=in.Grid();
int vol=grid->gSites();
Field T0(grid); T0 = in;
Field T1(grid);
Field T2(grid);
Field y(grid);
Field *Tnm = &T0;
Field *Tn = &T1;
Field *Tnp = &T2;
// Tn=T1 = (xscale M )*in
AminusMuSq(Linop,T0,T1);
// sum = .5 c[0] T0 + c[1] T1
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
for(int n=2;n<order;n++){
AminusMuSq(Linop,*Tn,y);
*Tnp=2.0*y-(*Tnm);
out=out+Coeffs[n]* (*Tnp);
// Cycle pointers to avoid copies
Field *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,152 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/Forecast.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H
NAMESPACE_BEGIN(Grid);
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field>
class Forecast
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
int degree = prev_solns.size();
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = Zero(); return chi; }
else if(degree == 1){ return prev_solns[0]; }
// RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = conjugate(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(abs(G[j][j]) > abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = Zero();
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = conjugate(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
};
};
NAMESPACE_END(Grid);
#endif

View File

@@ -27,7 +27,8 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
double MultiShiftFunction::approx(double x)
{
double a = norm;
@@ -53,4 +54,4 @@ void MultiShiftFunction::csv(std::ostream &out)
}
return;
}
}
NAMESPACE_END(Grid);

View File

@@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef MULTI_SHIFT_FUNCTION
#define MULTI_SHIFT_FUNCTION
namespace Grid {
NAMESPACE_BEGIN(Grid);
class MultiShiftFunction {
public:
@@ -63,5 +63,5 @@ public:
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -298,7 +298,7 @@ void AlgRemez::stpini(bigfloat *step) {
// Search for error maxima and minima
void AlgRemez::search(bigfloat *step) {
bigfloat a, q, xm, ym, xn, yn, xx0, xx1;
int i, j, meq, emsign, ensign, steps;
int i, meq, emsign, ensign, steps;
meq = neq + 1;
bigfloat *yy = new bigfloat[meq];
@@ -306,7 +306,6 @@ void AlgRemez::search(bigfloat *step) {
bigfloat eclose = 1.0e30;
bigfloat farther = 0l;
j = 1;
xx0 = apstrt;
for (i = 0; i < meq; i++) {

View File

@@ -58,8 +58,8 @@
/* Compute the partial fraction expansion coefficients (alpha) from the
* factored form */
namespace Grid {
namespace Approx {
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
static void construct_partfrac(izd *z) {
int dn = z -> dn, dd = z -> dd, type = z -> type;
@@ -516,7 +516,9 @@ zolotarev_data* higham(PRECISION epsilon, int n) {
free(d);
return zd;
}
}}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#ifdef TEST
@@ -585,6 +587,7 @@ static PRECISION zolotarev_cayley_eval(PRECISION x, zolotarev_data* rdata) {
return (ONE - T) / (ONE + T);
}
/* Test program. Apart from printing out the parameters for R(x) it produces
* the following data files for plotting (unless NPLOT is defined):
*
@@ -723,5 +726,5 @@ int main(int argc, char** argv) {
return EXIT_SUCCESS;
}
#endif /* TEST */

View File

@@ -1,13 +1,13 @@
/* -*- Mode: C; comment-column: 22; fill-column: 79; -*- */
#ifdef __cplusplus
namespace Grid {
namespace Approx {
#include <Grid/Namespace.h>
NAMESPACE_BEGIN(Grid);
NAMESPACE_BEGIN(Approx);
#endif
#define HVERSION Header Time-stamp: <14-OCT-2004 09:26:51.00 adk@MISSCONTRARY>
#ifndef ZOLOTAREV_INTERNAL
#ifndef PRECISION
#define PRECISION double
@@ -83,5 +83,6 @@ void zolotarev_free(zolotarev_data *zdata);
#endif
#ifdef __cplusplus
}}
NAMESPACE_END(Approx);
NAMESPACE_END(Grid);
#endif

View File

@@ -10,10 +10,12 @@
#ifndef INCLUDED_BIGFLOAT_H
#define INCLUDED_BIGFLOAT_H
#define __GMP_WITHIN_CONFIGURE
#include <gmp.h>
#include <mpf2mpfr.h>
#include <mpfr.h>
#undef __GMP_WITHIN_CONFIGURE
class bigfloat {
private:

View File

@@ -90,8 +90,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi.checkerboard = src.checkerboard;
grid = src._grid;
psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
RealD f;
RealD rtzp,rtz,a,d,b;

View File

@@ -0,0 +1,694 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
Copyright (C) 2017
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
template <class Field>
class BlockConjugateGradient : public OperatorFunction<Field> {
public:
typedef typename Field::scalar_type scomplex;
int blockDim ;
int Nblock;
BlockCGtype CGtype;
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer PrintInterval; //GridLogMessages or Iterative
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv),PrintInterval(100)
{};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
Field & Q,
const Field & R)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
// Q = R C^{-1}
//
// Q_j = R_i Cinv(i,j)
//
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceMulMatrix(Q,Cinv,R,Orthog);
}
// see comments above
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
const std::vector<Field> & R)
{
InnerProductMatrix(m_rr,R,R);
m_rr = 0.5*(m_rr+m_rr.adjoint());
Eigen::MatrixXcd L = m_rr.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
MulMatrix(Q,Cinv,R);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Call one of several implementations
////////////////////////////////////////////////////////////////////////////////////////////////////
void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
if ( CGtype == BlockCGrQ ) {
BlockCGrQsolve(Linop,Src,Psi);
} else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi);
} else {
assert(0);
}
}
virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi)
{
if ( CGtype == BlockCGrQVec ) {
BlockCGrQsolveVec(Linop,Src,Psi);
} else {
assert(0);
}
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQ implementation:
//--------------------------
// X is guess/Solution
// B is RHS
// Solve A X_i = B_i ; i refers to Nblock index
////////////////////////////////////////////////////////////////////////////
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B.Grid()->_fdimensions[Orthog];
/* FAKE */
Nblock=8;
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.Checkerboard() = B.Checkerboard();
conformable(X, B);
Field tmp(B);
Field Q(B);
Field D(B);
Field Z(B);
Field AD(B);
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,B,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,X,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
************************************************************************
* Dimensions:
*
* X,B==(Nferm x Nblock)
* A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
*
* QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
* for k:
* Z = AD
* M = [D^dag Z]^{-1}
* X = X + D MC
* QS = Q - ZM
* D = Q + D S^dag
* C = S C
*/
///////////////////////////////////////
// Initial block: initial search dir is guess
///////////////////////////////////////
std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
Linop.HermOp(X, AD);
tmp = B - AD;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
//3. Z = AD
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();
sliceMaddMatrix(X,m_tmp, D,X,Orthog);
sliceMaddTimer.Stop();
//6. QS = Q - ZM
sliceMaddTimer.Start();
sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
sliceMaddTimer.Stop();
QRTimer.Start();
ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
QRTimer.Stop();
//7. D = Q + D S^dag
m_tmp = m_S.adjoint();
sliceMaddTimer.Start();
sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
sliceMaddTimer.Stop();
//8. C = S C
m_C = m_S*m_C;
/*********************
* convergence monitor
*********************
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
RealD rrsum=0;
RealD rr;
for(int b=0;b<Nblock;b++) {
rrsum+=real(m_rr(b,b));
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(X, AD);
AD = AD-B;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// multiRHS conjugate gradient. Dimension zero should be the block direction
// Use this for spread out across nodes
//////////////////////////////////////////////////////////////////////////
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim
Nblock = Src.Grid()->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.Checkerboard() = Src.Checkerboard();
conformable(Psi, Src);
Field P(Src);
Field AP(Src);
Field R(Src);
std::vector<ComplexD> v_pAp(Nblock);
std::vector<RealD> v_rr (Nblock);
std::vector<RealD> v_rr_inv(Nblock);
std::vector<RealD> v_alpha(Nblock);
std::vector<RealD> v_beta(Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,Src,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
R = Src - AP;
P = R;
sliceNorm(v_rr,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch sliceNormTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductVector(v_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
for(int b=0;b<Nblock;b++){
v_alpha[b] = v_rr[b]/real(v_pAp[b]);
}
// Psi, R update
sliceMaddTimer.Start();
sliceMaddVector(Psi,v_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddVector(R ,v_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
for(int b=0;b<Nblock;b++){
v_rr_inv[b] = 1.0/v_rr[b];
}
sliceNormTimer.Start();
sliceNorm(v_rr,R,Orthog);
sliceNormTimer.Stop();
for(int b=0;b<Nblock;b++){
v_beta[b] = v_rr_inv[b] *v_rr[b];
}
// Search update
sliceMaddTimer.Start();
sliceMaddVector(P,v_beta,P,R,Orthog);
sliceMaddTimer.Stop();
/*********************
* convergence monitor
*********************
*/
RealD max_resid=0;
for(int b=0;b<Nblock;b++){
RealD rr = v_rr[b]/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << sliceNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation:
//--------------------------
// X is guess/Solution
// B is RHS
// Solve A X_i = B_i ; i refers to Nblock index
////////////////////////////////////////////////////////////////////////////
void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X)
{
Nblock = B.size();
assert(Nblock == X.size());
std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
for(int b=0;b<Nblock;b++){
X[b].Checkerboard() = B[b].Checkerboard();
conformable(X[b], B[b]);
conformable(X[b], X[0]);
}
Field Fake(B[0]);
std::vector<Field> tmp(Nblock,Fake);
std::vector<Field> Q(Nblock,Fake);
std::vector<Field> D(Nblock,Fake);
std::vector<Field> Z(Nblock,Fake);
std::vector<Field> AD(Nblock,Fake);
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
RealD sssum=0;
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
************************************************************************
* Dimensions:
*
* X,B==(Nferm x Nblock)
* A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
*
* QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
* for k:
* Z = AD
* M = [D^dag Z]^{-1}
* X = X + D MC
* QS = Q - ZM
* D = Q + D S^dag
* C = S C
*/
///////////////////////////////////////
// Initial block: initial search dir is guess
///////////////////////////////////////
std::cout << GridLogMessage<<"BlockCGrQvec algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
for(int b=0;b<Nblock;b++) {
Linop.HermOp(X[b], AD[b]);
tmp[b] = B[b] - AD[b];
}
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
for(int b=0;b<Nblock;b++) D[b]=Q[b];
std::cout << GridLogMessage<<"BlockCGrQ vec computed initial residual and QR fact " <<std::endl;
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
//3. Z = AD
MatrixTimer.Start();
for(int b=0;b<Nblock;b++) Linop.HermOp(D[b], Z[b]);
MatrixTimer.Stop();
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
InnerProductMatrix(m_DZ,D,Z);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();
MaddMatrix(X,m_tmp, D,X);
sliceMaddTimer.Stop();
//6. QS = Q - ZM
sliceMaddTimer.Start();
MaddMatrix(tmp,m_M,Z,Q,-1.0);
sliceMaddTimer.Stop();
QRTimer.Start();
ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
QRTimer.Stop();
//7. D = Q + D S^dag
m_tmp = m_S.adjoint();
sliceMaddTimer.Start();
MaddMatrix(D,m_tmp,D,Q);
sliceMaddTimer.Stop();
//8. C = S C
m_C = m_S*m_C;
/*********************
* convergence monitor
*********************
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
RealD rrsum=0;
RealD rr;
for(int b=0;b<Nblock;b++) {
rrsum+=real(m_rr(b,b));
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\t Block Iteration "<<k<<" ave resid "<< sqrt(rrsum/sssum) << " max "<< sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
for(int b=0;b<Nblock;b++) Linop.HermOp(X[b], AD[b]);
for(int b=0;b<Nblock;b++) AD[b] = AD[b]-B[b];
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(normv(AD)/normv(B)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,248 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/CommunicationAvoidingGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the CAGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
CommunicationAvoidingGeneralisedMinimalResidual(RealD tol,
Integer maxit,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular GMRES" << std::endl;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: src " << ssq << std::endl;
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "CAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
ComplexD scale = 1.0/gamma[0];
v[0] = scale * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "CommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(v, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
MatrixTimer.Start();
LinOp.Op(v[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + v[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -31,7 +31,7 @@ directory
#ifndef GRID_CONJUGATE_GRADIENT_H
#define GRID_CONJUGATE_GRADIENT_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
@@ -41,6 +41,9 @@ namespace Grid {
template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
@@ -52,12 +55,14 @@ class ConjugateGradient : public OperatorFunction<Field> {
MaxIterations(maxit),
ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
Field &psi) {
psi.checkerboard = src.checkerboard;
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
RealD cp, c, a, d, b, ssq, qq;
//RealD b_pred;
Field p(src);
Field mmp(src);
@@ -70,7 +75,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
Linop.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
@@ -78,24 +82,29 @@ class ConjugateGradient : public OperatorFunction<Field> {
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradient: p " << a << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(8) << "ConjugateGradient: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
IterationsToComplete = 0;
return;
}
std::cout << GridLogIterative << std::setprecision(4)
std::cout << GridLogIterative << std::setprecision(8)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch InnerTimer;
GridStopWatch AxpyNormTimer;
GridStopWatch LinearCombTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
@@ -105,29 +114,35 @@ class ConjugateGradient : public OperatorFunction<Field> {
c = cp;
MatrixTimer.Start();
Linop.HermOpAndNorm(p, mmp, d, qq);
Linop.HermOp(p, mmp);
MatrixTimer.Stop();
LinalgTimer.Start();
// RealD qqck = norm2(mmp);
// ComplexD dck = innerProduct(p,mmp);
InnerTimer.Start();
ComplexD dc = innerProduct(p,mmp);
InnerTimer.Stop();
d = dc.real();
a = c / d;
b_pred = a * (a * qq - d) / c;
AxpyNormTimer.Start();
cp = axpy_norm(r, -a, mmp, r);
AxpyNormTimer.Stop();
b = cp / c;
// Fuse these loops ; should be really easy
psi = a * p + psi;
p = p * b + r;
LinearCombTimer.Start();
auto psi_v = psi.View();
auto p_v = p.View();
auto r_v = r.View();
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
LinearCombTimer.Stop();
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
<< " residual^2 " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
// Stopping condition
if (cp <= rsq) {
@@ -135,12 +150,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
@@ -148,6 +163,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
@@ -156,13 +174,12 @@ class ConjugateGradient : public OperatorFunction<Field> {
return;
}
}
std::cout << GridLogMessage << "ConjugateGradient did NOT converge"
<< std::endl;
std::cout << GridLogMessage << "ConjugateGradient did NOT converge "<<k<<" / "<< MaxIterations<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,161 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrec.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public:
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
MixedPrecisionConjugateGradient(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL){ };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
TotalInnerIterations = 0;
GridStopWatch TotalTimer;
TotalTimer.Start();
int cb = src_d_in.Checkerboard();
sol_d.Checkerboard() = cb;
RealD src_norm = norm2(src_d_in);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
FieldD src_d(DoublePrecGrid);
src_d = src_d_in; //source for next inner iteration, computed from residual during operation
RealD inner_tol = InnerTolerance;
FieldF src_f(SinglePrecGrid);
src_f.Checkerboard() = cb;
FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
sol_f = Zero();
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL)
(*guesser)(src_f, sol_f);
//Inner CG
CG_f.Tolerance = inner_tol;
InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f);
InnerCGtimer.Stop();
TotalInnerIterations += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
}
//Final trial CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting final patch-up double-precision solve"<<std::endl;
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,325 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
#define GRID_CONJUGATE_MULTI_SHIFT_GRADIENT_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateGradientMultiShift : public OperatorMultiFunction<Field>,
public OperatorFunction<Field>
{
public:
using OperatorFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
int verbose;
MultiShiftFunction shifts;
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
verbose=1;
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<Field> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &results, Field &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{
GridBase *grid = src.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
std::vector<Field> ps(nshift,grid);// Search directions
assert(psi.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
Field r(grid);
Field p(grid);
Field tmp(grid);
Field mmp(grid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src);
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift: shift "<<s
<<" target resid "<<rsq[s]<<std::endl;
ps[s] = src;
}
// r and p for primary
r=src;
p=src;
//MdagM+m[0]
Linop.HermOpAndNorm(p,mmp,d,qq);
axpy(mmp,mass[0],p,mmp);
RealD rn = norm2(p);
d += rn*mass[0];
// have verified that inner product of
// p and mmp is equal to d after this since
// the d computation is tricky
// qq = real(innerProduct(p,mmp));
// std::cout<<GridLogMessage << "debug equal ? qq "<<qq<<" d "<< d<<std::endl;
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r,b,mmp,r);
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer;
GridStopWatch ShiftTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p,a,p,r);
AXPYTimer.Stop();
// Note to self - direction ps is iterated seperately
// for each shift. Does not appear to have any scope
// for avoiding linear algebra in "single" case.
//
// However SAME r is used. Could load "r" and update
// ALL ps[s]. 2/3 Bandwidth saving
// New Kernel: Load r, vector of coeffs, vector of pointers ps
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps[s],a,ps[s],r);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps[s],z[s][iz],as,r,ps[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
MatrixTimer.Start();
//Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
// The below is faster on KNL
Linop.HermOp(p,mmp);
d=real(innerProduct(p,mmp));
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp,mass[0],p,mmp);
AXPYTimer.Stop();
RealD rn = norm2(p);
d += rn*mass[0];
bp=b;
b=-cp/d;
AXPYTimer.Start();
c=axpy_norm(r,b,mmp,r);
AXPYTimer.Stop();
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
for(int s=0;s<nshift;s++){
int ss = s;
// Scope for optimisation here in case of "single".
// Could load psi[0] and pull all ps[s] in.
// if ( single ) ss=primary;
// Bandwith saving in single case is Ls * 3 -> 2+Ls, so ~ 3x saving
// Pipelined CG gain:
//
// New Kernel: Load r, vector of coeffs, vector of pointers ps
// New Kernel: Load psi[0], vector of coeffs, vector of pointers ps
// If can predict the coefficient bs then we can fuse these and avoid write reread cyce
// on ps[s].
// Before: 3 x npole + 3 x npole
// After : 2 x npole (ps[s]) => 3x speed up of multishift CG.
if( (!converged[s]) ) {
axpy(psi[ss],-bs[s]*alpha[s],ps[s],psi[ss]);
}
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShift k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged ){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
Linop.HermOpAndNorm(psi[s],mmp,d,qq);
axpy(tmp,mass[s],psi[s],mmp);
axpy(r,-alpha[s],src,tmp);
RealD rn = norm2(r);
RealD cn = norm2(src);
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
}
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,258 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
NAMESPACE_BEGIN(Grid);
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol;
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
Delta(_delta),
Linop_f(_Linop_f),
Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol;
}
void operator()(const FieldD &src, FieldD &psi) {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldD p(src);
FieldD mmp(src);
FieldD r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
return;
}
//Single prec initialization
FieldF r_f(SinglePrecGrid);
r_f.Checkerboard() = r.Checkerboard();
precisionChange(r_f, r);
FieldF psi_f(r_f);
psi_f = Zero();
FieldF p_f(r_f);
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << std::sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp;
psi_f = Zero();
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
}
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,113 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_RESIDUAL_H
#define GRID_CONJUGATE_RESIDUAL_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class ConjugateResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;
ConjugateResidual(RealD tol,Integer maxit) : Tolerance(tol), MaxIterations(maxit) {
verbose=0;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b; // c, d;
RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
GridBase *grid = src.Grid();
psi=Zero();
Field r(grid), p(grid), Ap(grid), Ar(grid);
r=src;
p=src;
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=1;k<MaxIterations;k++){
a = rAr/pAAp;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,Ap,r);
rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp;
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"ConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"ConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<std::sqrt(cp/ssq)
<< " true residual "<<std::sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
}
std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,108 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DEFLATION_H
#define GRID_DEFLATION_H
namespace Grid {
template<class Field>
class ZeroGuesser: public LinearFunction<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
};
template<class Field>
class DoNothingGuesser: public LinearFunction<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { };
};
template<class Field>
class SourceGuesser: public LinearFunction<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = src; };
};
////////////////////////////////
// Fine grid deflation
////////////////////////////////
template<class Field>
class DeflatedGuesser: public LinearFunction<Field> {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) {
guess = Zero();
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
guess.Checkerboard() = src.Checkerboard();
}
};
template<class FineField, class CoarseField>
class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
private:
const std::vector<FineField> &subspace;
const std::vector<CoarseField> &evec_coarse;
const std::vector<RealD> &eval_coarse;
public:
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
: subspace(_subspace),
evec_coarse(_evec_coarse),
eval_coarse(_eval_coarse)
{
}
void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0].Grid());
CoarseField guess_coarse(evec_coarse[0].Grid()); guess_coarse = Zero();
blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
}
blockPromote(guess_coarse,guess,subspace);
guess.Checkerboard() = src.Checkerboard();
};
};
}
#endif

View File

@@ -0,0 +1,258 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_FLEXIBLE_COMMUNICATION_AVOIDING_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the FCAGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch PrecTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner;
FlexibleCommunicationAvoidingGeneralisedMinimalResidual(RealD tol,
Integer maxit,
LinearFunction<Field> &Prec,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.)
, Preconditioner(Prec) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
std::cout << GridLogWarning << "This algorithm currently doesn't differ from regular FGMRES" << std::endl;
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: src " << ssq << std::endl;
PrecTimer.Reset();
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Precon " << PrecTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FCAGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, z, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(z, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
PrecTimer.Start();
Preconditioner(v[iter], z[iter]);
PrecTimer.Stop();
MatrixTimer.Start();
LinOp.Op(z[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + z[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -0,0 +1,256 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/FlexibleGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the FGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch PrecTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
LinearFunction<Field> &Preconditioner;
FlexibleGeneralisedMinimalResidual(RealD tol,
Integer maxit,
LinearFunction<Field> &Prec,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.)
, Preconditioner(Prec) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: src " << ssq << std::endl;
PrecTimer.Reset();
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Precon " << PrecTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "FGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<Field> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, z, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "FlexibleGeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(z, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, std::vector<Field> &z, Field &w, int iter) {
PrecTimer.Start();
Preconditioner(v[iter], z[iter]);
PrecTimer.Stop();
MatrixTimer.Start();
LinOp.Op(z[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &z, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + z[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -0,0 +1,244 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/GeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field>
class GeneralisedMinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the GMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
GeneralisedMinimalResidual(RealD tol,
Integer maxit,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.) {};
void operator()(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Field r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "GeneralisedMinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "GeneralisedMinimalResidual: src " << ssq << std::endl;
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "GeneralisedMinimalResidual: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "GMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
RealD cp = 0;
Field w(src.Grid());
Field r(src.Grid());
// this should probably be made a class member so that it is only allocated once, not in every restart
std::vector<Field> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "GeneralisedMinimalResidual: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(v, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<Field> &LinOp, std::vector<Field> &v, Field &w, int iter) {
MatrixTimer.Start();
LinOp.Op(v[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<Field> const &v, Field &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + v[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -0,0 +1,863 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung <chulwoo@bnl.gov>
Author: Christoph Lehner <clehner@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BIRL_H
#define GRID_BIRL_H
#include <string.h> //memset
//#include <zlib.h>
#include <sys/stat.h>
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
////////////////////////////////////////////////////////
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0].View()) View;
auto tmp_v = basis[0].View();
std::vector<View> basis_v(basis.size(),tmp_v);
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
basis_v[k] = basis[k].View();
}
thread_region
{
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
auto result_v=result.View();
thread_for(ss, grid->oSites(),{
vobj B = Zero();
for(int k=k0; k<k1; ++k){
auto basis_k = basis[k].View();
B +=Qt(j,k) * basis_k[ss];
}
result_v[ss] = B;
});
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
// PAB: faster to compute the inner products first then fuse loops.
// If performance critical can improve.
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];
axpy(result,TensorRemove(innerProduct(tmp,src_orig)) / eval[i],tmp,result);
}
}
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template<class Field> class ImplicitlyRestartedLanczosTester
{
public:
virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
};
enum IRLdiagonalisation {
IRLdiagonaliseWithDSTEGR,
IRLdiagonaliseWithQR,
IRLdiagonaliseWithEigen
};
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
{
public:
LinearFunction<Field> &_HermOp;
ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp) { };
int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
{
return TestConvergence(j,resid,B,eval,evalMaxApprox);
}
int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox)
{
Field v(B);
RealD eval_poly = eval;
// Apply operator
_HermOp(B,v);
RealD vnum = real(innerProduct(B,v)); // HermOp.
RealD vden = norm2(B);
RealD vv0 = norm2(v);
eval = vnum/vden;
v -= eval*B;
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
};
template<class Field>
class ImplicitlyRestartedLanczos {
private:
const RealD small = 1.0e-8;
int MaxIter;
int MinRestart; // Minimum number of restarts; only check for convergence after
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
// int Np; // Np -- Number of spare vecs in krylov space // == Nm - Nk
int Nm; // Nm -- total number of vectors
IRLdiagonalisation diagonalisation;
int orth_period;
RealD OrthoTime;
RealD eresid, betastp;
////////////////////////////////
// Embedded objects
////////////////////////////////
LinearFunction<Field> &_PolyOp;
LinearFunction<Field> &_HermOp;
ImplicitlyRestartedLanczosTester<Field> &_Tester;
// Default tester provided (we need a ref to something in default case)
ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
/////////////////////////
// Constructor
/////////////////////////
public:
//////////////////////////////////////////////////////////////////
// PAB:
//////////////////////////////////////////////////////////////////
// Too many options & knobs.
// Eliminate:
// orth_period
// betastp
// MinRestart
//
// Do we really need orth_period
// What is the theoretical basis & guarantees of betastp ?
// Nstop=Nk viable?
// MinRestart avoidable with new convergence test?
// Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
// HermOp could be eliminated if we dropped the Power method for max eval.
// -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
//////////////////////////////////////////////////////////////////
ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
LinearFunction<Field> & HermOp,
ImplicitlyRestartedLanczosTester<Field> & Tester,
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
LinearFunction<Field> & HermOp,
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
////////////////////////////////
// Helpers
////////////////////////////////
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = std::sqrt(nn);
v = v * (1.0/nn);
return nn;
}
void orthogonalize(Field& w, std::vector<Field>& evec,int k)
{
OrthoTime-=usecond()/1e6;
basisOrthogonalize(evec,w,k);
normalise(w);
OrthoTime+=usecond()/1e6;
}
/* Rudy Arthur's thesis pp.137
------------------------
Require: M > K P = M K †
Compute the factorization AVM = VM HM + fM eM
repeat
Q=I
for i = 1,...,P do
QiRi =HM θiI Q = QQi
H M = Q †i H M Q i
end for
βK =HM(K+1,K) σK =Q(M,K)
r=vK+1βK +rσK
VK =VM(1:M)Q(1:M,1:K)
HK =HM(1:K,1:K)
→AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM
until convergence
*/
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{
GridBase *grid = src.Grid();
assert(grid == evec[0].Grid());
GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" -- seek Nk = " << Nk <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- total Nm = " << Nm <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
} else if ( diagonalisation == IRLdiagonaliseWithQR ) {
std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
}
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
assert(Nm <= evec.size() && Nm <= eval.size());
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
{
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
normalise(src_n);
_HermOp(src_n,tmp);
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.05)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
src_n = tmp;
}
}
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
std::vector<RealD> eval2(Nm);
std::vector<RealD> eval2_copy(Nm);
Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
Field f(grid);
Field v(grid);
int k1 = 1;
int k2 = Nk;
RealD beta_k;
Nconv = 0;
// Set initial vector
evec[0] = src;
normalise(evec[0]);
// Initial Nk steps
OrthoTime=0.;
for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
//////////////////////////////////
// Restarting loop begins
//////////////////////////////////
int iter;
for(iter = 0; iter<MaxIter; ++iter){
OrthoTime=0.;
std::cout<< GridLogMessage <<" **********************"<< std::endl;
std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
std::cout<< GridLogMessage <<" **********************"<< std::endl;
std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
f *= lme[Nm-1];
std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
//////////////////////////////////
// getting eigenvalues
//////////////////////////////////
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k+k1-1];
lme2[k] = lme[k+k1-1];
}
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
//////////////////////////////////
// sorting
//////////////////////////////////
eval2_copy = eval2;
std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
const int chunk=8;
for(int io=0; io<k2;io+=chunk){
std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
for(int ii=0;ii<chunk;ii++){
if ( (io+ii)<k2 )
std::cout<< " "<< std::setw(12)<< eval2[io+ii];
}
std::cout << std::endl;
}
//////////////////////////////////
// Implicitly shifted QR transformations
//////////////////////////////////
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
for(int ip=k2; ip<Nm; ++ip){
QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
}
std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
assert(k2<Nm); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt *"<<k1-1<<","<<k2+1<<")"<<std::endl;
////////////////////////////////////////////////////
// Compressed vector f and beta(k2)
////////////////////////////////////////////////////
f *= Qt(k2-1,Nm-1);
f += lme[k2-1] * evec[k2];
beta_k = norm2(f);
beta_k = std::sqrt(beta_k);
std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k;
evec[k2] = betar * f;
lme[k2-1] = beta_k;
////////////////////////////////////////////////////
// Convergence test
////////////////////////////////////////////////////
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k];
lme2[k] = lme[k];
}
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
Nconv = 0;
if (iter >= MinRestart) {
std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
// power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1;
for(int jj = 1; jj<=Nstop; jj*=2){
int j = Nstop-jj;
RealD e = eval2_copy[j]; // Discard the evalue
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
allconv=0;
}
}
// Do evec[0] for good measure
{
int j=0;
RealD e = eval2_copy[0];
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
}
if ( allconv ) Nconv = Nstop;
// test if we converged, if so, terminate
std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
// if( Nconv>=Nstop || beta_k < betastp){
if( Nconv>=Nstop){
goto converged;
}
} else {
std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
} // end of iter loop
}
std::cout<<GridLogError<<"\n NOT converged.\n";
abort();
converged:
{
Field B(grid); B.Checkerboard() = evec[0].Checkerboard();
basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
std::cout << GridLogIRL << " Rotated basis"<<std::endl;
Nconv=0;
//////////////////////////////////////////////////////////////////////
// Full final convergence test; unconditionally applied
//////////////////////////////////////////////////////////////////////
for(int j = 0; j<=Nk; j++){
B=evec[j];
if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
Nconv++;
}
}
if ( Nconv < Nstop )
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
eval=eval2;
//Keep only converged
eval.resize(Nconv);// Nstop?
evec.resize(Nconv,grid);// Nstop?
basisSortInPlace(evec,eval,reverse);
}
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL << " -- Iterations = "<< iter << "\n";
std::cout << GridLogIRL << " -- beta(k) = "<< beta_k << "\n";
std::cout << GridLogIRL << " -- Nconv = "<< Nconv << "\n";
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
}
private:
/* Saad PP. 195
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
2. For k = 1,2,...,m Do:
3. wk:=Avkβkv_{k1}
4. αk:=(wk,vk) //
5. wk:=wkαkvk // wk orthog vk
6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
7. vk+1 := wk/βk+1
8. EndDo
*/
void step(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
std::vector<Field>& evec,
Field& w,int Nm,int k)
{
const RealD tiny = 1.0e-20;
assert( k< Nm );
GridStopWatch gsw_op,gsw_o;
Field& evec_k = evec[k];
_PolyOp(evec_k,w); std::cout<<GridLogIRL << "PolyOp" <<std::endl;
if(k>0) w -= lme[k-1] * evec[k-1];
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
RealD alph = real(zalph);
w = w - alph * evec_k;// 5. wk:=wkαkvk
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
lmd[k] = alph;
lme[k] = beta;
if (k>0 && k % orth_period == 0) {
orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
}
if(k < Nm-1) evec[k+1] = w;
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
}
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt, // Nm x Nm
GridBase *grid)
{
Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
for(int i=0;i<Nk;i++) TriDiag(i,i) = lmd[i];
for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
for (int i = 0; i < Nk; i++) {
lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
}
for (int i = 0; i < Nk; i++) {
for (int j = 0; j < Nk; j++) {
Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
}
}
}
///////////////////////////////////////////////////////////////////////////
// File could end here if settle on Eigen ??? !!!
///////////////////////////////////////////////////////////////////////////
void QR_decomp(std::vector<RealD>& lmd, // Nm
std::vector<RealD>& lme, // Nm
int Nk, int Nm, // Nk, Nm
Eigen::MatrixXd& Qt, // Nm x Nm matrix
RealD Dsh, int kmin, int kmax)
{
int k = kmin-1;
RealD x;
RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
RealD c = ( lmd[k] -Dsh) *Fden;
RealD s = -lme[k] *Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
x =-s*lme[k+1];
lme[k+1] = c*lme[k+1];
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt(k,i);
RealD Qtmp2 = Qt(k+1,i);
Qt(k,i) = c*Qtmp1 - s*Qtmp2;
Qt(k+1,i)= s*Qtmp1 + c*Qtmp2;
}
// Givens transformations
for(int k = kmin; k < kmax-1; ++k){
RealD Fden = 1.0/hypot(x,lme[k-1]);
RealD c = lme[k-1]*Fden;
RealD s = - x*Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
lme[k-1] = c*lme[k-1] -s*x;
if(k != kmax-2){
x = -s*lme[k+1];
lme[k+1] = c*lme[k+1];
}
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt(k,i);
RealD Qtmp2 = Qt(k+1,i);
Qt(k,i) = c*Qtmp1 -s*Qtmp2;
Qt(k+1,i) = s*Qtmp1 +c*Qtmp2;
}
}
}
void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt,
GridBase *grid)
{
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
} else if ( diagonalisation == IRLdiagonaliseWithQR ) {
diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
} else {
assert(0);
}
}
#ifdef USE_LAPACK
void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
#endif
void diagonalize_lapack(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd& Qt,
GridBase *grid)
{
#ifdef USE_LAPACK
const int size = Nm;
int NN = Nk;
double evals_tmp[NN];
double evec_tmp[NN][NN];
memset(evec_tmp[0],0,sizeof(double)*NN*NN);
double DD[NN];
double EE[NN];
for (int i = 0; i< NN; i++) {
for (int j = i - 1; j <= i + 1; j++) {
if ( j < NN && j >= 0 ) {
if (i==j) DD[i] = lmd[i];
if (i==j) evals_tmp[i] = lmd[i];
if (j==(i-1)) EE[j] = lme[j];
}
}
}
int evals_found;
int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
int liwork = 3+NN*10 ;
int iwork[liwork];
double work[lwork];
int isuppz[2*NN];
char jobz = 'V'; // calculate evals & evecs
char range = 'I'; // calculate all evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
int info;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN/total)+1;
double vl = 0.0, vu = 0.0;
int il = interval*node+1 , iu = interval*(node+1);
if (iu > NN) iu=NN;
double tol = 0.0;
if (1) {
memset(evals_tmp,0,sizeof(double)*NN);
if ( il <= NN){
LAPACK_dstegr(&jobz, &range, &NN,
(double*)DD, (double*)EE,
&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double*)evec_tmp, &NN,
isuppz,
work, &lwork, iwork, &liwork,
&info);
for (int i = iu-1; i>= il-1; i--){
evals_tmp[i] = evals_tmp[i - (il-1)];
if (il>1) evals_tmp[i-(il-1)]=0.;
for (int j = 0; j< NN; j++){
evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
if (il>1) evec_tmp[i-(il-1)][j]=0.;
}
}
}
{
grid->GlobalSumVector(evals_tmp,NN);
grid->GlobalSumVector((double*)evec_tmp,NN*NN);
}
}
// Safer to sort instead of just reversing it,
// but the document of the routine says evals are sorted in increasing order.
// qr gives evals in decreasing order.
for(int i=0;i<NN;i++){
lmd [NN-1-i]=evals_tmp[i];
for(int j=0;j<NN;j++){
Qt((NN-1-i),j)=evec_tmp[i][j];
}
}
#else
assert(0);
#endif
}
void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt,
GridBase *grid)
{
int QRiter = 100*Nm;
int kmin = 1;
int kmax = Nk;
// (this should be more sophisticated)
for(int iter=0; iter<QRiter; ++iter){
// determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = std::sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift)
// transformation
QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
// Convergence criterion (redef of kmin and kamx)
for(int j=kmax-1; j>= kmin; --j){
RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
if(fabs(lme[j-1])+dds > dds){
kmax = j+1;
goto continued;
}
}
QRiter = iter;
return;
continued:
for(int j=0; j<kmax-1; ++j){
RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
if(fabs(lme[j])+dds > dds){
kmin = j+1;
break;
}
}
}
std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
abort();
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,409 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
Copyright (C) 2015
Author: Christoph Lehner <clehner@bnl.gov>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H
NAMESPACE_BEGIN(Grid);
struct LanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
ChebyParams, Cheby,/*Chebyshev*/
int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
int, Nk, /*Vecs in Lanczos seek converge*/
int, Nm, /*Total vecs in Lanczos include restart*/
RealD, resid, /*residual*/
int, MaxIt,
RealD, betastp, /* ? */
int, MinRes); // Must restart
};
struct LocalCoherenceLanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
bool, saveEvecs,
bool, doFine,
bool, doFineRead,
bool, doCoarse,
bool, doCoarseRead,
LanczosParams, FineParams,
LanczosParams, CoarseParams,
ChebyParams, Smoother,
RealD , coarse_relax_tol,
std::vector<int>, blockSize,
std::string, config,
std::vector < ComplexD >, omega,
RealD, mass,
RealD, M5);
};
// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
LinearOperatorBase<FineField> &_Linop;
std::vector<FineField> &subspace;
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), subspace(_subspace)
{
assert(subspace.size() >0);
};
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0].Grid();
int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.Checkerboard()= checkerboard;
FineField fout(FineGrid); fout.Checkerboard() = checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
}
};
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
OperatorFunction<FineField> & _poly;
LinearOperatorBase<FineField> &_Linop;
std::vector<FineField> &subspace;
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
LinearOperatorBase<FineField>& linop,
std::vector<FineField> & _subspace) :
_poly(poly),
_Linop(linop),
subspace(_subspace)
{ };
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0].Grid();
int checkerboard = subspace[0].Checkerboard();
FineField fin (FineGrid); fin.Checkerboard() =checkerboard;
FineField fout(FineGrid);fout.Checkerboard() =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
}
};
template<class Fobj,class CComplex,int nbasis>
class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
{ };
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
CoarseField v(B);
RealD eval_poly = eval;
// Apply operator
_Poly(B,v);
RealD vnum = real(innerProduct(B,v)); // HermOp.
RealD vden = norm2(B);
RealD vv0 = norm2(v);
eval = vnum/vden;
v -= eval*B;
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
GridBase *FineGrid = _subspace[0].Grid();
int checkerboard = _subspace[0].Checkerboard();
FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
FineField fv(FineGrid);fv.Checkerboard() =checkerboard;
blockPromote(B,fv,_subspace);
_smoother(_Linop,fv,fB);
RealD eval_poly = eval;
_Linop.HermOp(fB,fv);
RealD vnum = real(innerProduct(fB,fv)); // HermOp.
RealD vden = norm2(fB);
RealD vv0 = norm2(fv);
eval = vnum/vden;
fv -= eval*fB;
RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
if( (vv<eresid*eresid) ) return 1;
return 0;
}
};
////////////////////////////////////////////
// Make serializable Lanczos params
////////////////////////////////////////////
template<class Fobj,class CComplex,int nbasis>
class LocalCoherenceLanczos
{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<Fobj> FineField;
protected:
GridBase *_CoarseGrid;
GridBase *_FineGrid;
int _checkerboard;
LinearOperatorBase<FineField> & _FineOp;
std::vector<RealD> &evals_fine;
std::vector<RealD> &evals_coarse;
std::vector<FineField> &subspace;
std::vector<CoarseField> &evec_coarse;
private:
std::vector<RealD> _evals_fine;
std::vector<RealD> _evals_coarse;
std::vector<FineField> _subspace;
std::vector<CoarseField> _evec_coarse;
public:
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (_evals_fine),
evals_coarse(_evals_coarse),
subspace (_subspace),
evec_coarse(_evec_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
//////////////////////////////////////////////////////////////////////////
// Alternate constructore, external storage for use by Hadrons module
//////////////////////////////////////////////////////////////////////////
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard,
std::vector<FineField> &ext_subspace,
std::vector<CoarseField> &ext_coarse,
std::vector<RealD> &ext_eval_fine,
std::vector<RealD> &ext_eval_coarse
) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (ext_eval_fine),
evals_coarse(ext_eval_coarse),
subspace (ext_subspace),
evec_coarse (ext_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
};
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = ::sqrt(nn);
v = v * (1.0/nn);
return nn;
}
/*
void fakeFine(void)
{
int Nk = nbasis;
subspace.resize(Nk,_FineGrid);
subspace[0]=1.0;
subspace[0].Checkerboard()=_checkerboard;
normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){
subspace[k].Checkerboard()=_checkerboard;
Op(subspace[k-1],subspace[k]);
normalise(subspace[k]);
}
}
*/
void testFine(RealD resid)
{
assert(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){
assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
}
}
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{
assert(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
for(int k=0;k<evec_coarse.size();k++){
if ( k < nbasis ) {
assert(ChebySmoothTester.ReconstructEval(k,resid,evec_coarse[k],evals_coarse[k],1.0)==1);
} else {
assert(ChebySmoothTester.ReconstructEval(k,resid*relax,evec_coarse[k],evals_coarse[k],1.0)==1);
}
}
}
void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
assert(nbasis<=Nm);
Chebyshev<FineField> Cheby(cheby_parms);
FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
PlainHermOp<FineField> Op(_FineOp);
evals_fine.resize(Nm);
subspace.resize(Nm,_FineGrid);
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid);
typedef typename FineField::scalar_type Scalar;
// src=1.0;
src=Scalar(1.0);
src.Checkerboard() = _checkerboard;
int Nconv;
IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved
assert(Nstop>=nbasis);
assert(Nconv>=nbasis);
evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid);
}
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);
CoarseField src(_CoarseGrid); src=1.0;
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
assert(Nconv>=Nstop);
evals_coarse.resize(Nstop);
evec_coarse.resize (Nstop,_CoarseGrid);
for (int i=0;i<Nstop;i++){
std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl;
}
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,157 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/MinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MINIMAL_RESIDUAL_H
#define GRID_MINIMAL_RESIDUAL_H
namespace Grid {
template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
RealD overRelaxParam;
Integer IterationsToComplete; // Number of iterations the MR took to finish.
// Filled in upon completion
MinimalResidual(RealD tol, Integer maxit, Real ovrelparam = 1.0, bool err_on_no_conv = true)
: Tolerance(tol), MaxIterations(maxit), overRelaxParam(ovrelparam), ErrorOnNoConverge(err_on_no_conv){};
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
ComplexD a, c;
RealD d;
Field Mr(src);
Field r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
Linop.Op(psi, Mr);
r = src - Mr;
RealD cp = norm2(r);
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MinimalResidual: guess " << guess << std::endl;
std::cout << GridLogIterative << "MinimalResidual: src " << ssq << std::endl;
std::cout << GridLogIterative << "MinimalResidual: cp,r " << cp << std::endl;
if (cp <= rsq) {
return;
}
std::cout << GridLogIterative << "MinimalResidual: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
MatrixTimer.Start();
Linop.Op(r, Mr);
MatrixTimer.Stop();
LinalgTimer.Start();
c = innerProduct(Mr, r);
d = norm2(Mr);
a = c / d;
a = a * overRelaxParam;
psi = psi + r * a;
r = r - Mr * a;
cp = norm2(r);
LinalgTimer.Stop();
std::cout << GridLogIterative << "MinimalResidual: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = " << a << " c = " << c << " d = " << d << std::endl;
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
Linop.Op(psi, Mr);
r = src - Mr;
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "MinimalResidual Converged on iteration " << k
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "MR Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MR Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
if (ErrorOnNoConverge)
assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "MinimalResidual did NOT converge"
<< std::endl;
if (ErrorOnNoConverge)
assert(0);
IterationsToComplete = k;
}
};
} // namespace Grid
#endif

View File

@@ -0,0 +1,276 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h
Copyright (C) 2015
Author: Daniel Richtmann <daniel.richtmann@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
#define GRID_MIXED_PRECISION_FLEXIBLE_GENERALISED_MINIMAL_RESIDUAL_H
namespace Grid {
template<class FieldD, class FieldF, typename std::enable_if<getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction<FieldD> {
public:
using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
// defaults to true
RealD Tolerance;
Integer MaxIterations;
Integer RestartLength;
Integer MaxNumberOfRestarts;
Integer IterationCount; // Number of iterations the MPFGMRES took to finish,
// filled in upon completion
GridStopWatch MatrixTimer;
GridStopWatch PrecTimer;
GridStopWatch LinalgTimer;
GridStopWatch QrTimer;
GridStopWatch CompSolutionTimer;
GridStopWatch ChangePrecTimer;
Eigen::MatrixXcd H;
std::vector<ComplexD> y;
std::vector<ComplexD> gamma;
std::vector<ComplexD> c;
std::vector<ComplexD> s;
GridBase* SinglePrecGrid;
LinearFunction<FieldF> &Preconditioner;
MixedPrecisionFlexibleGeneralisedMinimalResidual(RealD tol,
Integer maxit,
GridBase * sp_grid,
LinearFunction<FieldF> &Prec,
Integer restart_length,
bool err_on_no_conv = true)
: Tolerance(tol)
, MaxIterations(maxit)
, RestartLength(restart_length)
, MaxNumberOfRestarts(MaxIterations/RestartLength + ((MaxIterations%RestartLength == 0) ? 0 : 1))
, ErrorOnNoConverge(err_on_no_conv)
, H(Eigen::MatrixXcd::Zero(RestartLength, RestartLength + 1)) // sizes taken from DD-αAMG code base
, y(RestartLength + 1, 0.)
, gamma(RestartLength + 1, 0.)
, c(RestartLength + 1, 0.)
, s(RestartLength + 1, 0.)
, SinglePrecGrid(sp_grid)
, Preconditioner(Prec) {};
void operator()(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi) {
psi.Checkerboard() = src.Checkerboard();
conformable(psi, src);
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
RealD cp;
RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq;
FieldD r(src.Grid());
std::cout << std::setprecision(4) << std::scientific;
std::cout << GridLogIterative << "MPFGMRES: guess " << guess << std::endl;
std::cout << GridLogIterative << "MPFGMRES: src " << ssq << std::endl;
PrecTimer.Reset();
MatrixTimer.Reset();
LinalgTimer.Reset();
QrTimer.Reset();
CompSolutionTimer.Reset();
ChangePrecTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
IterationCount = 0;
for (int k=0; k<MaxNumberOfRestarts; k++) {
cp = outerLoopBody(LinOp, src, psi, rsq);
// Stopping condition
if (cp <= rsq) {
SolverTimer.Stop();
LinOp.Op(psi,r);
axpy(r,-1.0,src,r);
RealD srcnorm = sqrt(ssq);
RealD resnorm = sqrt(norm2(r));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "MPFGMRES: Converged on iteration " << IterationCount
<< " computed residual " << sqrt(cp / ssq)
<< " true residual " << true_residual
<< " target " << Tolerance << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Total " << SolverTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Precon " << PrecTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Matrix " << MatrixTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: QR " << QrTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: CompSol " << CompSolutionTimer.Elapsed() << std::endl;
std::cout << GridLogMessage << "MPFGMRES Time elapsed: PrecChange " << ChangePrecTimer.Elapsed() << std::endl;
return;
}
}
std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
if (ErrorOnNoConverge)
assert(0);
}
RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
RealD cp = 0;
FieldD w(src.Grid());
FieldD r(src.Grid());
// these should probably be made class members so that they are only allocated once, not in every restart
std::vector<FieldD> v(RestartLength + 1, src.Grid()); for (auto &elem : v) elem = Zero();
std::vector<FieldD> z(RestartLength + 1, src.Grid()); for (auto &elem : z) elem = Zero();
MatrixTimer.Start();
LinOp.Op(psi, w);
MatrixTimer.Stop();
LinalgTimer.Start();
r = src - w;
gamma[0] = sqrt(norm2(r));
v[0] = (1. / gamma[0]) * r;
LinalgTimer.Stop();
for (int i=0; i<RestartLength; i++) {
IterationCount++;
arnoldiStep(LinOp, v, z, w, i);
qrUpdate(i);
cp = norm(gamma[i+1]);
std::cout << GridLogIterative << "MPFGMRES: Iteration " << IterationCount
<< " residual " << cp << " target " << rsq << std::endl;
if ((i == RestartLength - 1) || (IterationCount == MaxIterations) || (cp <= rsq)) {
computeSolution(z, psi, i);
return cp;
}
}
assert(0); // Never reached
return cp;
}
void arnoldiStep(LinearOperatorBase<FieldD> &LinOp, std::vector<FieldD> &v, std::vector<FieldD> &z, FieldD &w, int iter) {
FieldF v_f(SinglePrecGrid);
FieldF z_f(SinglePrecGrid);
ChangePrecTimer.Start();
precisionChange(v_f, v[iter]);
precisionChange(z_f, z[iter]);
ChangePrecTimer.Stop();
PrecTimer.Start();
Preconditioner(v_f, z_f);
PrecTimer.Stop();
ChangePrecTimer.Start();
precisionChange(z[iter], z_f);
ChangePrecTimer.Stop();
MatrixTimer.Start();
LinOp.Op(z[iter], w);
MatrixTimer.Stop();
LinalgTimer.Start();
for (int i = 0; i <= iter; ++i) {
H(iter, i) = innerProduct(v[i], w);
w = w - ComplexD(H(iter, i)) * v[i];
}
H(iter, iter + 1) = sqrt(norm2(w));
v[iter + 1] = ComplexD(1. / H(iter, iter + 1)) * w;
LinalgTimer.Stop();
}
void qrUpdate(int iter) {
QrTimer.Start();
for (int i = 0; i < iter ; ++i) {
auto tmp = -s[i] * ComplexD(H(iter, i)) + c[i] * ComplexD(H(iter, i + 1));
H(iter, i) = conjugate(c[i]) * ComplexD(H(iter, i)) + conjugate(s[i]) * ComplexD(H(iter, i + 1));
H(iter, i + 1) = tmp;
}
// Compute new Givens Rotation
auto nu = sqrt(std::norm(H(iter, iter)) + std::norm(H(iter, iter + 1)));
c[iter] = H(iter, iter) / nu;
s[iter] = H(iter, iter + 1) / nu;
// Apply new Givens rotation
H(iter, iter) = nu;
H(iter, iter + 1) = 0.;
gamma[iter + 1] = -s[iter] * gamma[iter];
gamma[iter] = conjugate(c[iter]) * gamma[iter];
QrTimer.Stop();
}
void computeSolution(std::vector<FieldD> const &z, FieldD &psi, int iter) {
CompSolutionTimer.Start();
for (int i = iter; i >= 0; i--) {
y[i] = gamma[i];
for (int k = i + 1; k <= iter; k++)
y[i] = y[i] - ComplexD(H(k, i)) * y[k];
y[i] = y[i] / ComplexD(H(i, i));
}
for (int i = 0; i <= iter; i++)
psi = psi + z[i] * y[i];
CompSolutionTimer.Stop();
}
};
}
#endif

View File

@@ -28,7 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_NORMAL_EQUATIONS_H
#define GRID_NORMAL_EQUATIONS_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form an NE solver calling a Herm solver
@@ -48,7 +48,7 @@ namespace Grid {
void operator() (const Field &in, Field &out){
Field src(in._grid);
Field src(in.Grid());
_Matrix.Mdag(in,src);
_HermitianSolver(src,out); // Mdag M out = Mdag in
@@ -56,5 +56,5 @@ namespace Grid {
}
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,45 @@
#pragma once
namespace Grid {
template<class Field> class PowerMethod
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_EST_ = 50;
for (int i=0;i<_MAX_ITER_EST_;i++) {
normalise(src_n);
HermOp.HermOp(src_n,tmp);
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.01) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na;
return evalMaxApprox;
}
evalMaxApprox = na;
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
src_n = tmp;
}
assert(0);
return 0;
}
};
}

View File

@@ -0,0 +1,119 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecConjugateResidual.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_CONJUGATE_RESIDUAL_H
#define GRID_PREC_CONJUGATE_RESIDUAL_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////
// Base classes for iterative processes based on operators
// single input vec, single output vec.
/////////////////////////////////////////////////////////////
template<class Field>
class PrecConjugateResidual : public OperatorFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
LinearFunction<Field> &Preconditioner;
PrecConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec) : Tolerance(tol), MaxIterations(maxit), Preconditioner(Prec)
{
verbose=1;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
RealD a, b, c, d;
RealD cp, ssq,rsq;
RealD rAr, rAAr, rArp;
RealD pAp, pAAp;
GridBase *grid = src.Grid();
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=zero;
r = src;
Preconditioner(r,p);
Linop.HermOpAndNorm(p,Ap,pAp,pAAp);
Ar=Ap;
rAr=pAp;
rAAr=pAAp;
cp =norm2(r);
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
if (verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<0<<" residual "<<cp<< " target"<< rsq<<std::endl;
for(int k=0;k<MaxIterations;k++){
Preconditioner(Ap,z);
RealD rq= real(innerProduct(Ap,z));
a = rAr/rq;
axpy(psi,a,p,psi);
cp = axpy_norm(r,-a,z,r);
rArp=rAr;
Linop.HermOpAndNorm(r,Ar,rAr,rAAr);
b =rAr/rArp;
axpy(p,b,p,r);
pAAp=axpy_norm(Ap,b,Ap,Ar);
if(verbose) std::cout<<GridLogMessage<<"PrecConjugateResidual: iteration " <<k<<" residual "<<cp<< " target"<< rsq<<std::endl;
if(cp<rsq) {
Linop.HermOp(psi,Ap);
axpy(r,-1.0,src,Ap);
RealD true_resid = norm2(r)/ssq;
std::cout<<GridLogMessage<<"PrecConjugateResidual: Converged on iteration " <<k
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual "<<sqrt(true_resid)
<< " target " <<Tolerance <<std::endl;
return;
}
}
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,243 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_H
#define GRID_PREC_GCR_H
///////////////////////////////////////////////////////////////////////////////////////////////////////
//VPGCR Abe and Zhang, 2005.
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
//Computing and Information Volume 2, Number 2, Pages 147-161
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
template<class Field>
class PrecGeneralisedConjugateResidual : public OperatorFunction<Field> {
public:
using OperatorFunction<Field>::operator();
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
PrecGeneralisedConjugateResidual(RealD tol,Integer maxit,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
verbose=1;
};
void operator() (LinearOperatorBase<Field> &Linop,const Field &src, Field &psi){
psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src.Grid());
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(Linop,src,psi,rsq);
std::cout<<GridLogMessage<<"VPGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
Linop.HermOp(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
std::cout<<GridLogMessage<<"PrecGeneralisedConjugateResidual: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Precon "<< PrecTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Matrix "<< MatTimer.Elapsed() <<std::endl;
std::cout<<GridLogMessage<<"VPGCR Time elapsed: Linalg "<< LinalgTimer.Elapsed() <<std::endl;
return;
}
}
std::cout<<GridLogMessage<<"Variable Preconditioned GCR did not converge"<<std::endl;
assert(0);
}
RealD GCRnStep(LinearOperatorBase<Field> &Linop,const Field &src, Field &psi,RealD rsq){
RealD cp;
RealD a, b;
RealD zAz, zAAz;
RealD rq;
GridBase *grid = src.Grid();
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.HermOpAndNorm(psi,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOp(z,tmp);
MatTimer.Stop();
LinalgTimer.Start();
ttmp=tmp;
tmp=tmp-r;
LinalgTimer.Stop();
/*
std::cout<<GridLogMessage<<r<<std::endl;
std::cout<<GridLogMessage<<z<<std::endl;
std::cout<<GridLogMessage<<ttmp<<std::endl;
std::cout<<GridLogMessage<<tmp<<std::endl;
*/
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
MatTimer.Stop();
LinalgTimer.Start();
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= real(innerProduct(r,q[peri_k])); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
if((k==nstep-1)||(cp<rsq)){
return cp;
}
std::cout<<GridLogMessage<< " VPGCR_step["<<steps<<"] resid " <<sqrt(cp/rsq)<<std::endl;
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.HermOpAndNorm(z,Az,zAz,zAAz);
Linop.HermOp(z,tmp);
MatTimer.Stop();
LinalgTimer.Start();
tmp=tmp-r;
std::cout<<GridLogMessage<< " Preconditioner resid " <<sqrt(norm2(tmp)/norm2(r))<<std::endl;
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
return cp;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,486 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/SchurRedBlack.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_SCHUR_RED_BLACK_H
#define GRID_SCHUR_RED_BLACK_H
/*
* Red black Schur decomposition
*
* M = (Mee Meo) = (1 0 ) (Mee 0 ) (1 Mee^{-1} Meo)
* (Moe Moo) (Moe Mee^-1 1 ) (0 Moo-Moe Mee^-1 Meo) (0 1 )
* = L D U
*
* L^-1 = (1 0 )
* (-MoeMee^{-1} 1 )
* L^{dag} = ( 1 Mee^{-dag} Moe^{dag} )
* ( 0 1 )
* L^{-d} = ( 1 -Mee^{-dag} Moe^{dag} )
* ( 0 1 )
*
* U^-1 = (1 -Mee^{-1} Meo)
* (0 1 )
* U^{dag} = ( 1 0)
* (Meo^dag Mee^{-dag} 1)
* U^{-dag} = ( 1 0)
* (-Meo^dag Mee^{-dag} 1)
***********************
* M psi = eta
***********************
*Odd
* i) D_oo psi_o = L^{-1} eta_o
* eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
*
* Wilson:
* (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1} eta_o
* Stag:
* D_oo psi_o = L^{-1} eta = (eta_o - Moe Mee^{-1} eta_e)
*
* L^-1 eta_o= (1 0 ) (e
* (-MoeMee^{-1} 1 )
*
*Even
* ii) Mee psi_e + Meo psi_o = src_e
*
* => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
*
*
* TODO: Other options:
*
* a) change checkerboards for Schur e<->o
*
* Left precon by Moo^-1
* b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 = (D_oo)^dag M_oo^-dag Moo^-1 L^{-1} eta_o
* eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
*
* Right precon by Moo^-1
* c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1} eta_o
* eta_o' = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
* psi_o = M_oo^-1 phi_o
* TODO: Deflation
*/
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Use base class to share code
///////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackBase {
protected:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
public:
SchurRedBlackBase(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false) :
_HermitianRBSolver(HermitianRBSolver),
useSolnAsInitGuess(_solnAsInitGuess)
{
CBfactorise = 0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
/////////////////////////////////////////////////////////////
// Shared code
/////////////////////////////////////////////////////////////
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out)
{
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Guesser>
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
int nblock = in.size();
std::vector<Field> src_o(nblock,grid);
std::vector<Field> sol_o(nblock,grid);
std::vector<Field> guess_save;
Field resid(fgrid);
Field tmp(grid);
////////////////////////////////////////////////
// Prepare RedBlack source
////////////////////////////////////////////////
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
////////////////////////////////////////////////
// Make the guesses
////////////////////////////////////////////////
if ( subGuess ) guess_save.resize(nblock,grid);
for(int b=0;b<nblock;b++){
if(useSolnAsInitGuess) {
pickCheckerboard(Odd, sol_o[b], out[b]);
} else {
guess(src_o[b],sol_o[b]);
}
if ( subGuess ) {
guess_save[b] = sol_o[b];
}
}
//////////////////////////////////////////////////////////////
// Call the block solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackBase calling the solver for "<<nblock<<" RHS" <<std::endl;
RedBlackSolve(_Matrix,src_o,sol_o);
////////////////////////////////////////////////
// A2A boolean behavioural control & reconstruct other checkerboard
////////////////////////////////////////////////
for(int b=0;b<nblock;b++) {
if (subGuess) sol_o[b] = sol_o[b] - guess_save[b];
///////// Needs even source //////////////
pickCheckerboard(Even,tmp,in[b]);
RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
/////////////////////////////////////////////////
// Check unprec residual if possible
/////////////////////////////////////////////////
if ( ! subGuess ) {
_Matrix.M(out[b],resid);
resid = resid-in[b];
RealD ns = norm2(in[b]);
RealD nr = norm2(resid);
std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
} else {
std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
}
}
}
template<class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field resid(fgrid);
Field src_o(grid);
Field src_e(grid);
Field sol_o(grid);
////////////////////////////////////////////////
// RedBlack source
////////////////////////////////////////////////
RedBlackSource(_Matrix,in,src_e,src_o);
////////////////////////////////
// Construct the guess
////////////////////////////////
if(useSolnAsInitGuess) {
pickCheckerboard(Odd, sol_o, out);
} else {
guess(src_o,sol_o);
}
Field guess_save(grid);
guess_save = sol_o;
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
RedBlackSolve(_Matrix,src_o,sol_o);
////////////////////////////////////////////////
// Fionn A2A boolean behavioural control
////////////////////////////////////////////////
if (subGuess) sol_o= sol_o-guess_save;
///////////////////////////////////////////////////
// RedBlack solution needs the even source
///////////////////////////////////////////////////
RedBlackSolution(_Matrix,sol_o,src_e,out);
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
} else {
std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
}
}
/////////////////////////////////////////////////////////////
// Override in derived.
/////////////////////////////////////////////////////////////
virtual void RedBlackSource (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) =0;
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) =0;
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) =0;
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)=0;
};
template<class Field> class SchurRedBlackStaggeredSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess)
{
}
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e_c,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
Field src_e(grid);
src_e = src_e_c; // Const correctness
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal has Mooee on it.
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagMooeeSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field sol_e(grid);
Field src_e_i(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal is identity, right preconditioned by Mee^inv
// ( 1 - Meo Moo^inv Moe Mee^inv ) phi =( 1 - Meo Moo^inv Moe Mee^inv ) Mee psi = = eta = eta
//=> psi = MeeInv phi
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field>(HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field sol_o_i(grid);
Field tmp(grid);
Field sol_e(grid);
////////////////////////////////////////////////
// MooeeInv due to pecond
////////////////////////////////////////////////
_Matrix.MooeeInv(sol_o,tmp);
sol_o_i = tmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd );
};
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
};
}
#endif

View File

@@ -0,0 +1,127 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
#ifdef POINTER_CACHE
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<Ncache;e++) {
if ( Entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%Ncache;
}
if ( Entries[v].valid ) {
ret = Entries[v].address;
Entries[v].valid = 0;
Entries[v].address = NULL;
Entries[v].bytes = 0;
}
Entries[v].address=ptr;
Entries[v].bytes =bytes;
Entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<Ncache;e++){
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
Entries[e].valid = 0;
return Entries[e].address;
}
}
return NULL;
}
#endif
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,244 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/AlignedAllocator.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#ifdef POINTER_CACHE
class PointerCache {
private:
static const int Ncache=8;
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
#endif
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp>
class alignedAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };
alignedAllocator() throw() { }
alignedAllocator(const alignedAllocator&) throw() { }
template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
~alignedAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
pointer ptr = nullptr;
#endif
#ifdef GRID_NVCC
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
#ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
if ( __freeme ) free((void *)__freeme);
#endif
#endif
}
// FIXME: hack for the copy constructor, eventually it must be avoided
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
//void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using commAllocator = alignedAllocator<T>;
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,290 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/Cartesian_base.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////
// Commicator provides information on the processor grid
//////////////////////////////////////////////////////////////////////
// unsigned long _ndimension;
// Coordinate _processors; // processor grid
// int _processor; // linear processor rank
// Coordinate _processor_coor; // linear processor rank
//////////////////////////////////////////////////////////////////////
class GridBase : public CartesianCommunicator , public GridThread {
public:
int dummy;
// Give Lattice access
template<class object> friend class Lattice;
GridBase(const Coordinate & processor_grid) : CartesianCommunicator(processor_grid) {};
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent,
int &split_rank)
: CartesianCommunicator(processor_grid,parent,split_rank) {};
GridBase(const Coordinate & processor_grid,
const CartesianCommunicator &parent)
: CartesianCommunicator(processor_grid,parent,dummy) {};
virtual ~GridBase() = default;
// Physics Grid information.
Coordinate _simd_layout;// Which dimensions get relayed out over simd lanes.
Coordinate _fdimensions;// (full) Global dimensions of array prior to cb removal
Coordinate _gdimensions;// Global dimensions of array after cb removal
Coordinate _ldimensions;// local dimensions of array with processor images removed
Coordinate _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
Coordinate _ostride; // Outer stride for each dimension
Coordinate _istride; // Inner stride i.e. within simd lane
int _osites; // _isites*_osites = product(dimensions).
int _isites;
int _fsites; // _isites*_osites = product(dimensions).
int _gsites;
Coordinate _slice_block;// subslice information
Coordinate _slice_stride;
Coordinate _slice_nblock;
Coordinate _lstart; // local start of array in gcoors _processor_coor[d]*_ldimensions[d]
Coordinate _lend ; // local end of array in gcoors _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
bool _isCheckerBoarded;
public:
////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
virtual int CheckerBoardFromOindex (int Oindex)=0;
virtual int CheckerBoardFromOindexTable (int Oindex)=0;
//////////////////////////////////////////////////////////////////////////////////////////////
// Local layout calculations
//////////////////////////////////////////////////////////////////////////////////////////////
// These routines are key. Subdivide the linearised cartesian index into
// "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
// "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
//
// Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
// stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
// coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
// lanes are operated upon simultaneously.
virtual int oIndex(Coordinate &coor)
{
int idx=0;
// Works with either global or local coordinates
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
return idx;
}
virtual int iIndex(Coordinate &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
return idx;
}
inline int oIndexReduced(Coordinate &ocoor)
{
int idx=0;
// ocoor is already reduced so can eliminate the modulo operation
// for fast indexing and inline the routine
for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
return idx;
}
inline void oCoorFromOindex (Coordinate& coor,int Oindex){
Lexicographic::CoorFromIndex(coor,Oindex,_rdimensions);
}
inline void InOutCoorToLocalCoor (Coordinate &ocoor, Coordinate &icoor, Coordinate &lcoor) {
lcoor.resize(_ndimension);
for (int d = 0; d < _ndimension; d++)
lcoor[d] = ocoor[d] + _rdimensions[d] * icoor[d];
}
//////////////////////////////////////////////////////////
// SIMD lane addressing
//////////////////////////////////////////////////////////
inline void iCoorFromIindex(Coordinate &coor,int lane)
{
Lexicographic::CoorFromIndex(coor,lane,_simd_layout);
}
inline int PermuteDim(int dimension){
return _simd_layout[dimension]>1;
}
inline int PermuteType(int dimension){
int permute_type=0;
//
// Best way to encode this would be to present a mask
// for which simd dimensions are rotated, and the rotation
// size. If there is only one simd dimension rotated, this is just
// a permute.
//
// Cases: PermuteType == 1,2,4,8
// Distance should be either 0,1,2..
//
if ( _simd_layout[dimension] > 2 ) {
for(int d=0;d<_ndimension;d++){
if ( d != dimension ) assert ( (_simd_layout[d]==1) );
}
permute_type = RotateBit; // How to specify distance; this is not just direction.
return permute_type;
}
for(int d=_ndimension-1;d>dimension;d--){
if (_simd_layout[d]>1 ) permute_type++;
}
return permute_type;
}
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////
inline int iSites(void) const { return _isites; };
inline int Nsimd(void) const { return _isites; };// Synonymous with iSites
inline int oSites(void) const { return _osites; };
inline int lSites(void) const { return _isites*_osites; };
inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
inline int Nd (void) const { return _ndimension;};
inline const Coordinate LocalStarts(void) { return _lstart; };
inline const Coordinate &FullDimensions(void) { return _fdimensions;};
inline const Coordinate &GlobalDimensions(void) { return _gdimensions;};
inline const Coordinate &LocalDimensions(void) { return _ldimensions;};
inline const Coordinate &VirtualLocalDimensions(void) { return _ldimensions;};
////////////////////////////////////////////////////////////////
// Utility to print the full decomposition details
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////
// Global addressing
////////////////////////////////////////////////////////////////
void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
assert(gidx< gSites());
Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
}
void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
assert(lidx<lSites());
Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
}
void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
gidx=0;
int mult=1;
for(int mu=0;mu<_ndimension;mu++) {
gidx+=mult*gcoor[mu];
mult*=_gdimensions[mu];
}
}
void GlobalCoorToProcessorCoorLocalCoor(Coordinate &pcoor,Coordinate &lcoor,const Coordinate &gcoor)
{
pcoor.resize(_ndimension);
lcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++){
int _fld = _fdimensions[mu]/_processors[mu];
pcoor[mu] = gcoor[mu]/_fld;
lcoor[mu] = gcoor[mu]%_fld;
}
}
void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const Coordinate &gcoor)
{
Coordinate pcoor;
Coordinate lcoor;
GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
rank = RankFromProcessorCoor(pcoor);
/*
Coordinate cblcoor(lcoor);
for(int d=0;d<cblcoor.size();d++){
if( this->CheckerBoarded(d) ) {
cblcoor[d] = lcoor[d]/2;
}
}
*/
i_idx= iIndex(lcoor);
o_idx= oIndex(lcoor);
}
void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , Coordinate &gcoor)
{
gcoor.resize(_ndimension);
Coordinate coor(_ndimension);
ProcessorCoorFromRank(rank,coor);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];
iCoorFromIindex(coor,i_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];
oCoorFromOindex (coor,o_idx);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
}
void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,Coordinate &fcoor)
{
RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
if(CheckerBoarded(0)){
fcoor[0] = fcoor[0]*2+cb;
}
}
void ProcessorCoorLocalCoorToGlobalCoor(Coordinate &Pcoor,Coordinate &Lcoor,Coordinate &gcoor)
{
gcoor.resize(_ndimension);
for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,174 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/Cartesian_full.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_FULL_H
#define GRID_CARTESIAN_FULL_H
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
class GridCartesian: public GridBase {
public:
int dummy;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
virtual int CheckerBoardFromOindex (int Oindex)
{
return 0;
}
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(const Coordinate &site){
return 0;
}
virtual int CheckerBoardDestination(int cb,int shift,int dim){
return 0;
}
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){
return shift;
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{
Init(dimensions,simd_layout,processor_grid);
}
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{
Init(dimensions,simd_layout,processor_grid);
}
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
GridCartesian(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid) : GridBase(processor_grid)
{
Init(dimensions,simd_layout,processor_grid);
}
virtual ~GridCartesian() = default;
void Init(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_isCheckerBoarded = false;
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
// Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
};
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,289 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/Cartesian_red_black.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_RED_BLACK_H
#define GRID_CARTESIAN_RED_BLACK_H
NAMESPACE_BEGIN(Grid);
static const int CbRed =0;
static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
Coordinate _checker_dim_mask;
int _checker_dim;
std::vector<int> _checker_board;
virtual int CheckerBoarded(int dim){
if( dim==_checker_dim) return 1;
else return 0;
}
virtual int CheckerBoard(const Coordinate &site){
int linear=0;
assert(site.size()==_ndimension);
for(int d=0;d<_ndimension;d++){
if(_checker_dim_mask[d])
linear=linear+site[d];
}
return (linear&0x1);
}
// Depending on the cb of site, we toggle source cb.
// for block #b, element #e = (b, e)
// we need
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int ocb){
if(dim != _checker_dim) return shift;
int fulldim =_fdimensions[dim];
shift = (shift+fulldim)%fulldim;
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
if ( (source_cb+ocb)&1 ) {
return (shift)/2;
} else {
return (shift+1)/2;
}
}
virtual int CheckerBoardFromOindexTable (int Oindex) {
return _checker_board[Oindex];
}
virtual int CheckerBoardFromOindex (int Oindex)
{
Coordinate ocoor;
oCoorFromOindex(ocoor,Oindex);
return CheckerBoard(ocoor);
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
if(dim != _checker_dim) return shift;
int ocb=CheckerBoardFromOindex(osite);
return CheckerBoardShiftForCB(source_cb,dim,shift,ocb);
}
virtual int CheckerBoardDestination(int source_cb,int shift,int dim){
if ( _checker_dim_mask[dim] ) {
// If _fdimensions[checker_dim] is odd, then shifting by 1 in other dims
// does NOT cause a parity hop.
int add=(dim==_checker_dim) ? 0 : _fdimensions[_checker_dim];
if ( (shift+add) &0x1) {
return 1-source_cb;
} else {
return source_cb;
}
} else {
return source_cb;
}
};
////////////////////////////////////////////////////////////
// Create Redblack from original grid; require full grid pointer ?
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
{
int dims = base->_ndimension;
Coordinate checker_dim_mask(dims,1);
int checker_dim = 0;
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
};
////////////////////////////////////////////////////////////
// Create redblack from original grid, with non-trivial checker dim mask
////////////////////////////////////////////////////////////
GridRedBlackCartesian(const GridBase *base,
const Coordinate &checker_dim_mask,
int checker_dim
) : GridBase(base->_processors,*base)
{
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
}
virtual ~GridRedBlackCartesian() = default;
void Init(const Coordinate &dimensions,
const Coordinate &simd_layout,
const Coordinate &processor_grid,
const Coordinate &checker_dim_mask,
int checker_dim)
{
_isCheckerBoarded = true;
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
_checker_dim_mask = checker_dim_mask;
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
if (d == _checker_dim)
{
assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
_gsites /= 2;
}
_ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
assert(_rdimensions[d] > 0);
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if (_simd_layout[d] > 1)
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for (int d = 0; d < _ndimension; d++)
{
rvol = rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for (int osite = 0; osite < _osites; osite++)
{
_checker_board[osite] = CheckerBoardFromOindex(osite);
}
};
protected:
virtual int oIndex(Coordinate &coor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
}
else
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
}
return idx;
};
virtual int iIndex(Coordinate &lcoor)
{
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
}
else
{
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
}
return idx;
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -28,6 +28,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_COMMUNICATOR_H
#define GRID_COMMUNICATOR_H
#include <Grid/util/Coordinate.h>
#include <Grid/communicator/SharedMemory.h>
#include <Grid/communicator/Communicator_base.h>
#endif

View File

@@ -0,0 +1,77 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_none.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/mman.h>
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////
CartesianCommunicator::CommunicatorPolicy_t
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
int CartesianCommunicator::nCommThreads = -1;
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int CartesianCommunicator::Dimensions(void) { return _ndimension; };
int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; };
const Coordinate & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const Coordinate & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
NAMESPACE_END(Grid);

View File

@@ -32,104 +32,33 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3
#include <mpi.h>
#endif
#ifdef GRID_COMMS_MPI3L
#include <mpi.h>
#endif
#ifdef GRID_COMMS_SHMEM
#include <mpp/shmem.h>
#endif
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
class CartesianCommunicator : public SharedMemory {
class CartesianCommunicator {
public:
// 65536 ranks per node adequate for now
// 128MB shared memory for comms enought for 48^4 local vol comms
// Give external control (command line override?) of this
static const int MAXLOG2RANKSPERNODE = 16;
static uint64_t MAX_MPI_SHM_BYTES;
// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
static MPI_Comm communicator_world;
MPI_Comm communicator;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
#endif
////////////////////////////////////////////////////////////////////
// Helper functionality for SHM Windows common to all other impls
////////////////////////////////////////////////////////////////////
// Longer term; drop this in favour of a master / slave model with
// cartesian communicator on a subset of ranks, slave ranks controlled
// by group leader with data xfer via shared memory
////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_MPI3
static int ShmRank;
static int ShmSize;
static int GroupRank;
static int GroupSize;
static int WorldRank;
static int WorldSize;
std::vector<int> WorldDims;
std::vector<int> GroupDims;
std::vector<int> ShmDims;
std::vector<int> GroupCoor;
std::vector<int> ShmCoor;
std::vector<int> WorldCoor;
static std::vector<int> GroupRanks;
static std::vector<int> MyGroup;
static int ShmSetup;
static MPI_Win ShmWindow;
static MPI_Comm ShmComm;
std::vector<int> LexicographicToWorldRank;
static std::vector<void *> ShmCommBufs;
#else
static void ShmInitGeneric(void);
static commVector<uint8_t> ShmBufStorageVector;
#endif
/////////////////////////////////
// Grid information and queries
// Implemented in Communicator_base.C
/////////////////////////////////
static void * ShmCommBuf;
// Isend/Irecv/Wait, or Sendrecv blocking
////////////////////////////////////////////
// Policies
////////////////////////////////////////////
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
static int nCommThreads;
size_t heap_top;
size_t heap_bytes;
void *ShmBufferSelf(void);
void *ShmBuffer(int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
////////////////////////////////////////////
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
Coordinate _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
Coordinate _processor_coor; // linear processor coordinate
unsigned long _ndimension;
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
std::vector<Grid_MPI_Comm> communicator_halo;
////////////////////////////////////////////////
// Must call in Grid startup
@@ -137,26 +66,38 @@ class CartesianCommunicator {
static void Init(int *argc, char ***argv);
////////////////////////////////////////////////
// Constructor of any given grid
// Constructors to sub-divide a parent communicator
// and default to comm world
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &pdimensions_in);
CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const Coordinate &pdimensions_in);
virtual ~CartesianCommunicator();
private:
////////////////////////////////////////////////
// Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private
////////////////////////////////////////////////
void InitFromMPICommunicator(const Coordinate &processors, Grid_MPI_Comm communicator_base);
public:
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
int RankFromProcessorCoor(Coordinate &coor);
void ProcessorCoorFromRank(int rank,Coordinate &coor);
int Dimensions(void) ;
int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
const Coordinate & ThisProcessorCoor(void) ;
const Coordinate & ProcessorGrid(void) ;
int ProcessorCount(void) ;
int NodeCount(void) ;
int RankCount(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
@@ -211,14 +152,21 @@ class CartesianCommunicator {
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
int bytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
void StencilBarrier(void);
////////////////////////////////////////////////////////////
@@ -231,12 +179,30 @@ class CartesianCommunicator {
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
////////////////////////////////////////////////////////////
// All2All down one dimension
////////////////////////////////////////////////////////////
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
assert(dim>=0);
assert(dim<_ndimension);
assert(in.size()==out.size());
int numnode = _processors[dim];
uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode;
assert(numnode * words == in.size());
assert(words < (1ULL<<31));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
}
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);
void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes);
template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));
};
};
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,483 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h>
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
// First initialise of comms system
////////////////////////////////////////////
void CartesianCommunicator::Init(int *argc, char ***argv)
{
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
assert(0);
}
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
assert(0);
}
}
// Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
Grid_quiesce_nodes();
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
Grid_unquiesce_nodes();
}
///////////////////////////////////////////////////////////////////////////
// Use cartesian communicators now even in MPI3
///////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world
////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
MPI_Comm optimal_comm;
////////////////////////////////////////////////////
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////////
MPI_Comm_free(&optimal_comm);
}
//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size(); assert(_ndimension>=1);
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
// int Nparent = parent._processors ;
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
int childsize=1;
for(int d=0;d<processors.size();d++) {
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
Coordinate ccoor(_ndimension); // coor within subcommunicator
Coordinate scoor(_ndimension); // coor of split within parent
Coordinate ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
int crank;
// Mpi uses the reverse Lexico convention to us; so reversed routines called
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
MPI_Comm comm_split;
if ( Nchild > 1 ) {
////////////////////////////////////////////////////////////////
// Split the communicator
////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
} else {
srank = 0;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
assert(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Set up from the new split communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
InitFromMPICommunicator(processors,comm_split);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Take the right SHM buffers
//////////////////////////////////////////////////////////////////////////////////////////////////////
SetCommunicator(comm_split);
///////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////
MPI_Comm_free(&comm_split);
if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
for(int d=0;d<processors.size();d++){
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
}
}
for(int d=0;d<processors.size();d++){
assert(_processor_coor[d] == ccoor[d] );
}
}
void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors, MPI_Comm communicator_base)
{
////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo
////////////////////////////////////////////////////
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
/////////////////////////////////
// Count the requested nodes
/////////////////////////////////
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
Coordinate periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
if ( 0 && (communicator_base != communicator_world) ) {
std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
for(int d=0;d<_processors.size();d++){
std::cout << _processor_coor[d]<<" ";
}
std::cout << std::endl;
}
int Size;
MPI_Comm_size(communicator,&Size);
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
assert(Size==_Nprocessors);
}
CartesianCommunicator::~CartesianCommunicator()
{
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised) {
MPI_Comm_free(&communicator);
for(int i=0;i<communicator_halo.size();i++){
MPI_Comm_free(&communicator_halo[i]);
}
}
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
Coordinate row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
int me;
CartesianCommunicator Comm(row,*this,me);
Comm.AllToAll(in,out,words,bytes);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
// (Turns up on 32^3 x 64 Gparity too)
MPI_Datatype object;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
MPI_Type_free(&object);
}
NAMESPACE_END(Grid);

View File

@@ -27,21 +27,32 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////
Grid_MPI_Comm CartesianCommunicator::communicator_world;
void CartesianCommunicator::Init(int *argc, char *** arv)
{
ShmInitGeneric();
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
}
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
: CartesianCommunicator(processors)
{
srank=0;
SetCommunicator(communicator_world);
}
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
_processors = processors;
_ndimension = processors.size();
_ndimension = processors.size(); assert(_ndimension>=1);
_processor_coor.resize(_ndimension);
// Require 1^N processor grid for fake
@@ -51,8 +62,11 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
assert(_processors[d]==1);
_processor_coor[d] = 0;
}
SetCommunicator(communicator_world);
}
CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){}
@@ -95,19 +109,57 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
bcopy(in,out,bytes*words);
}
int CartesianCommunicator::RankWorld(void){return 0;}
void CartesianCommunicator::Barrier(void){}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){ coor = _processor_coor; }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) { return 0;}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){ coor = _processor_coor; }
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
source =0;
dest=0;
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes, int dir)
{
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void){};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,94 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
// static data
int GlobalSharedMemory::HPEhypercube = 1;
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
int GlobalSharedMemory::Hugepages = 0;
int GlobalSharedMemory::_ShmSetup;
int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
int GlobalSharedMemory::WorldShmSize;
std::vector<int> GlobalSharedMemory::WorldShmRanks;
Grid_MPI_Comm GlobalSharedMemory::WorldComm;
int GlobalSharedMemory::WorldSize;
int GlobalSharedMemory::WorldRank;
int GlobalSharedMemory::WorldNodes;
int GlobalSharedMemory::WorldNode;
void GlobalSharedMemory::SharedMemoryFree(void)
{
assert(_ShmAlloc);
assert(_ShmAllocBytes>0);
for(int r=0;r<WorldShmSize;r++){
munmap(WorldShmCommBufs[r],_ShmAllocBytes);
}
_ShmAlloc = 0;
_ShmAllocBytes = 0;
}
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
heap_top += bytes;
heap_bytes+= bytes;
if (heap_bytes >= heap_size) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
assert(heap_bytes<heap_size);
}
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr;
}
void SharedMemory::ShmBufferFreeAll(void) {
heap_top =(size_t)ShmBufferSelf();
heap_bytes=0;
}
void *SharedMemory::ShmBufferSelf(void)
{
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank];
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,165 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/GridCore.h>
#if defined (GRID_COMMS_MPI3)
#include <mpi.h>
#endif
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif
class GlobalSharedMemory {
private:
static const int MAXLOG2RANKSPERNODE = 16;
// Init once lock on the buffer allocation
static int _ShmSetup;
static int _ShmAlloc;
static uint64_t _ShmAllocBytes;
public:
///////////////////////////////////////
// HPE 8600 hypercube optimisation
///////////////////////////////////////
static int HPEhypercube;
static int ShmSetup(void) { return _ShmSetup; }
static int ShmAlloc(void) { return _ShmAlloc; }
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
static uint64_t MAX_MPI_SHM_BYTES;
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
static Grid_MPI_Comm WorldShmComm;
static int WorldShmRank;
static int WorldShmSize;
static int WorldNodes;
static int WorldNode;
static std::vector<int> WorldShmRanks;
//////////////////////////////////////////////////////////////////////////////////////
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorHypercube (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};
//////////////////////////////
// one per communicator
//////////////////////////////
class SharedMemory
{
private:
static const int MAXLOG2RANKSPERNODE = 16;
size_t heap_top;
size_t heap_bytes;
size_t heap_size;
protected:
Grid_MPI_Comm ShmComm; // for barriers
int ShmRank;
int ShmSize;
std::vector<void *> ShmCommBufs;
std::vector<int> ShmRanks;// Mapping comm ranks to Shm ranks
public:
SharedMemory() {};
~SharedMemory();
///////////////////////////////////////////////////////////////////////////////////////
// set the buffers & sizes
///////////////////////////////////////////////////////////////////////////////////////
void SetCommunicator(Grid_MPI_Comm comm);
////////////////////////////////////////////////////////////////////////
// For this instance ; disjoint buffer sets between splits if split grid
////////////////////////////////////////////////////////////////////////
void ShmBarrier(void);
///////////////////////////////////////////////////
// Call on any instance
///////////////////////////////////////////////////
void SharedMemoryTest(void);
void *ShmBufferSelf(void);
void *ShmBuffer (int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////
int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
int RankCount(void) { return GlobalSharedMemory::WorldSize;};
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,784 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <pwd.h>
#ifdef GRID_NVCC
#include <cuda_runtime_api.h>
#endif
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
WorldComm = comm;
MPI_Comm_rank(WorldComm,&WorldRank);
MPI_Comm_size(WorldComm,&WorldSize);
// WorldComm, WorldSize, WorldRank
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank
// WorldNodes
WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ?
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (WorldComm, &WorldGroup);
MPI_Comm_group (WorldShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize); for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
WorldShmRanks.resize(WorldSize);
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
int g=0;
std::vector<int> MyGroup;
MyGroup.resize(WorldShmSize);
for(int rank=0;rank<WorldSize;rank++){
if(WorldShmRanks[rank]!=MPI_UNDEFINED){
assert(g<WorldShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(WorldNodes,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the node of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
WorldNode=-1;
for(int g=0;g<WorldNodes;g++){
if (myleader == leaders_group[g]){
WorldNode=g;
}
}
assert(WorldNode!=-1);
_ShmSetup=1;
}
// Gray encode support
int BinaryToGray (int binary) {
int gray = (binary>>1)^binary;
return gray;
}
int Log2Size(int TwoToPower,int MAXLOG2)
{
int log2size = -1;
for(int i=0;i<=MAXLOG2;i++){
if ( (0x1<<i) == TwoToPower ) {
log2size = i;
break;
}
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
//////////////////////////////////////////////////////////////////////////////
// Look and see if it looks like an HPE 8600 based on hostname conventions
//////////////////////////////////////////////////////////////////////////////
const int namelen = _POSIX_HOST_NAME_MAX;
char name[namelen];
int R;
int I;
int N;
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
else OptimalCommunicatorSharedMemory(processors,optimal_comm);
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify the hypercube coordinate of this node using hostname
////////////////////////////////////////////////////////////////
// n runs 0...7 9...16 18...25 27...34 (8*4) 5 bits
// i runs 0..7 3 bits
// r runs 0..3 2 bits
// 2^10 = 1024 nodes
const int maxhdim = 10;
std::vector<int> HyperCubeCoords(maxhdim,0);
std::vector<int> RootHyperCubeCoords(maxhdim,0);
int R;
int I;
int N;
const int namelen = _POSIX_HOST_NAME_MAX;
char name[namelen];
// Parse ICE-XA hostname to get hypercube location
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
assert(nscan==3);
int nlo = N%9;
int nhi = N/9;
uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
uint32_t rootcoor = hypercoor;
//////////////////////////////////////////////////////////////////
// Print debug info
//////////////////////////////////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
std::string hname(name);
// std::cout << "hostname "<<hname<<std::endl;
// std::cout << "R " << R << " I " << I << " N "<< N
// << " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
//////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition.
//////////////////////////////////////////////////////////////////
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
hypercoor=hypercoor-rootcoor;
assert(hypercoor<WorldSize);
assert(hypercoor>=0);
//////////////////////////////////////
// Printing
//////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors.toVector();
std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<ndimension;d++){
NodeDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Map Hcube according to physical lattice
// must partition. Loop over dims and find out who would join.
////////////////////////////////////////////////////////////////
int hcoor = hypercoor;
for(int d=0;d<ndimension;d++){
int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
int msk = (0x1<<bits)-1;
HyperCoor[d]=hcoor & msk;
HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
hcoor = hcoor >> bits;
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
int Nprocessors=1;
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
int rank;
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
/////////////////////////////////////////////////////////////////
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
}
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
Coordinate processor_coor(ndimension);
Coordinate WorldDims = processors; Coordinate ShmDims(ndimension,1); Coordinate NodeDims (ndimension);
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<ndimension;d++){
NodeDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
int Nprocessors=1;
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
int rank;
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
/////////////////////////////////////////////////////////////////
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
}
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
std::vector<int> shmids(WorldShmSize);
if ( WorldShmRank == 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes;
key_t key = IPC_PRIVATE;
int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
if (Hugepages) flags|=SHM_HUGETLB;
#endif
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
int errsv = errno;
printf("Errno %d\n",errsv);
printf("key %d\n",key);
printf("size %ld\n",size);
printf("flags %d\n",flags);
perror("shmget");
exit(1);
}
}
}
MPI_Barrier(WorldShmComm);
MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
MPI_Barrier(WorldShmComm);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
if (WorldShmCommBufs[r] == (uint64_t *)-1) {
perror("Shared memory attach failure");
shmctl(shmids[r], IPC_RMID, NULL);
exit(2);
}
}
MPI_Barrier(WorldShmComm);
///////////////////////////////////
// Mark for clean up
///////////////////////////////////
for(int r=0;r<WorldShmSize;r++){
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
}
MPI_Barrier(WorldShmComm);
_ShmAlloc=1;
_ShmAllocBytes = bytes;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_NVCC
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// TODO/FIXME : NOT ALL NVLINK BOARDS have full Peer to peer connectivity.
// The annoyance is that they have partial peer 2 peer. This occurs on the 8 GPU blades.
// e.g. DGX1, supermicro board,
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
cudaSetDevice(WorldShmRank);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
auto err = cudaMalloc(&ShmCommBuf, bytes);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
for(int r=0;r<WorldShmSize;r++){
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
//////////////////////////////////////////////////
cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
//////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
{
int ierr=MPI_Bcast(&handle,
sizeof(handle),
MPI_BYTE,
r,
WorldShmComm);
assert(ierr==0);
}
///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer
///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
if ( r!=WorldShmRank ) {
err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
///////////////////////////////////////////////////////////////
// Save a copy of the device buffers
///////////////////////////////////////////////////////////////
WorldShmCommBufs[r] = thisBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#else
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
for(int r=0;r<WorldShmSize;r++){
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd == -1) {
printf("open %s failed\n",shm_name);
perror("open hugetlbfs");
exit(0);
}
int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbf and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
int fd=-1;
int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHMOPEN
////////////////////////////////////////////////////////////////////////////////////////////
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
// the posix shm virtual file system
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
char shm_name [NAME_MAX];
if ( WorldShmRank == 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes;
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if (flags) mmap_flag |= MAP_HUGETLB;
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
}
}
MPI_Barrier(WorldShmComm);
if ( WorldShmRank != 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes ;
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
int fd=shm_open(shm_name,O_RDWR,0666);
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
}
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
}
#endif
#endif // End NVCC case for GPU device buffers
/////////////////////////////////////////////////////////////////////////
// Routines accessing shared memory should route through for GPU safety
/////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
#ifdef GRID_NVCC
cudaMemset(dest,0,bytes);
#else
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{
#ifdef GRID_NVCC
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else
bcopy(src,dest,bytes);
#endif
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
int rank, size;
MPI_Comm_rank(comm,&rank);
MPI_Comm_size(comm,&size);
ShmRanks.resize(size);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
ShmCommBufs.resize(ShmSize);
//////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer
//////////////////////////////////////////////////////////////////////
assert (GlobalSharedMemory::ShmAlloc()==1);
heap_size = GlobalSharedMemory::ShmAllocBytes();
for(int r=0;r<ShmSize;r++){
uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
}
ShmBufferFreeAll();
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group FullGroup, ShmGroup;
MPI_Comm_group (comm , &FullGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void)
{
MPI_Barrier (ShmComm);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void SharedMemory::SharedMemoryTest(void)
{
ShmBarrier();
uint64_t check[3];
uint64_t magic = 0x5A5A5A;
if ( ShmRank == 0 ) {
for(uint64_t r=0;r<ShmSize;r++){
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
}
}
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
}
void *SharedMemory::ShmBuffer(int rank)
{
int gpeer = ShmRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
return (void *) remote;
}
}
SharedMemory::~SharedMemory()
{
int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised);
if ( !MPI_is_finalised ) {
MPI_Comm_free(&ShmComm);
}
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,129 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
WorldComm = 0;
WorldRank = 0;
WorldSize = 1;
WorldShmComm = 0 ;
WorldShmRank = 0 ;
WorldShmSize = 1 ;
WorldNodes = 1 ;
WorldNode = 0 ;
WorldShmRanks.resize(WorldSize); WorldShmRanks[0] = 0;
WorldShmCommBufs.resize(1);
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
optimal_comm = WorldComm;
}
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
#endif
#ifdef MAP_ANON
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
if (ShmCommBuf == (void *)MAP_FAILED) {
perror("mmap failed ");
exit(EXIT_FAILURE);
}
#ifdef MADV_HUGEPAGE
if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
#endif
bzero(ShmCommBuf,bytes);
WorldShmCommBufs[0] = ShmCommBuf;
_ShmAllocBytes=bytes;
_ShmAlloc=1;
};
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
assert(GlobalSharedMemory::ShmAlloc()==1);
ShmRanks.resize(1);
ShmCommBufs.resize(1);
ShmRanks[0] = 0;
ShmRank = 0;
ShmSize = 1;
//////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer
//////////////////////////////////////////////////////////////////////
ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
heap_size = GlobalSharedMemory::ShmAllocBytes();
ShmBufferFreeAll();
return;
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void){ return ; }
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void SharedMemory::SharedMemoryTest(void) { return; }
void *SharedMemory::ShmBuffer(int rank)
{
return NULL;
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
return NULL;
}
SharedMemory::~SharedMemory()
{};
NAMESPACE_END(Grid);

View File

@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h>
#endif
#ifdef GRID_COMMS_MPI3L
#ifdef GRID_COMMS_MPIT
#include <Grid/cshift/Cshift_mpi.h>
#endif

View File

@@ -25,10 +25,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _GRID_CSHIFT_COMMON_H_
#define _GRID_CSHIFT_COMMON_H_
#pragma once
namespace Grid {
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
@@ -36,91 +35,96 @@ namespace Grid {
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask = 0x3;
}
int so=plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int so=plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension];
int ent = 0;
int stride=rhs._grid->_slice_stride[dimension];
static Vector<std::pair<int,int> > table; table.resize(e1*e2);
int stride=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask == 0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int bo = n*e2;
buffer[off+bo+b]=rhs._odata[so+o+b];
table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
}
}
} else {
int bo=0;
std::vector<std::pair<int,int> > table;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*stride;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb &cbmask ) {
table.push_back(std::pair<int,int> (bo++,o+b));
table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
}
}
}
parallel_for(int i=0;i<table.size();i++){
buffer[off+table[i].first]=rhs._odata[so+table[i].second];
}
}
thread_for(i,ent,{
buffer[table[i].first]=rhs_v[table[i].second];
});
}
///////////////////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
Gather_plane_extract(const Lattice<vobj> &rhs,
ExtractPointerArray<typename vobj::scalar_object> pointers,
int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask = 0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int n1=rhs._grid->_slice_stride[dimension];
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension];
int n1=rhs.Grid()->_slice_stride[dimension];
auto rhs_v = rhs.View();
if ( cbmask ==0x3){
parallel_for_nest2(int n=0;n<e1;n++){
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs._odata[so+o+b];
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
}
});
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
std::cout << " Dense packed buffer WARNING " <<std::endl;
parallel_for_nest2(int n=0;n<e1;n++){
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){
int o=n*n1;
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs._odata[so+o+b];
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
}
}
});
}
}
@@ -129,82 +133,89 @@ Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int stride=rhs._grid->_slice_stride[dimension];
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension];
int stride=rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent =0;
if ( cbmask ==0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int bo =n*rhs._grid->_slice_block[dimension];
rhs._odata[so+o+b]=buffer[bo+b];
int o =n*rhs.Grid()->_slice_stride[dimension];
int bo =n*rhs.Grid()->_slice_block[dimension];
table[ent++] = std::pair<int,int>(so+o+b,bo+b);
}
}
} else {
std::vector<std::pair<int,int> > table;
int bo=0;
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*rhs._grid->_slice_stride[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
int o =n*rhs.Grid()->_slice_stride[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
table.push_back(std::pair<int,int> (so+o+b,bo++));
table[ent++]=std::pair<int,int> (so+o+b,bo++);
}
}
}
parallel_for(int i=0;i<table.size();i++){
// std::cout << "Rcv"<< table[i].first << " " << table[i].second << " " <<buffer[table[i].second]<<std::endl;
rhs._odata[table[i].first]=buffer[table[i].second];
}
}
auto rhs_v = rhs.View();
thread_for(i,ent,{
rhs_v[table[i].first]=buffer[table[i].second];
});
}
//////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerArray<typename vobj::scalar_object> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block[dimension];
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) {
parallel_for_nest2(int n=0;n<e1;n++){
auto rhs_v = rhs.View();
thread_for_collapse(2,n,e1,{
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
merge(rhs._odata[so+o+b],pointers,offset);
}
int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension];
merge(rhs_v[so+o+b],pointers,offset);
}
});
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
auto rhs_v = rhs.View();
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o = n*rhs._grid->_slice_stride[dimension];
int offset = b+n*rhs._grid->_slice_block[dimension];
int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
int o = n*rhs.Grid()->_slice_stride[dimension];
int offset = b+n*rhs.Grid()->_slice_block[dimension];
int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers,offset);
merge(rhs_v[so+o+b],pointers,offset);
}
}
}
@@ -216,69 +227,87 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typ
//////////////////////////////////////////////////////
template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs.Grid()->_slice_block[dimension];
int stride = rhs.Grid()->_slice_stride[dimension];
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
int e1=rhs._grid->_slice_nblock[dimension]; // clearly loop invariant for icpc
int e2=rhs._grid->_slice_block[dimension];
int stride = rhs._grid->_slice_stride[dimension];
if(cbmask == 0x3 ){
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
} else {
parallel_for_nest2(int n=0;n<e1;n++){
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
if ( ocb&cbmask ) {
//lhs._odata[lo+o]=rhs._odata[ro+o];
vstream(lhs._odata[lo+o],rhs._odata[ro+o]);
table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
}
}
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
}
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs._grid->_rdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
cbmask=0x3;
}
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int ro = rplane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs.Grid()->_ostride[dimension]; // base offset for start of plane
int e1=rhs._grid->_slice_nblock[dimension];
int e2=rhs._grid->_slice_block [dimension];
int stride = rhs._grid->_slice_stride[dimension];
int e1=rhs.Grid()->_slice_nblock[dimension];
int e2=rhs.Grid()->_slice_block [dimension];
int stride = rhs.Grid()->_slice_stride[dimension];
parallel_for_nest2(int n=0;n<e1;n++){
static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
int ent=0;
if ( cbmask == 0x3 ) {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) {
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
}
}}
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
}
//////////////////////////////////////////////////////
@@ -288,8 +317,8 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3);
@@ -299,9 +328,9 @@ template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &r
}
}
template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid = rhs._grid;
GridBase *grid = rhs.Grid();
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
@@ -312,24 +341,20 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
shift = (shift+fd)%fd;
// the permute type
ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
ret.Checkerboard() = grid->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
int permute_dim =grid->PermuteDim(dimension);
int permute_type=grid->PermuteType(dimension);
int permute_type_dist;
for(int x=0;x<rd;x++){
int o = 0;
// int o = 0;
int bo = x * grid->_ostride[dimension];
int cb= (cbmask==0x2)? Odd : Even;
int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sshift = grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
int sx = (x+sshift)%rd;
// FIXME : This must change where we have a
// Rotate slice.
// Document how this works ; why didn't I do this when I first wrote it...
// wrap is whether sshift > rd.
// num is sshift mod rd.
//
@@ -366,9 +391,7 @@ template<class vobj> Lattice<vobj> Cshift_local(Lattice<vobj> &ret,const Lattice
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
}
NAMESPACE_END(Grid);
}
return ret;
}
}
#endif

View File

@@ -30,37 +30,37 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define _GRID_CSHIFT_MPI_H_
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
Lattice<vobj> ret(rhs._grid);
Lattice<vobj> ret(rhs.Grid());
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
// the permute type
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
int splice_dim = rhs.Grid()->_simd_layout[dimension]>1 && (comm_dim);
if ( !comm_dim ) {
// std::cout << "Cshift_local" <<std::endl;
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
// std::cout << "Cshift_comms_simd" <<std::endl;
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
// std::cout << "Cshift_comms" <<std::endl;
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
return ret;
@@ -70,10 +70,10 @@ template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &r
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
// std::cout << "Cshift_comms dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift,0x3);
@@ -88,12 +88,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
//std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
//std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
@@ -104,25 +107,25 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs._grid;
Lattice<vobj> temp(rhs._grid);
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int pd = rhs._grid->_processors[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
commVector<vobj> send_buf(buffer_size);
commVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
@@ -142,7 +145,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
int rank = grid->_processor;
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
@@ -162,7 +165,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs._grid;
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
@@ -175,6 +178,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
@@ -186,21 +193,21 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
// int words = sizeof(vobj)/sizeof(vector_type);
std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
int bytes = buffer_size*sizeof(scalar_object);
std::vector<scalar_object *> pointers(Nsimd); //
std::vector<scalar_object *> rpointers(Nsimd); // received pointers
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
@@ -251,5 +258,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
}
}
}
NAMESPACE_END(Grid);
#endif

View File

@@ -27,13 +27,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#ifndef _GRID_CSHIFT_NONE_H_
#define _GRID_CSHIFT_NONE_H_
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
Lattice<vobj> ret(rhs.Grid());
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
Cshift_local(ret,rhs,dimension,shift);
return ret;
}
}
NAMESPACE_END(Grid);
#endif

18922
Grid/json/json.hpp Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -25,9 +25,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_H
#define GRID_LATTICE_H
#pragma once
#include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h>
#include <Grid/lattice/Lattice_arith.h>
#include <Grid/lattice/Lattice_trace.h>
#include <Grid/lattice/Lattice_transpose.h>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h>
//#include <Grid/lattice/Lattice_where.h>
#include <Grid/lattice/Lattice_rng.h>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#endif

407
Grid/lattice/Lattice_ET.h Normal file
View File

@@ -0,0 +1,407 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_ET.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_ET_H
#define GRID_LATTICE_ET_H
#include <iostream>
#include <tuple>
#include <typeinfo>
#include <vector>
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////
// Predicated where support
////////////////////////////////////////////////////
template <class iobj, class vobj, class robj>
accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
const robj &iffalse) {
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
merge(ret, falsevals);
return ret;
}
/////////////////////////////////////////////////////
//Specialization of getVectorType for lattices
/////////////////////////////////////////////////////
template<typename T>
struct getVectorType<Lattice<T> >{
typedef typename Lattice<T>::vector_object type;
};
////////////////////////////////////////////
//-- recursive evaluation of expressions; --
// handle leaves of syntax tree
///////////////////////////////////////////////////
template<class sobj> accelerator_inline
sobj eval(const uint64_t ss, const sobj &arg)
{
return arg;
}
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
{
return arg[ss];
}
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{
auto view = arg.View();
return view[ss];
}
///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand
///////////////////////////////////////////////////
template <typename Op, typename T1> accelerator_inline
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
-> decltype(expr.op.func( eval(ss, expr.arg1)))
{
return expr.op.func( eval(ss, expr.arg1) );
}
///////////////////////
// eval two operands
///////////////////////
template <typename Op, typename T1, typename T2> accelerator_inline
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
{
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
}
///////////////////////
// eval three operands
///////////////////////
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
{
return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
}
//////////////////////////////////////////////////////////////////////////
// Obtain the grid from an expression, ensuring conformable. This must follow a
// tree recursion; must retain grid pointer in the LatticeView class which sucks
// Use a different method, and make it void *.
// Perhaps a conformable method.
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
accelerator_inline void GridFromExpression(GridBase *&grid, const T1 &lat) // Lattice leaf
{
lat.Conformable(grid);
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
accelerator_inline
void GridFromExpression(GridBase *&grid,const T1 &notlat) // non-lattice leaf
{}
template <typename Op, typename T1>
accelerator_inline
void GridFromExpression(GridBase *&grid,const LatticeUnaryExpression<Op, T1> &expr)
{
GridFromExpression(grid, expr.arg1); // recurse
}
template <typename Op, typename T1, typename T2>
accelerator_inline
void GridFromExpression(GridBase *&grid, const LatticeBinaryExpression<Op, T1, T2> &expr)
{
GridFromExpression(grid, expr.arg1); // recurse
GridFromExpression(grid, expr.arg2);
}
template <typename Op, typename T1, typename T2, typename T3>
accelerator_inline
void GridFromExpression(GridBase *&grid, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
GridFromExpression(grid, expr.arg1); // recurse
GridFromExpression(grid, expr.arg2); // recurse
GridFromExpression(grid, expr.arg3); // recurse
}
//////////////////////////////////////////////////////////////////////////
// Obtain the CB from an expression, ensuring conformable. This must follow a
// tree recursion
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{
if ((cb == Odd) || (cb == Even)) {
assert(cb == lat.Checkerboard());
}
cb = lat.Checkerboard();
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
{
}
template <typename Op, typename T1> inline
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
CBFromExpression(cb, expr.arg2); // recurse AST
CBFromExpression(cb, expr.arg3); // recurse AST
}
////////////////////////////////////////////
// Unary operators and funcs
////////////////////////////////////////////
#define GridUnopClass(name, ret) \
template <class arg> \
struct name { \
static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
};
GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryAdj, adj(a));
GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryReal, real(a));
GridUnopClass(UnaryImag, imag(a));
GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a));
GridUnopClass(UnaryAcos, acos(a));
GridUnopClass(UnaryLog, log(a));
GridUnopClass(UnaryExp, exp(a));
////////////////////////////////////////////
// Binary operators
////////////////////////////////////////////
#define GridBinOpClass(name, combination) \
template <class left, class right> \
struct name { \
static auto accelerator_inline \
func(const left &lhs, const right &rhs) \
-> decltype(combination) const \
{ \
return combination; \
} \
};
GridBinOpClass(BinaryAdd, lhs + rhs);
GridBinOpClass(BinarySub, lhs - rhs);
GridBinOpClass(BinaryMul, lhs *rhs);
GridBinOpClass(BinaryDiv, lhs /rhs);
GridBinOpClass(BinaryAnd, lhs &rhs);
GridBinOpClass(BinaryOr, lhs | rhs);
GridBinOpClass(BinaryAndAnd, lhs &&rhs);
GridBinOpClass(BinaryOrOr, lhs || rhs);
////////////////////////////////////////////////////
// Trinary conditional op
////////////////////////////////////////////////////
#define GridTrinOpClass(name, combination) \
template <class predicate, class left, class right> \
struct name { \
static auto accelerator_inline \
func(const predicate &pred, const left &lhs, const right &rhs) \
-> decltype(combination) const \
{ \
return combination; \
} \
};
GridTrinOpClass(TrinaryWhere,
(predicatedWhere<predicate,
typename std::remove_reference<left>::type,
typename std::remove_reference<right>::type>(pred, lhs,rhs)));
////////////////////////////////////////////
// Operator syntactical glue
////////////////////////////////////////////
#define GRID_UNOP(name) name<decltype(eval(0, arg))>
#define GRID_BINOP(name) name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
inline auto op(const T1 &arg) ->decltype(LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg)) \
{ \
return LatticeUnaryExpression<GRID_UNOP(name),T1>(GRID_UNOP(name)(), arg); \
}
#define GRID_BINOP_LEFT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs)) \
{ \
return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs,rhs);\
}
#define GRID_BINOP_RIGHT(op, name) \
template <typename T1, typename T2, \
typename std::enable_if<!is_lattice<T1>::value&&!is_lattice_expr<T1>::value,T1>::type * = nullptr, \
typename std::enable_if< is_lattice<T2>::value|| is_lattice_expr<T2>::value,T2>::type * = nullptr> \
inline auto op(const T1 &lhs, const T2 &rhs) \
->decltype(LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs)) \
{ \
return LatticeBinaryExpression<GRID_BINOP(name),T1,T2>(GRID_BINOP(name)(),lhs, rhs); \
}
#define GRID_DEF_BINOP(op, name) \
GRID_BINOP_LEFT(op, name); \
GRID_BINOP_RIGHT(op, name);
#define GRID_DEF_TRINOP(op, name) \
template <typename T1, typename T2, typename T3> \
inline auto op(const T1 &pred, const T2 &lhs, const T3 &rhs) \
->decltype(LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs)) \
{ \
return LatticeTrinaryExpression<GRID_TRINOP(name),T1,T2,T3>(GRID_TRINOP(name)(),pred, lhs, rhs); \
}
////////////////////////
// Operator definitions
////////////////////////
GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(adj, UnaryAdj);
GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(real, UnaryReal);
GRID_DEF_UNOP(imag, UnaryImag);
GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin);
GRID_DEF_UNOP(acos, UnaryAcos);
GRID_DEF_UNOP(log, UnaryLog);
GRID_DEF_UNOP(exp, UnaryExp);
GRID_DEF_BINOP(operator+, BinaryAdd);
GRID_DEF_BINOP(operator-, BinarySub);
GRID_DEF_BINOP(operator*, BinaryMul);
GRID_DEF_BINOP(operator/, BinaryDiv);
GRID_DEF_BINOP(operator&, BinaryAnd);
GRID_DEF_BINOP(operator|, BinaryOr);
GRID_DEF_BINOP(operator&&, BinaryAndAnd);
GRID_DEF_BINOP(operator||, BinaryOrOr);
GRID_DEF_TRINOP(where, TrinaryWhere);
/////////////////////////////////////////////////////////////
// Closure convenience to force expression to evaluate
/////////////////////////////////////////////////////////////
template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
return ret;
}
template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
return ret;
}
template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, expr.arg2),
eval(0, expr.arg3)))>
{
Lattice<decltype(expr.op.func(eval(0, expr.arg1),
eval(0, expr.arg2),
eval(0, expr.arg3)))> ret(expr);
return ret;
}
#undef GRID_UNOP
#undef GRID_BINOP
#undef GRID_TRINOP
#undef GRID_DEF_UNOP
#undef GRID_DEF_BINOP
#undef GRID_DEF_TRINOP
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,257 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_arith.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_ARITH_H
#define GRID_LATTICE_ARITH_H
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
conformable(ret,rhs);
conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t = lhs_v(ss);
auto rhs_t = rhs_v(ss);
mult(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss);
sub(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss);
add(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
sub(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
add(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
});
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// avoid copy back routines for mult, mac, sub, add
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
mult(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
sub(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
auto ret_v = ret.View();
auto rhs_v = lhs.View();
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
add(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class sobj,class vobj> inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
auto ret_v = ret.View();
auto x_v = x.View();
auto y_v = y.View();
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class sobj,class vobj> inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
auto ret_v = ret.View();
auto x_v = x.View();
auto y_v = y.View();
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp);
});
}
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
return axpby_norm_fast(ret,a,b,x,y);
}
NAMESPACE_END(Grid);
#endif

466
Grid/lattice/Lattice_base.h Normal file
View File

@@ -0,0 +1,466 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_base.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#define STREAMING_STORES
NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16];
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host could could in principle direct access through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
};
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
/////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class Lattice : public LatticeAccelerator<vobj>
{
public:
GridBase *Grid(void) const { return this->_grid; }
///////////////////////////////////////////////////
// Member types
///////////////////////////////////////////////////
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef vobj vector_object;
private:
void dealloc(void)
{
alignedAllocator<vobj> alloc;
if( this->_odata_size ) {
alloc.deallocate(this->_odata,this->_odata_size);
this->_odata=nullptr;
this->_odata_size=0;
}
}
void resize(uint64_t size)
{
alignedAllocator<vobj> alloc;
if ( this->_odata_size != size ) {
dealloc();
}
this->_odata_size = size;
if ( size )
this->_odata = alloc.allocate(this->_odata_size);
else
this->_odata = nullptr;
}
public:
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas
/////////////////////////////////////////////////////////////////////////////////
LatticeView<vobj> View (void) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}
~Lattice() {
if ( this->_odata_size ) {
dealloc();
}
}
////////////////////////////////////////////////////////////////////////////////
// Expression Template closure support
////////////////////////////////////////////////////////////////////////////////
template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
{
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto me = View();
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
vstream(me[ss],tmp);
});
return *this;
}
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
{
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto me = View();
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
vstream(me[ss],tmp);
});
return *this;
}
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
{
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
assert(egrid!=nullptr);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto me = View();
accelerator_for(ss,me.size(),1,{
auto tmp = eval(ss,expr);
vstream(me[ss],tmp);
});
return *this;
}
//GridFromExpression is tricky to do
template<class Op,class T1>
Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
resize(this->_grid->oSites());
*this = expr;
}
template<class Op,class T1, class T2>
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
resize(this->_grid->oSites());
*this = expr;
}
template<class Op,class T1, class T2, class T3>
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
assert(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
resize(this->_grid->oSites());
*this = expr;
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
auto me = View();
thread_for(ss,me.size(),{
me[ss] = r;
});
return *this;
}
//////////////////////////////////////////////////////////////////
// Follow rule of five, with Constructor requires "grid" passed
// to user defined constructor
///////////////////////////////////////////
// user defined constructor
///////////////////////////////////////////
Lattice(GridBase *grid) {
this->_grid = grid;
resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0;
}
// virtual ~Lattice(void) = default;
void reset(GridBase* grid) {
if (this->_grid != grid) {
this->_grid = grid;
this->_odata.resize(grid->oSites());
this->checkerboard = 0;
}
}
///////////////////////////////////////////
// copy constructor
///////////////////////////////////////////
Lattice(const Lattice& r){
// std::cout << "Lattice constructor(const Lattice &) "<<this<<std::endl;
this->_grid = r.Grid();
resize(this->_grid->oSites());
*this = r;
}
///////////////////////////////////////////
// move constructor
///////////////////////////////////////////
Lattice(Lattice && r){
this->_grid = r.Grid();
this->_odata = r._odata;
this->_odata_size = r._odata_size;
this->checkerboard= r.Checkerboard();
r._odata = nullptr;
r._odata_size = 0;
}
///////////////////////////////////////////
// assignment template
///////////////////////////////////////////
template<class robj> inline Lattice<vobj> & operator = (const Lattice<robj> & r){
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View();
auto him= r.View();
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this;
}
///////////////////////////////////////////
// Copy assignment
///////////////////////////////////////////
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View();
auto him= r.View();
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
return *this;
}
///////////////////////////////////////////
// Move assignment possible if same type
///////////////////////////////////////////
inline Lattice<vobj> & operator = (Lattice<vobj> && r){
resize(0); // deletes if appropriate
this->_grid = r.Grid();
this->_odata = r._odata;
this->_odata_size = r._odata_size;
this->checkerboard= r.Checkerboard();
r._odata = nullptr;
r._odata_size = 0;
return *this;
}
/////////////////////////////////////////////////////////////////////////////
// *=,+=,-= operators inherit behvour from correspond */+/- operation
/////////////////////////////////////////////////////////////////////////////
template<class T> inline Lattice<vobj> &operator *=(const T &r) {
*this = (*this)*r;
return *this;
}
template<class T> inline Lattice<vobj> &operator -=(const T &r) {
*this = (*this)-r;
return *this;
}
template<class T> inline Lattice<vobj> &operator +=(const T &r) {
*this = (*this)+r;
return *this;
}
friend inline void swap(Lattice &l, Lattice &r) {
conformable(l,r);
LatticeAccelerator<vobj> tmp;
LatticeAccelerator<vobj> *lp = (LatticeAccelerator<vobj> *)&l;
LatticeAccelerator<vobj> *rp = (LatticeAccelerator<vobj> *)&r;
tmp = *lp; *lp=*rp; *rp=tmp;
}
}; // class Lattice
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
typedef typename vobj::scalar_object sobj;
for(int g=0;g<o.Grid()->_gsites;g++){
Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
sobj ss;
peekSite(ss,o,gcoor);
stream<<"[";
for(int d=0;d<gcoor.size();d++){
stream<<gcoor[d];
if(d!=gcoor.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
return stream;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,207 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_comparison.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_COMPARISON_H
#define GRID_LATTICE_COMPARISON_H
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////
// relational operators
//
// Support <,>,<=,>=,==,!=
//
//Query supporting bitwise &, |, ^, !
//Query supporting logical &&, ||,
//////////////////////////////////////////////////////////////////////////
typedef iScalar<vInteger> vPredicate ;
/*
template <class iobj, class vobj, class robj> accelerator_inline
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
{
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
merge(ret, falsevals);
return ret;
}
*/
//////////////////////////////////////////////////////////////////////////
// compare lattice to lattice
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vPredicate> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
});
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare lattice to scalar
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vPredicate> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View();
thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs);
});
return ret;
}
//////////////////////////////////////////////////////////////////////////
// compare scalar to lattice
//////////////////////////////////////////////////////////////////////////
template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{
Lattice<vPredicate> ret(rhs.Grid());
auto rhs_v = rhs.View();
auto ret_v = ret.View();
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]);
});
return ret;
}
//////////////////////////////////////////////////////////////////////////
// Map to functors
//////////////////////////////////////////////////////////////////////////
// Less than
template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vlt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator < (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vlt<lobj,robj>(),lhs,rhs);
}
// Less than equal
template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vle<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator <= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vle<lobj,robj>(),lhs,rhs);
}
// Greater than
template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vgt<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator > (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vgt<lobj,robj>(),lhs,rhs);
}
// Greater than equal
template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vge<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator >= (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vge<lobj,robj>(),lhs,rhs);
}
// equal
template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(veq<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator == (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(veq<lobj,robj>(),lhs,rhs);
}
// not equal
template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const Lattice<robj> & rhs) {
return LLComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const Lattice<lobj> & lhs, const robj & rhs) {
return LSComparison(vne<lobj,robj>(),lhs,rhs);
}
template<class lobj,class robj>
inline Lattice<vPredicate> operator != (const lobj & lhs, const Lattice<robj> & rhs) {
return SLComparison(vne<lobj,robj>(),lhs,rhs);
}
NAMESPACE_END(Grid);
#endif

View File

@@ -26,10 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMPARISON_H
#define GRID_COMPARISON_H
namespace Grid {
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////
// This implementation is a bit poor.
@@ -44,42 +44,42 @@ namespace Grid {
//
template<class lobj,class robj> class veq {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class vne {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class vlt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class vle {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class vgt {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class vge {
public:
vInteger operator()(const lobj &lhs, const robj &rhs)
accelerator vInteger operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
@@ -88,42 +88,42 @@ namespace Grid {
// Generic list of functors
template<class lobj,class robj> class seq {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) == (rhs);
}
};
template<class lobj,class robj> class sne {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) != (rhs);
}
};
template<class lobj,class robj> class slt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) < (rhs);
}
};
template<class lobj,class robj> class sle {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) <= (rhs);
}
};
template<class lobj,class robj> class sgt {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) > (rhs);
}
};
template<class lobj,class robj> class sge {
public:
Integer operator()(const lobj &lhs, const robj &rhs)
accelerator Integer operator()(const lobj &lhs, const robj &rhs)
{
return (lhs) >= (rhs);
}
@@ -133,12 +133,12 @@ namespace Grid {
// Integer and real get extra relational functions.
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<scalar> vrhs(vsimd::Nsimd());
std::vector<Integer> vpred(vsimd::Nsimd());
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
ExtractBuffer<scalar> vrhs(vsimd::Nsimd());
ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret;
extract<vsimd,scalar>(lhs,vlhs);
extract<vsimd,scalar>(rhs,vrhs);
@@ -150,11 +150,11 @@ namespace Grid {
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const vsimd & lhs, const typename vsimd::scalar_type & rhs)
{
typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd());
ExtractBuffer<scalar> vlhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret;
extract<vsimd,scalar>(lhs,vlhs);
for(int s=0;s<vsimd::Nsimd();s++){
@@ -165,11 +165,11 @@ namespace Grid {
}
template<class sfunctor, class vsimd,IfNotComplex<vsimd> = 0>
inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
accelerator_inline vInteger Comparison(sfunctor sop,const typename vsimd::scalar_type & lhs, const vsimd & rhs)
{
typedef typename vsimd::scalar_type scalar;
std::vector<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
std::vector<Integer> vpred(vsimd::Nsimd());
ExtractBuffer<scalar> vrhs(vsimd::Nsimd()); // Use functors to reduce this to single implementation
ExtractBuffer<Integer> vpred(vsimd::Nsimd());
vInteger ret;
extract<vsimd,scalar>(rhs,vrhs);
for(int s=0;s<vsimd::Nsimd();s++){
@@ -179,52 +179,54 @@ namespace Grid {
return ret;
}
#define DECLARE_RELATIONAL(op,functor) \
#define DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
accelerator_inline vInteger operator op (const vsimd & lhs, const vsimd & rhs)\
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
accelerator_inline vInteger operator op (const vsimd & lhs, const typename vsimd::scalar_type & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd,IfSimd<vsimd> = 0>\
inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
accelerator_inline vInteger operator op (const typename vsimd::scalar_type & lhs, const vsimd & rhs) \
{\
typedef typename vsimd::scalar_type scalar;\
return Comparison(functor<scalar,scalar>(),lhs,rhs);\
}\
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
} \
template<class vsimd>\
inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const typename vsimd::scalar_type &rhs) \
{ \
return lhs._internal op rhs; \
} \
template<class vsimd>\
inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
accelerator_inline vInteger operator op(const typename vsimd::scalar_type &lhs,const iScalar<vsimd> &rhs) \
{ \
return lhs op rhs._internal; \
}
} \
#define DECLARE_RELATIONAL(op,functor) \
DECLARE_RELATIONAL_EQ(op,functor) \
template<class vsimd>\
accelerator_inline vInteger operator op(const iScalar<vsimd> &lhs,const iScalar<vsimd> &rhs)\
{ \
return lhs._internal op rhs._internal; \
}
DECLARE_RELATIONAL(<,slt);
DECLARE_RELATIONAL(<=,sle);
DECLARE_RELATIONAL(>,sgt);
DECLARE_RELATIONAL(>=,sge);
DECLARE_RELATIONAL(==,seq);
DECLARE_RELATIONAL_EQ(==,seq);
DECLARE_RELATIONAL(!=,sne);
#undef DECLARE_RELATIONAL
}
NAMESPACE_END(Grid);
#endif

View File

@@ -28,13 +28,13 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_LATTICE_CONFORMABLE_H
#define GRID_LATTICE_CONFORMABLE_H
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
assert(lhs._grid == rhs._grid);
assert(lhs.checkerboard == rhs.checkerboard);
assert(lhs.Grid() == rhs.Grid());
assert(lhs.Checkerboard() == rhs.Checkerboard());
}
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,74 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_coordinate.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
{
typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type;
GridBase *grid = l.Grid();
int Nsimd = grid->iSites();
Coordinate gcoor;
ExtractBuffer<scalar_type> mergebuf(Nsimd);
vector_type vI;
auto l_v = l.View();
for(int o=0;o<grid->oSites();o++){
for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu];
}
merge<vector_type,scalar_type>(vI,mergebuf);
l_v[o]=vI;
}
};
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obscelete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,87 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_local.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_LOCALREDUCTION_H
#define GRID_LATTICE_LOCALREDUCTION_H
///////////////////////////////////////////////
// localInner, localNorm, outerProduct
///////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////
// Non site, reduced locally reduced routines
/////////////////////////////////////////////////////
// localNorm2,
template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto rhs_v = rhs.View();
auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
});
return ret;
}
// localInnerProduct
template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
});
return ret;
}
// outerProduct Scalar x Scalar -> Scalar
// Vector x Vector -> Matrix
template<class ll,class rr>
inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(ll(),rr()))>
{
typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop
ret_v[ss]=outerProduct(lhs_v[ss],rhs_v[ss]);
});
return ret;
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,202 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
NAMESPACE_BEGIN(Grid);
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto Y_v = Y.View();
auto R_v = R.View();
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_loop_collapse2( (int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v = X.View();
auto R_v = R.View();
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_loop_collapse2( (int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v = lhs.View();
auto rhs_v = rhs.View();
thread_region {
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_loop_collapse2((int n=0;n<nblock;n++),{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
ComplexD z = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(z),imag(z));
}}
}});
thread_critical {
mat += mat_thread;
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,217 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_peekpoke.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_PEEK_H
#define GRID_LATTICE_PEEK_H
///////////////////////////////////////////////
// Peeking and poking around
///////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
// FIXME accelerator_loop and accelerator_inline these
////////////////////////////////////////////////////////////////////////////////////////////////////
// Peek internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Index>(vobj(),i))>
{
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
});
return ret;
};
template<int Index,class vobj>
auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekIndex<Index>(vobj(),i,j))>
{
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard();
auto ret_v = ret.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
});
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Poke internal indices of a Lattice object
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
});
}
template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{
auto rhs_v = rhs.View();
auto lhs_v = lhs.View();
thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
});
}
//////////////////////////////////////////////////////
// Poke a scalar object into the SIMD array
//////////////////////////////////////////////////////
template<class vobj,class sobj>
void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
// Optional to broadcast from node 0.
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
grid->Broadcast(grid->BossRank(),s);
// extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf);
buf[idx] = s;
merge(l_v[odx],buf);
}
return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd);
auto l_v = l.View();
extract(l_v[odx],buf);
s = buf[idx];
grid->Broadcast(rank,s);
return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj,class sobj>
void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
GridBase *grid = l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
idx= grid->iIndex(site);
odx= grid->oIndex(site);
auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
pt[w] = vp[idx+w*Nsimd];
}
return;
};
template<class vobj,class sobj>
void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
GridBase *grid=l.Grid();
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
idx= grid->iIndex(site);
odx= grid->oIndex(site);
auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w];
}
return;
};
NAMESPACE_END(Grid);
#endif

View File

@@ -36,22 +36,28 @@ Author: neo <cossu@post.kek.jp>
// The choice of burying complex in the SIMD
// is making the use of "real" and "imag" very cumbersome
namespace Grid {
NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = adj(lhs._odata[ss]);
}
Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
});
return ret;
};
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs._grid);
parallel_for(int ss=0;ss<lhs._grid->oSites();ss++){
ret._odata[ss] = conjugate(lhs._odata[ss]);
}
Lattice<vobj> ret(lhs.Grid());
auto lhs_v = lhs.View();
auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
});
return ret;
};
}
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,787 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/Grid_Eigen_Dense.h>
#ifdef GRID_NVCC
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////
// FIXME this should promote to double and accumulate
//////////////////////////////////////////////////////
template<class vobj>
inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_object sobj;
const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#ifdef GRID_NVCC
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto arg_v = arg.View();
Integer osites = arg.Grid()->oSites();
auto ssum= sum(&arg_v[0],osites);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
ComplexD nrm = innerProduct(arg,arg);
return real(nrm);
}
// Double inner product
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
ComplexD nrm;
GridBase *grid = left.Grid();
// Might make all code paths go this way.
auto left_v = left.View();
auto right_v=right.View();
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
})
// This is in single precision and fails some tests
// Need a sumD that sums in double
nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
grid->GlobalSum(nrm);
return nrm;
}
/////////////////////////
// Fast axpby_norm
// z = a x + b y
// return norm z
/////////////////////////
template<class sobj,class vobj> strong_inline RealD
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
sobj one(1.0);
return axpby_norm_fast(z,a,one,x,y);
}
template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
z.Checkerboard() = x.Checkerboard();
conformable(z,x);
conformable(x,y);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x.Grid();
auto x_v=x.View();
auto y_v=y.View();
auto z_v=z.View();
const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp;
});
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm);
return nrm;
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.op.func(eval(0,expr.arg1)))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.op.func(eval(0,expr.arg1),eval(0,expr.arg2)))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.op.func(eval(0,expr.arg1),
eval(0,expr.arg2),
eval(0,expr.arg3)
))::scalar_object
{
return sum(closure(expr));
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
///////////////////////////////////////////////////////
// FIXME precision promoted summation
// may be important for correlation functions
// But easily avoided by using double precision fields
///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
GridBase *grid = Data.Grid();
assert(grid!=NULL);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
auto Data_v=Data.View();
thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data_v[ss];
}
}
});
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
extract(lvSum[rt],extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx];
}
}
// sum over nodes.
sobj gsum;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
} else {
gsum=Zero();
}
grid->GlobalSum(gsum);
result[t]=gsum;
}
}
template<class vobj>
static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
// std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
typedef typename vobj::scalar_type scalar_type;
std::vector<scalar_type> lsSum;
localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
// std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
}
template <class vobj>
static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
// std::cout << GridLogMessage << "Start prep" << std::endl;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs.Grid();
assert(grid!=NULL);
conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// std::cout << GridLogMessage << "Start alloc" << std::endl;
Vector<vector_type> lvSum(rd); // will locally sum vectors first
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
// std::cout << GridLogMessage << "End alloc" << std::endl;
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// std::cout << GridLogMessage << "End prep" << std::endl;
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
vector_type vv;
auto l_v=lhs.View();
auto r_v=rhs.View();
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss = so + n * stride + b;
vv = TensorRemove(innerProduct(l_v[ss], r_v[ss]));
lvSum[r] = lvSum[r] + vv;
}
}
});
// std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
}
template <class vobj>
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs.Grid();
int fd = result.size();
int ld = lsSum.size();
// sum over nodes.
std::vector<scalar_type> gsum;
gsum.resize(fd, scalar_type(0.0));
// std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum[t]=lsSum[lt];
}
}
// std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
// std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
grid->GlobalSumVector(&gsum[0], fd);
// std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
result = gsum;
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs.Grid();
assert(grid!=NULL);
conformable(grid,rhs.Grid());
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
Vector<vector_type> lvSum(rd); // will locally sum vectors first
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
auto lhv=lhs.View();
auto rhv=rhs.View();
thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
vector_type vv = TensorRemove(innerProduct(lhv[ss],rhv[ss]));
lvSum[r]=lvSum[r]+vv;
}
}
});
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// sum over nodes.
scalar_type gsum;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
} else {
gsum=scalar_type(0.0);
}
grid->GlobalSum(gsum);
result[t]=gsum;
}
}
template<class vobj>
static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = rhs.Grid()->GlobalDimensions()[Orthog];
std::vector<ComplexD> ip(Nblock);
sn.resize(Nblock);
sliceInnerProductVector(ip,rhs,rhs,Orthog);
for(int ss=0;ss<Nblock;ss++){
sn[ss] = real(ip[ss]);
}
};
template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int orthogdim,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::tensor_reduced tensor_reduced;
scalar_type zscale(scale);
GridBase *grid = X.Grid();
int Nsimd =grid->Nsimd();
int Nblock =grid->GlobalDimensions()[orthogdim];
int fd =grid->_fdimensions[orthogdim];
int ld =grid->_ldimensions[orthogdim];
int rd =grid->_rdimensions[orthogdim];
int e1 =grid->_slice_nblock[orthogdim];
int e2 =grid->_slice_block [orthogdim];
int stride =grid->_slice_stride[orthogdim];
Coordinate icoor;
for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
vector_type av;
for(int l=0;l<Nsimd;l++){
grid->iCoorFromIindex(icoor,l);
int ldx =r+icoor[orthogdim]*rd;
scalar_type *as =(scalar_type *)&av;
as[l] = scalar_type(a[ldx])*zscale;
}
tensor_reduced at; at=av;
auto Rv=R.View();
auto Xv=X.View();
auto Yv=Y.View();
thread_for_collapse(2, n, e1, {
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss];
}
});
}
};
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(0);
std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(0);
for(int d=0;d<NN;d++){
if( d!=Orthog ) {
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
}
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
*/
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto X_v=X.View();
auto Y_v=Y.View();
auto R_v=R.View();
thread_region
{
Vector<vobj> s_x(Nblock);
thread_for_collapse_in_region(2, n,nblock, {
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y_v[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl=1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
auto R_v = R.View();
auto X_v = X.View();
thread_region
{
std::vector<vobj> s_x(Nblock);
thread_for_collapse_in_region( 2 ,n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X_v[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R_v[o+i*ostride]=dot;
}
}});
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs.Grid();
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
auto lhs_v=lhs.View();
auto rhs_v=rhs.View();
thread_region
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
thread_for_collapse_in_region( 2, n,nblock,{
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs_v[o+i*ostride];
Right[i] = rhs_v[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
auto red = Reduce(rtmp);
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
}}
}});
thread_critical
{
mat += mat_thread;
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,226 @@
NAMESPACE_BEGIN(Grid);
#define WARP_SIZE 32
extern cudaDeviceProp *gpu_props;
__device__ unsigned int retirementCount = 0;
template <class Iterator>
unsigned int nextPow2(Iterator x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device;
cudaGetDevice(&device);
Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
std::cout << GridLogDebug << "GPU has:" << std::endl;
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
if (warpSize != WARP_SIZE) {
std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
exit(EXIT_FAILURE);
}
// let the number of threads in a block be a multiple of 2, starting from warpSize
threads = warpSize;
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount);
}
template <class sobj, class Iterator>
__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) {
Iterator blockSize = blockDim.x;
// cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
__syncwarp();
const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1);
sobj beta, temp;
memcpy((void *)&beta, (void *)&mySum, sizeof(sobj));
for (int i = VEC/2; i > 0; i>>=1) {
if (vid < i) {
memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj));
beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
}
__syncwarp();
}
__syncthreads();
if (threadIdx.x == 0) {
beta = Zero();
for (Iterator i = 0; i < blockSize; i += VEC) {
memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj));
beta += temp;
}
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
}
__syncthreads();
}
template <class vobj, class sobj, class Iterator>
__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
{
constexpr Iterator nsimd = vobj::Nsimd();
Iterator blockSize = blockDim.x;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *sdata = (sobj *)shmem_pointer;
// first level of reduction,
// each thread writes result in mySum
Iterator tid = threadIdx.x;
Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x;
Iterator gridSize = blockSize*2*gridDim.x;
sobj mySum = Zero();
while (i < n) {
Iterator lane = i % nsimd;
Iterator ss = i / nsimd;
auto tmp = extractLane(lane,g_idata[ss]);
sobj tmpD;
tmpD=tmp;
mySum +=tmpD;
if (i + blockSize < n) {
lane = (i+blockSize) % nsimd;
ss = (i+blockSize) / nsimd;
tmp = extractLane(lane,g_idata[ss]);
tmpD = tmp;
mySum += tmpD;
}
i += gridSize;
}
// copy mySum to shared memory and perform
// reduction for all threads in this block
reduceBlock(sdata, mySum, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
template <class vobj, class sobj,class Iterator>
__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
Iterator blockSize = blockDim.x;
// perform reduction for this block and
// write result to global memory buffer
reduceBlocks(lat, buffer, n);
if (gridDim.x > 1) {
const Iterator tid = threadIdx.x;
__shared__ bool amLast;
// force shared memory alignment
extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
// it's not possible to have two extern __shared__ arrays with same name
// but different types in different scopes -- need to cast each time
sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished
__threadfence();
if (tid==0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
// true if this block is the last block to be done
amLast = (ticket == gridDim.x-1);
}
// each thread must read the correct value of amLast
__syncthreads();
if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1]
Iterator i = tid;
sobj mySum = Zero();
while (i < gridDim.x) {
mySum += buffer[i];
i += blockSize;
}
reduceBlock(smem, mySum, tid);
if (tid==0) {
buffer[0] = smem[0];
// reset count variable
retirementCount = 0;
}
}
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
typedef decltype(lat) Iterator;
Integer nsimd= vobj::Nsimd();
Integer size = osites*nsimd;
Integer numThreads, numBlocks;
getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
Integer smemSize = numThreads * sizeof(sobj);
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if ( cudaSuccess != err ) {
printf("Cuda error %s\n",cudaGetErrorString( err ));
exit(0);
}
auto result = buffer_v[0];
return result;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
NAMESPACE_END(Grid);

525
Grid/lattice/Lattice_rng.h Normal file
View File

@@ -0,0 +1,525 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_rng.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_RNG_H
#define GRID_LATTICE_RNG_H
#include <random>
#ifdef RNG_SITMO
#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
#endif
#if defined(RNG_SITMO)
#define RNG_FAST_DISCARD
#else
#undef RNG_FAST_DISCARD
#endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////
// Allow the RNG state to be less dense than the fine grid
//////////////////////////////////////////////////////////////
inline int RNGfillable(GridBase *coarse,GridBase *fine)
{
int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension;
assert(lowerdims >= 0);
for(int d=0;d<lowerdims;d++){
assert(fine->_simd_layout[d]==1);
assert(fine->_processors[d]==1);
}
int multiplicity=1;
for(int d=0;d<lowerdims;d++){
multiplicity=multiplicity*fine->_rdimensions[d];
}
// local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){
int fd= d+lowerdims;
assert(coarse->_processors[d] == fine->_processors[fd]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
}
return multiplicity;
}
// merge of April 11 2017
// this function is necessary for the LS vectorised field
inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
{
int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0);
// assumes that the higher dimensions are not using more processors
// all further divisions are local
for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same
assert(fine->Nsimd() == coarse->Nsimd());
// check that the two grids divide cleanly
assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
return fine->lSites() / coarse->lSites();
}
// real scalars are one component
template<class scalar,class distribution,class generator>
void fillScalar(scalar &s,distribution &dist,generator & gen)
{
s=dist(gen);
}
template<class distribution,class generator>
void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{
// s=ComplexF(dist(gen),dist(gen));
s.real(dist(gen));
s.imag(dist(gen));
}
template<class distribution,class generator>
void fillScalar(ComplexD &s,distribution &dist,generator &gen)
{
// s=ComplexD(dist(gen),dist(gen));
s.real(dist(gen));
s.imag(dist(gen));
}
class GridRNGbase {
public:
// One generator per site.
// Uniform and Gaussian distributions from these generators.
#ifdef RNG_RANLUX
typedef std::ranlux48 RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 15;
#endif
#ifdef RNG_MT19937
typedef std::mt19937 RngEngine;
typedef uint32_t RngStateType;
static const int RngStateCount = std::mt19937::state_size;
#endif
#ifdef RNG_SITMO
typedef sitmo::prng_engine RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 13;
#endif
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid;
///////////////////////
// support for parallel init
///////////////////////
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// This goes by 10^12.
// Consider quenched updating; likely never exceeding rate of 1000 sweeps
// per second on any machine. This gives us of order 10^9 seconds, or 100 years
// skip ahead.
// For HMC unlikely to go at faster than a solve per second, and
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
////////////////////////////////////////////////////////////////////
// Weird compiler bug in Intel 2018.1 under O3 was generating 32bit and not 64 bit left shift.
////////////////////////////////////////////////////////////////////
volatile uint64_t skip = site;
skip = skip<<shift;
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
static RngEngine Reseed(RngEngine &eng)
{
std::vector<uint32_t> newseed;
std::uniform_int_distribution<uint32_t> uid;
return Reseed(eng,newseed,uid);
}
static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
std::uniform_int_distribution<uint32_t> &uid)
{
const int reseeds=4;
newseed.resize(reseeds);
for(int i=0;i<reseeds;i++){
newseed[i] = uid(eng);
}
std::seed_seq sseq(newseed.begin(),newseed.end());
return RngEngine(sseq);
}
void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
saved.resize(RngStateCount);
std::stringstream ss;
ss<<eng;
ss.seekg(0,ss.beg);
for(int i=0;i<RngStateCount;i++){
ss>>saved[i];
}
}
void GetState(std::vector<RngStateType> & saved,int gen) {
GetState(saved,_generators[gen]);
}
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
assert(saved.size()==RngStateCount);
std::stringstream ss;
for(int i=0;i<RngStateCount;i++){
ss<< saved[i]<<" ";
}
ss.seekg(0,ss.beg);
ss>>eng;
}
void SetState(std::vector<RngStateType> & saved,int gen){
SetState(saved,_generators[gen]);
}
void SetEngine(RngEngine &Eng, int gen){
_generators[gen]=Eng;
}
void GetEngine(RngEngine &Eng, int gen){
Eng=_generators[gen];
}
template<class source> void Seed(source &src, int gen)
{
_generators[gen] = RngEngine(src);
}
};
class GridSerialRNG : public GridRNGbase {
public:
GridSerialRNG() : GridRNGbase() {
_generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() );
}
template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
typedef typename sobj::scalar_type scalar_type;
int words = sizeof(sobj)/sizeof(scalar_type);
scalar_type *buf = (scalar_type *) & l;
dist[0].reset();
for(int idx=0;idx<words;idx++){
fillScalar(buf[idx],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(ComplexD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealF &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// vector fill
template <class distribution> inline void fill(vComplexF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vComplexD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<vRealF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<vRealD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
void SeedFixedIntegers(const std::vector<int> &seeds){
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq src(seeds.begin(),seeds.end());
Seed(src,0);
}
void SeedUniqueString(const std::string &s){
std::vector<int> seeds;
std::stringstream sha;
seeds = GridChecksum::sha256_seeds(s);
for(int i=0;i<seeds.size();i++) {
sha << std::hex << seeds[i];
}
std::cout << GridLogMessage << "Intialising serial RNG with unique string '"
<< s << "'" << std::endl;
std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
SeedFixedIntegers(seeds);
}
};
class GridParallelRNG : public GridRNGbase {
private:
double _time_counter;
GridBase *_grid;
unsigned int _vol;
public:
GridBase *Grid(void) const { return _grid; }
int generator_idx(int os,int is) {
return is*_grid->oSites()+os;
}
GridParallelRNG(GridBase *grid) : GridRNGbase() {
_grid = grid;
_vol =_grid->iSites()*_grid->oSites();
_generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
double inner_time_counter = usecond();
int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
auto l_v = l.View();
thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
int sm = multiplicity * ss + m; // Maps the generator site to the fine site
for (int si = 0; si < Nsimd; si++) {
int gdx = generator_idx(ss, si); // index of generator state
scalar_type *pointer = (scalar_type *)&buf[si];
dist[gdx].reset();
for (int idx = 0; idx < words; idx++)
fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
}
// merge into SIMD lanes, FIXME suboptimal implementation
merge(l_v[sm], buf);
}
});
// });
_time_counter += usecond()- inner_time_counter;
}
void SeedUniqueString(const std::string &s){
std::vector<int> seeds;
seeds = GridChecksum::sha256_seeds(s);
std::cout << GridLogMessage << "Intialising parallel RNG with unique string '"
<< s << "'" << std::endl;
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq source(seeds.begin(),seeds.end());
RngEngine master_engine(source);
#ifdef RNG_FAST_DISCARD
////////////////////////////////////////////////
// Skip ahead through a single stream.
// Applicable to SITMO and other has based/crypto RNGs
// Should be applicable to Mersenne Twister, but the C++11
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
// Everybody loops over global volume.
thread_for( gidx, _grid->_gsites, {
// Where is it?
int rank;
int o_idx;
int i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
});
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient
// and maximally parallel; but NOT reproducible from machine to machine.
// Not ideal, but fastest way to reseed all nodes.
////////////////////////////////////////////////////////////////
{
// Obtain one Reseed per processor
int Nproc = _grid->ProcessorCount();
std::vector<RngEngine> seeders(Nproc);
int me= _grid->ThisRank();
for(int p=0;p<Nproc;p++){
seeders[p] = Reseed(master_engine);
}
master_engine = seeders[me];
}
{
// Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads();
std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine);
}
thread_for( t, Nthread, {
// set up one per local site in threaded fashion
std::vector<uint32_t> newseeds;
std::uniform_int_distribution<uint32_t> uid;
for(int l=0;l<_grid->lSites();l++) {
if ( (l%Nthread)==t ) {
_generators[l] = Reseed(seeders[t],newseeds,uid);
}
}
});
}
#endif
}
void Report(){
std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
}
////////////////////////////////////////////////////////////////////////
// Support for rigorous test of RNG's
// Return uniform random uint32_t from requested site generator
////////////////////////////////////////////////////////////////////////
uint32_t GlobalU01(int gsite){
uint32_t the_number;
// who
int rank,o_idx,i_idx;
Coordinate gcoor;
_grid->GlobalIndexToGlobalCoor(gsite,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// draw
int l_idx=generator_idx(o_idx,i_idx);
if( rank == _grid->ThisRank() ){
the_number = _uid[l_idx](_generators[l_idx]);
}
// share & return
_grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
return the_number;
}
};
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,69 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_trace.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_TRACE_H
#define GRID_LATTICE_TRACE_H
///////////////////////////////////////////////
// Tracing, transposing, peeking, poking
///////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
});
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace Index level dependent operation
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
});
return ret;
};
NAMESPACE_END(Grid);
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,68 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_transpose.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_TRANSPOSE_H
#define GRID_LATTICE_TRANSPOSE_H
///////////////////////////////////////////////
// Transpose
///////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////
// Transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
});
return ret;
};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Index level dependent transpose
////////////////////////////////////////////////////////////////////////////////////////////////////
template<int Index,class vobj>
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
{
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
auto ret_v = ret.View();
auto lhs_v = lhs.View();
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
});
return ret;
};
NAMESPACE_END(Grid);
#endif

View File

@@ -0,0 +1,80 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_unary.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_UNARY_H
#define GRID_LATTICE_UNARY_H
NAMESPACE_BEGIN(Grid);
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),1,{
ret[ss]=pow(rhs[ss],y);
});
return ret_i;
}
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],mod(rhs(ss),y));
});
return ret_i;
}
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
Lattice<obj> ret_i(rhs_i.Grid());
auto ret = ret_i.View();
auto rhs = rhs_i.View();
ret.Checkerboard() = rhs_i.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],div(rhs(ss),y));
});
return ret_i;
}
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
Lattice<obj> ret_i(rhs_i.Grid());
auto rhs = rhs_i.View();
auto ret = ret_i.View();
ret.Checkerboard() = rhs.Checkerboard();
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
});
return ret_i;
}
NAMESPACE_END(Grid);
#endif

View File

@@ -35,7 +35,7 @@ directory
#include <cxxabi.h>
#include <memory>
namespace Grid {
NAMESPACE_BEGIN(Grid);
std::string demangle(const char* name) {
@@ -50,7 +50,7 @@ namespace Grid {
return (status==0) ? res.get() : name ;
}
GridStopWatch Logger::StopWatch;
GridStopWatch Logger::GlobalStopWatch;
int Logger::timestamp;
std::ostream Logger::devnull(0);
@@ -59,6 +59,9 @@ void GridLogTimestamp(int on){
}
Colours GridLogColours(0);
GridLogger GridLogMG (1, "MG" , GridLogColours, "NORMAL");
GridLogger GridLogIRL (1, "IRL" , GridLogColours, "NORMAL");
GridLogger GridLogSolver (1, "Solver", GridLogColours, "NORMAL");
GridLogger GridLogError (1, "Error" , GridLogColours, "RED");
GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
GridLogger GridLogMessage(1, "Message", GridLogColours, "NORMAL");
@@ -74,7 +77,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogIterative.Active(0);
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogIntegrator.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
for (int i = 0; i < logstreams.size(); i++) {
@@ -83,8 +86,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("NoMessage")) GridLogMessage.Active(0);
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance"))
GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
}
@@ -95,7 +97,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
////////////////////////////////////////////////////////////
void Grid_quiesce_nodes(void) {
int me = 0;
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
MPI_Comm_rank(MPI_COMM_WORLD, &me);
#endif
#ifdef GRID_COMMS_SHMEM
@@ -107,8 +109,9 @@ void Grid_quiesce_nodes(void) {
}
void Grid_unquiesce_nodes(void) {
#ifdef GRID_COMMS_MPI
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
std::cout.clear();
#endif
}
}
NAMESPACE_END(Grid);

View File

@@ -37,13 +37,12 @@
#include <execinfo.h>
#endif
namespace Grid {
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////
// Dress the output; use std::chrono for time stamping via the StopWatch class
//////////////////////////////////////////////////////////////////////////////////////////////////
class Colours{
protected:
bool is_active;
@@ -85,12 +84,16 @@ class Logger {
protected:
Colours &Painter;
int active;
int timing_mode;
int topWidth{-1}, chanWidth{-1};
static int timestamp;
std::string name, topName;
std::string COLOUR;
public:
static GridStopWatch StopWatch;
static GridStopWatch GlobalStopWatch;
GridStopWatch LocalStopWatch;
GridStopWatch *StopWatch;
static std::ostream devnull;
std::string background() {return Painter.colour["NORMAL"];}
@@ -101,22 +104,52 @@ public:
name(nm),
topName(topNm),
Painter(col_class),
COLOUR(col) {} ;
timing_mode(0),
COLOUR(col)
{
StopWatch = & GlobalStopWatch;
};
void Active(int on) {active = on;};
int isActive(void) {return active;};
static void Timestamp(int on) {timestamp = on;};
void Reset(void) {
StopWatch->Reset();
StopWatch->Start();
}
void TimingMode(int on) {
timing_mode = on;
if(on) {
StopWatch = &LocalStopWatch;
Reset();
}
}
void setTopWidth(const int w) {topWidth = w;}
void setChanWidth(const int w) {chanWidth = w;}
friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) {
stream << log.background()<< std::setw(8) << std::left << log.topName << log.background()<< " : ";
stream << log.colour() << std::setw(10) << std::left << log.name << log.background() << " : ";
stream << log.background()<< std::left;
if (log.topWidth > 0)
{
stream << std::setw(log.topWidth);
}
stream << log.topName << log.background()<< " : ";
stream << log.colour() << std::left;
if (log.chanWidth > 0)
{
stream << std::setw(log.chanWidth);
}
stream << log.name << log.background() << " : ";
if ( log.timestamp ) {
StopWatch.Stop();
GridTime now = StopWatch.Elapsed();
StopWatch.Start();
stream << log.evidence()<< now << log.background() << " : " ;
log.StopWatch->Stop();
GridTime now = log.StopWatch->Elapsed();
if ( log.timing_mode==1 ) log.StopWatch->Reset();
log.StopWatch->Start();
stream << log.evidence()
<< now << log.background() << " : " ;
}
stream << log.colour();
return stream;
@@ -135,6 +168,9 @@ public:
void GridLogConfigure(std::vector<std::string> &logstreams);
extern GridLogger GridLogMG;
extern GridLogger GridLogIRL;
extern GridLogger GridLogSolver;
extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
@@ -177,6 +213,6 @@ std::fprintf (fp,"BT %d %lx\n",3, __builtin_return_address(3)); std::fflush(fp);
#define BACKTRACE() BACKTRACEFP(stdout)
NAMESPACE_END(Grid);
}
#endif

View File

@@ -0,0 +1,3 @@
#include <Grid/GridCore.h>
int Grid::BinaryIO::latticeWriteMaxRetry = -1;

View File

@@ -26,10 +26,9 @@
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H
#pragma once
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO
#else
#undef USE_MPI_IO
@@ -42,8 +41,7 @@
#include <arpa/inet.h>
#include <algorithm>
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// Byte reversal garbage
@@ -81,6 +79,7 @@ inline void removeWhitespace(std::string &key)
///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO {
public:
static int latticeWriteMaxRetry;
/////////////////////////////////////////////////////////////////////////////
// more byte manipulation helpers
@@ -90,8 +89,8 @@ class BinaryIO {
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid;
int lsites = grid->lSites();
GridBase *grid = lat.Grid();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
unvectorizeToLexOrdArray(scalardata,lat);
@@ -99,64 +98,66 @@ class BinaryIO {
NerscChecksum(grid,scalardata,nersc_csum);
}
template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
template <class fobj>
static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
{
const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
uint64_t lsites = grid->lSites();
if (fbuf.size()==1) {
if (fbuf.size() == 1)
{
lsites = 1;
}
#pragma omp parallel
thread_region
{
uint32_t nersc_csum_thr = 0;
#pragma omp for
for(uint64_t local_site=0;local_site<lsites;local_site++){
thread_for_in_region( local_site, lsites,
{
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for(uint64_t j=0;j<size32;j++){
for (uint64_t j = 0; j < size32; j++)
{
nersc_csum_thr = nersc_csum_thr + site_buf[j];
}
}
});
#pragma omp critical
thread_critical
{
nersc_csum += nersc_csum_thr;
}
}
}
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
int nd = grid->_ndimension;
uint64_t lsites =grid->lSites();
if (fbuf.size()==1) {
lsites=1;
}
std::vector<int> local_vol =grid->LocalDimensions();
std::vector<int> local_start =grid->LocalStarts();
std::vector<int> global_vol =grid->FullDimensions();
Coordinate local_vol =grid->LocalDimensions();
Coordinate local_start =grid->LocalStarts();
Coordinate global_vol =grid->FullDimensions();
#pragma omp parallel
thread_region
{
std::vector<int> coor(nd);
Coordinate coor(nd);
uint32_t scidac_csuma_thr=0;
uint32_t scidac_csumb_thr=0;
uint32_t site_crc=0;
#pragma omp for
for(uint64_t local_site=0;local_site<lsites;local_site++){
thread_for_in_region( local_site, lsites,
{
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
/*
* Scidac csum is rather more heavyweight
* FIXME -- 128^3 x 256 x 16 will overflow.
*/
int global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@@ -175,9 +176,9 @@ class BinaryIO {
// std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29);
scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
}
});
#pragma omp critical
thread_critical
{
scidac_csuma^= scidac_csuma_thr;
scidac_csumb^= scidac_csumb_thr;
@@ -195,23 +196,23 @@ class BinaryIO {
{
uint32_t * f = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){
thread_for( i, count, {
f[i] = ntohl(f[i]);
}
});
}
// LE must Swap and switch to host
static inline void le32toh_v(void *file_object,uint64_t bytes)
{
uint32_t *fp = (uint32_t *)file_object;
uint32_t f;
uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){
thread_for(i,count,{
uint32_t f;
f = fp[i];
// got network order and the network to host
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = ntohl(f);
}
});
}
// BE is same as network
@@ -219,19 +220,18 @@ class BinaryIO {
{
uint64_t * f = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){
thread_for( i, count, {
f[i] = Grid_ntohll(f[i]);
}
});
}
// LE must swap and switch;
static inline void le64toh_v(void *file_object,uint64_t bytes)
{
uint64_t *fp = (uint64_t *)file_object;
uint64_t f,g;
uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){
thread_for( i, count, {
uint64_t f,g;
f = fp[i];
// got network order and the network to host
g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
@@ -239,7 +239,7 @@ class BinaryIO {
f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = Grid_ntohll(g);
}
});
}
/////////////////////////////////////////////////////////////////////////////
// Real action:
@@ -257,7 +257,7 @@ class BinaryIO {
GridBase *grid,
std::vector<fobj> &iodata,
std::string file,
int offset,
uint64_t& offset,
const std::string &format, int control,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@@ -275,13 +275,13 @@ class BinaryIO {
int nrank = grid->ProcessorCount();
int myrank = grid->ThisRank();
std::vector<int> psizes = grid->ProcessorGrid();
std::vector<int> pcoor = grid->ThisProcessorCoor();
std::vector<int> gLattice= grid->GlobalDimensions();
std::vector<int> lLattice= grid->LocalDimensions();
Coordinate psizes = grid->ProcessorGrid();
Coordinate pcoor = grid->ThisProcessorCoor();
Coordinate gLattice= grid->GlobalDimensions();
Coordinate lLattice= grid->LocalDimensions();
std::vector<int> lStart(ndim);
std::vector<int> gStart(ndim);
Coordinate lStart(ndim);
Coordinate gStart(ndim);
// Flatten the file
uint64_t lsites = grid->lSites();
@@ -342,7 +342,8 @@ class BinaryIO {
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
assert(ieee64||ieee32|ieee64big||ieee32big);
assert((ieee64+ieee32+ieee64big+ieee32big)==1);
//////////////////////////////////////////////////////////////////////////////
// Do the I/O
//////////////////////////////////////////////////////////////////////////////
@@ -352,7 +353,7 @@ class BinaryIO {
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<< "MPI read I/O "<< file<< std::endl;
std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
@@ -363,16 +364,20 @@ class BinaryIO {
assert(0);
#endif
} else {
std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in);
if ( control & BINARYIO_MASTER_APPEND ) {
if (control & BINARYIO_MASTER_APPEND)
{
fin.seekg(-sizeof(fobj), fin.end);
} else {
}
else
{
fin.seekg(offset + myrank * lsites * sizeof(fobj));
}
fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
assert(fin.fail() == 0);
fin.close();
}
timer.Stop();
@@ -405,10 +410,36 @@ class BinaryIO {
timer.Start();
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl;
ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
// std::cout << GridLogMessage << "Checking for errors" << std::endl;
if (ierr != MPI_SUCCESS)
{
char error_string[BUFSIZ];
int length_of_error_string, error_class;
MPI_Error_class(ierr, &error_class);
MPI_Error_string(error_class, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Error_string(ierr, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
}
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
MPI_Offset os;
MPI_File_get_position(fh, &os);
MPI_File_get_byte_offset(fh, os, &disp);
offset = disp;
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
@@ -416,15 +447,56 @@ class BinaryIO {
assert(0);
#endif
} else {
std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
if ( control & BINARYIO_MASTER_APPEND ) {
fout.seekp(0,fout.end);
} else {
fout.seekp(offset+myrank*lsites*sizeof(fobj));
std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
std::ofstream fout;
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
try {
if (offset) { // Must already exist and contain data
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
} else { // Allow create
fout.open(file,std::ios::binary|std::ios::out);
}
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
} catch (const std::fstream::failure& exc) {
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
// std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
if ( control & BINARYIO_MASTER_APPEND ) {
try {
fout.seekp(0,fout.end);
} catch (const std::fstream::failure& exc) {
std::cout << "Exception in seeking file end " << file << std::endl;
}
} else {
try {
fout.seekp(offset+myrank*lsites*sizeof(fobj));
} catch (const std::fstream::failure& exc) {
std::cout << "Exception in seeking file " << file <<" offset "<< offset << std::endl;
}
}
try {
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
}
catch (const std::fstream::failure& exc) {
std::cout << "Exception in writing file " << file << std::endl;
std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
offset = fout.tellp();
fout.close();
}
timer.Stop();
@@ -442,12 +514,15 @@ class BinaryIO {
//////////////////////////////////////////////////////////////////////////////
// Safety check
//////////////////////////////////////////////////////////////////////////////
// if the data size is 1 we do not want to sum over the MPI ranks
if (iodata.size() != 1){
grid->Barrier();
grid->GlobalSum(nersc_csum);
grid->GlobalXOR(scidac_csuma);
grid->GlobalXOR(scidac_csumb);
grid->Barrier();
}
}
/////////////////////////////////////////////////////////////////////////////
// Read a Lattice of object
@@ -456,7 +531,7 @@ class BinaryIO {
static inline void readLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
int offset,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@@ -465,8 +540,8 @@ class BinaryIO {
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
int lsites = grid->lSites();
GridBase *grid = Umu.Grid();
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@@ -477,7 +552,7 @@ class BinaryIO {
GridStopWatch timer;
timer.Start();
parallel_for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
thread_for(x,lsites, { munge(iodata[x], scalardata[x]); });
vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier();
@@ -493,7 +568,7 @@ class BinaryIO {
static inline void writeLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
int offset,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
@@ -501,8 +576,10 @@ class BinaryIO {
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
int lsites = grid->lSites();
GridBase *grid = Umu.Grid();
uint64_t lsites = grid->lSites(), offsetCopy = offset;
int attemptsLeft = std::max(0, BinaryIO::latticeWriteMaxRetry);
bool checkWrite = (BinaryIO::latticeWriteMaxRetry >= 0);
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@@ -513,13 +590,40 @@ class BinaryIO {
GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu);
parallel_for(int x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
thread_for(x, lsites, { munge(scalardata[x],iodata[x]); });
grid->Barrier();
timer.Stop();
while (attemptsLeft >= 0)
{
grid->Barrier();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
if (checkWrite)
{
std::vector<fobj> ckiodata(lsites);
uint32_t cknersc_csum, ckscidac_csuma, ckscidac_csumb;
uint64_t ckoffset = offsetCopy;
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
grid->Barrier();
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
cknersc_csum,ckscidac_csuma,ckscidac_csumb);
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
{
std::cout << GridLogMessage << "writeLatticeObject: read test checksum failure, re-writing (" << attemptsLeft << " attempt(s) remaining)" << std::endl;
offset = offsetCopy;
thread_for(x,lsites, { munge(scalardata[x],iodata[x]); });
}
else
{
std::cout << GridLogMessage << "writeLatticeObject: read test checksum correct" << std::endl;
break;
}
}
attemptsLeft--;
}
std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl;
}
@@ -527,10 +631,10 @@ class BinaryIO {
/////////////////////////////////////////////////////////////////////////////
// Read a RNG; use IOobject and lexico map to an array of state
//////////////////////////////////////////////////////////////////////////////////////
static inline void readRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
static inline void readRNG(GridSerialRNG &serial_rng,
GridParallelRNG &parallel_rng,
std::string file,
int offset,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
@@ -542,13 +646,13 @@ class BinaryIO {
std::string format = "IEEE32BIG";
GridBase *grid = parallel._grid;
int gsites = grid->gSites();
int lsites = grid->lSites();
GridBase *grid = parallel_rng.Grid();
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp;
uint32_t scidac_csumb_tmp;
uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp = 0;
uint32_t scidac_csumb_tmp = 0;
GridStopWatch timer;
@@ -559,11 +663,11 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb);
timer.Start();
parallel_for(int lidx=0;lidx<lsites;lidx++){
thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel.SetState(tmp,lidx);
}
parallel_rng.SetState(tmp,lidx);
});
timer.Stop();
iodata.resize(1);
@@ -573,7 +677,7 @@ class BinaryIO {
{
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
serial.SetState(tmp,0);
serial_rng.SetState(tmp,0);
}
nersc_csum = nersc_csum + nersc_csum_tmp;
@@ -589,10 +693,10 @@ class BinaryIO {
/////////////////////////////////////////////////////////////////////////////
// Write a RNG; lexico map to an array of state and use IOobject
//////////////////////////////////////////////////////////////////////////////////////
static inline void writeRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
static inline void writeRNG(GridSerialRNG &serial_rng,
GridParallelRNG &parallel_rng,
std::string file,
int offset,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
@@ -602,9 +706,9 @@ class BinaryIO {
const int RngStateCount = GridSerialRNG::RngStateCount;
typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel._grid;
int gsites = grid->gSites();
int lsites = grid->lSites();
GridBase *grid = parallel_rng.Grid();
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp;
@@ -617,20 +721,19 @@ class BinaryIO {
timer.Start();
std::vector<RNGstate> iodata(lsites);
parallel_for(int lidx=0;lidx<lsites;lidx++){
thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount);
parallel.GetState(tmp,lidx);
parallel_rng.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
}
});
timer.Stop();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
iodata.resize(1);
{
std::vector<RngStateType> tmp(RngStateCount);
serial.GetState(tmp,0);
serial_rng.GetState(tmp,0);
std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
}
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
@@ -646,5 +749,5 @@ class BinaryIO {
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
}
};
}
#endif
NAMESPACE_END(Grid);

View File

@@ -24,8 +24,7 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ILDG_IO_H
#define GRID_ILDG_IO_H
#pragma once
#ifdef HAVE_LIME
#include <algorithm>
@@ -43,8 +42,13 @@ extern "C" {
#include "lime.h"
}
namespace Grid {
namespace QCD {
NAMESPACE_BEGIN(Grid);
#define GRID_FIELD_NORM "FieldNormMetaData"
#define GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) \
0.5*fabs(FieldNormMetaData_.norm2 - n2ck)/(FieldNormMetaData_.norm2 + n2ck)
#define GRID_FIELD_NORM_CHECK(FieldNormMetaData_, n2ck) \
assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
/////////////////////////////////
// Encode word types as strings
@@ -84,10 +88,6 @@ namespace QCD {
stream << "GRID_";
stream << ScidacWordMnemonic<stype>();
// std::cout << " Lorentz N/S/V/M : " << _LorentzN<<" "<<_LorentzScalar<<"/"<<_LorentzVector<<"/"<<_LorentzMatrix<<std::endl;
// std::cout << " Spin N/S/V/M : " << _SpinN <<" "<<_SpinScalar <<"/"<<_SpinVector <<"/"<<_SpinMatrix<<std::endl;
// std::cout << " Colour N/S/V/M : " << _ColourN <<" "<<_ColourScalar <<"/"<<_ColourVector <<"/"<<_ColourMatrix<<std::endl;
if ( _LorentzVector ) stream << "_LorentzVector"<<_LorentzN;
if ( _LorentzMatrix ) stream << "_LorentzMatrix"<<_LorentzN;
@@ -138,7 +138,7 @@ namespace QCD {
/////////////////////////////////////
// Scidac Private File structure
/////////////////////////////////////
_scidacFile = scidacFile(field._grid);
_scidacFile = scidacFile(field.Grid());
/////////////////////////////////////
// Scidac Private Record structure
@@ -151,7 +151,7 @@ namespace QCD {
_scidacRecord = sr;
std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
// std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
}
///////////////////////////////////////////////////////
@@ -182,10 +182,15 @@ class GridLimeReader : public BinaryIO {
/////////////////////////////////////////////
// Open the file
/////////////////////////////////////////////
void open(std::string &_filename)
void open(const std::string &_filename)
{
filename= _filename;
File = fopen(filename.c_str(), "r");
if (File == nullptr)
{
std::cerr << "cannot open file '" << filename << "'" << std::endl;
abort();
}
LimeR = limeCreateReader(File);
}
/////////////////////////////////////////////
@@ -204,79 +209,148 @@ class GridLimeReader : public BinaryIO {
{
typedef typename vobj::scalar_object sobj;
scidacChecksum scidacChecksum_;
FieldNormMetaData FieldNormMetaData_;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
std::string format = getFormatString<vobj>();
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
std::cout << GridLogMessage << limeReaderType(LimeR) <<std::endl;
uint64_t file_bytes =limeReaderBytes(LimeR);
if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
off_t offset= ftell(File);
// std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
uint64_t PayloadSize = sizeof(sobj) * field.Grid()->_gsites;
// std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
// std::cout << "R Gsites " <<field.Grid()->_gsites<<std::endl;
// std::cout << "R Payload expected " <<PayloadSize<<std::endl;
// std::cout << "R file size " <<file_bytes <<std::endl;
assert(PayloadSize == file_bytes);// Must match or user error
uint64_t offset= ftello(File);
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< sobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
/////////////////////////////////////////////
// Insist checksum is next record
/////////////////////////////////////////////
readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name);
readScidacChecksum(scidacChecksum_,FieldNormMetaData_);
/////////////////////////////////////////////
// Verify checksums
/////////////////////////////////////////////
scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
if(FieldNormMetaData_.norm2 != 0.0){
RealD n2ck = norm2(field);
std::cout << GridLogMessage << "Field norm: metadata= " << FieldNormMetaData_.norm2
<< " / field= " << n2ck << " / rdiff= " << GRID_FIELD_NORM_CALC(FieldNormMetaData_,n2ck) << std::endl;
GRID_FIELD_NORM_CHECK(FieldNormMetaData_,n2ck);
}
assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
// find out if next field is a GridFieldNorm
return;
}
}
}
void readScidacChecksum(scidacChecksum &scidacChecksum_,
FieldNormMetaData &FieldNormMetaData_)
{
FieldNormMetaData_.norm2 =0.0;
std::string scidac_str(SCIDAC_CHECKSUM);
std::string field_norm_str(GRID_FIELD_NORM);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
std::string xmlstring = std::string(&xmlc[0]);
XmlReader RD(xmlstring, true, "");
if ( !strncmp(limeReaderType(LimeR), field_norm_str.c_str(),strlen(field_norm_str.c_str()) ) ) {
// std::cout << "FieldNormMetaData "<<xmlstring<<std::endl;
read(RD,field_norm_str,FieldNormMetaData_);
}
if ( !strncmp(limeReaderType(LimeR), scidac_str.c_str(),strlen(scidac_str.c_str()) ) ) {
// std::cout << SCIDAC_CHECKSUM << " " <<xmlstring<<std::endl;
read(RD,std::string("scidacChecksum"),scidacChecksum_);
return;
}
}
assert(0);
}
////////////////////////////////////////////
// Read a generic serialisable object
////////////////////////////////////////////
template<class serialisable_object>
void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
void readLimeObject(std::string &xmlstring,std::string record_name)
{
std::string xmlstring;
// should this be a do while; can we miss a first record??
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage<< " readLimeObject matches ! " << record_name <<std::endl;
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
XmlReader RD(&xmlc[0],"");
read(RD,object_name,object);
// std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
xmlstring = std::string(&xmlc[0]);
return;
}
}
assert(0);
}
template<class serialisable_object>
void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
{
std::string xmlstring;
readLimeObject(xmlstring, record_name);
XmlReader RD(xmlstring, true, "");
read(RD,object_name,object);
}
};
class GridLimeWriter : public BinaryIO {
class GridLimeWriter : public BinaryIO
{
public:
///////////////////////////////////////////////////
// FIXME: format for RNG? Now just binary out instead
// FIXME: collective calls or not ?
// : must know if I am the I/O boss
///////////////////////////////////////////////////
FILE *File;
LimeWriter *LimeW;
std::string filename;
void open(std::string &_filename) {
bool boss_node;
GridLimeWriter( bool isboss = true) {
boss_node = isboss;
}
void open(const std::string &_filename) {
filename= _filename;
if ( boss_node ) {
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
}
}
/////////////////////////////////////////////
// Close the file
/////////////////////////////////////////////
void close(void) {
if ( boss_node ) {
fclose(File);
}
// limeDestroyWriter(LimeW);
}
///////////////////////////////////////////////////////
@@ -284,66 +358,124 @@ class GridLimeWriter : public BinaryIO {
///////////////////////////////////////////////////////
int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
{
if ( boss_node ) {
LimeRecordHeader *h;
h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
assert(limeWriteRecordHeader(h, LimeW) >= 0);
limeDestroyHeader(h);
}
return LIME_SUCCESS;
}
////////////////////////////////////////////
// Write a generic serialisable object
////////////////////////////////////////////
template<class serialisable_object>
void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name)
void writeLimeObject(int MB,int ME,XmlWriter &writer,std::string object_name,std::string record_name)
{
std::string xmlstring;
{
XmlWriter WR("","");
write(WR,object_name,object);
xmlstring = WR.XmlString();
}
if ( boss_node ) {
std::string xmlstring = writer.docString();
// std::cout << "WriteLimeObject" << record_name <<std::endl;
uint64_t nbytes = xmlstring.size();
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); assert(h!= NULL);
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
}
////////////////////////////////////////////
}
template<class serialisable_object>
void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, const unsigned int scientificPrec = 0)
{
XmlWriter WR("","");
if (scientificPrec)
{
WR.scientificFormat(true);
WR.setPrecision(scientificPrec);
}
write(WR,object_name,object);
writeLimeObject(MB, ME, WR, object_name, record_name);
}
////////////////////////////////////////////////////
// Write a generic lattice field and csum
////////////////////////////////////////////
// This routine is Collectively called by all nodes
// in communicator used by the field.Grid()
////////////////////////////////////////////////////
template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{
////////////////////////////////////////////
// Create record header
////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
int err;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the
// the same file through different file handles (integer units).
//
// These are both buffered, so why I think this code is right is as follows.
//
// i) write record header to FILE *File, telegraphing the size.
// ii) ftell reads the offset from FILE *File .
// i) write record header to FILE *File, telegraphing the size; flush
// ii) ftello reads the offset from FILE *File .
// iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
// Closes iostream and flushes.
// iv) fseek on FILE * to end of this disjoint section.
// v) Continue writing scidac record.
////////////////////////////////////////////////////////////////////
off_t offset = ftell(File);
GridBase *grid = field.Grid();
assert(boss_node == field.Grid()->IsBoss() );
FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
////////////////////////////////////////////
// Create record header
////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
int err;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
if ( boss_node ) {
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
fflush(File);
}
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field.Grid()->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////
// Check all nodes agree on file position
////////////////////////////////////////////////
uint64_t offset1;
if ( boss_node ) {
offset1 = ftello(File);
}
grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
///////////////////////////////////////////
// The above is collective. Write by other means into the binary record
///////////////////////////////////////////
std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
///////////////////////////////////////////
// Wind forward and close the record
///////////////////////////////////////////
if ( boss_node ) {
fseek(File,0,SEEK_END);
uint64_t offset2 = ftello(File); // std::cout << " now at offset "<<offset2 << std::endl;
assert( (offset2-offset1) == PayloadSize);
}
/////////////////////////////////////////////////////////////
// Check MPI-2 I/O did what we expect to file
/////////////////////////////////////////////////////////////
if ( boss_node ) {
err=limeWriterCloseRecord(LimeW); assert(err>=0);
}
////////////////////////////////////////
// Write checksum element, propagaing forward from the BinaryIO
// Always pair a checksum with a binary object, and close message
@@ -353,30 +485,35 @@ class GridLimeWriter : public BinaryIO {
std::stringstream streamb; streamb << std::hex << scidac_csumb;
checksum.suma= streama.str();
checksum.sumb= streamb.str();
std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
if ( boss_node ) {
writeLimeObject(0,0,FNMD,std::string(GRID_FIELD_NORM),std::string(GRID_FIELD_NORM));
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
}
}
};
class ScidacWriter : public GridLimeWriter {
public:
ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss) { };
template<class SerialisableUserFile>
void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
if ( this->boss_node ) {
writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord)
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0)
{
typedef typename vobj::scalar_object sobj;
uint64_t nbytes;
GridBase * grid = field._grid;
GridBase * grid = field.Grid();
////////////////////////////////////////
// fill the Grid header
@@ -390,16 +527,81 @@ class ScidacWriter : public GridLimeWriter {
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
if ( this->boss_node ) {
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML), recordScientificPrec);
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
}
// Collective call
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
}
};
class ScidacReader : public GridLimeReader {
public:
template<class SerialisableUserFile>
void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{
typedef typename vobj::scalar_object sobj;
GridBase * grid = field.Grid();
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
FieldMetaData header;
scidacRecord _scidacRecord;
scidacFile _scidacFile;
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
}
void skipPastBinaryRecord(void) {
std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return;
}
}
}
void skipPastObjectRecord(std::string rec_name) {
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
return;
}
}
}
void skipScidacFieldRecord() {
skipPastObjectRecord(std::string(GRID_FORMAT));
skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
skipPastBinaryRecord();
}
};
class IldgWriter : public ScidacWriter {
public:
IldgWriter(bool isboss) : ScidacWriter(isboss) {};
///////////////////////////////////
// A little helper
///////////////////////////////////
@@ -420,13 +622,11 @@ class IldgWriter : public ScidacWriter {
template <class vsimd>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
{
GridBase * grid = Umu._grid;
GridBase * grid = Umu.Grid();
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
uint64_t nbytes;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
@@ -464,6 +664,12 @@ class IldgWriter : public ScidacWriter {
assert(header.nd==4);
assert(header.nd==header.dimension.size());
//////////////////////////////////////////////////////////////////////////////
// Field norm tests
//////////////////////////////////////////////////////////////////////////////
FieldNormMetaData FieldNormMetaData_;
FieldNormMetaData_.norm2 = norm2(Umu);
//////////////////////////////////////////////////////////////////////////////
// Fill the USQCD info field
//////////////////////////////////////////////////////////////////////////////
@@ -472,11 +678,12 @@ class IldgWriter : public ScidacWriter {
info.plaq = header.plaquette;
info.linktr = header.link_trace;
std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
// std::cout << GridLogMessage << " Writing config; IldgIO n2 "<< FieldNormMetaData_.norm2<<std::endl;
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,FieldNormMetaData_,FieldNormMetaData_.SerialisableClassName(),std::string(GRID_FIELD_NORM));
writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
@@ -485,7 +692,6 @@ class IldgWriter : public ScidacWriter {
writeLimeIldgLFN(header.ildg_lfn); // rec
writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
// limeDestroyWriter(LimeW);
fclose(File);
}
};
@@ -509,9 +715,9 @@ class IldgReader : public GridLimeReader {
typedef LorentzColourMatrixF fobj;
typedef LorentzColourMatrixD dobj;
GridBase *grid = Umu._grid;
GridBase *grid = Umu.Grid();
std::vector<int> dims = Umu._grid->FullDimensions();
Coordinate dims = Umu.Grid()->FullDimensions();
assert(dims.size()==4);
@@ -520,6 +726,7 @@ class IldgReader : public GridLimeReader {
std::string ildgLFN_ ;
scidacChecksum scidacChecksum_;
usqcdInfo usqcdInfo_ ;
FieldNormMetaData FieldNormMetaData_;
// track what we read from file
int found_ildgFormat =0;
@@ -528,7 +735,7 @@ class IldgReader : public GridLimeReader {
int found_usqcdInfo =0;
int found_ildgBinary =0;
int found_FieldMetaData =0;
int found_FieldNormMetaData =0;
uint32_t nersc_csum;
uint32_t scidac_csuma;
uint32_t scidac_csumb;
@@ -557,13 +764,15 @@ class IldgReader : public GridLimeReader {
// Copy out the string
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
// std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
//////////////////////////////////
// ILDG format record
std::string xmlstring(&xmlc[0]);
if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"ildgFormat",ildgFormat_);
if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
@@ -578,13 +787,13 @@ class IldgReader : public GridLimeReader {
}
if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
FieldMetaData_.ildg_lfn = std::string(&xmlc[0]);
FieldMetaData_.ildg_lfn = xmlstring;
found_ildgLFN = 1;
}
if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"FieldMetaData",FieldMetaData_);
format = FieldMetaData_.floating_point;
@@ -598,24 +807,33 @@ class IldgReader : public GridLimeReader {
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) {
XmlReader RD(&xmlc[0],"");
// is it a USQCD info field
if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) {
// std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
XmlReader RD(xmlstring, true, "");
read(RD,"usqcdInfo",usqcdInfo_);
found_usqcdInfo = 1;
}
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) {
XmlReader RD(&xmlc[0],"");
XmlReader RD(xmlstring, true, "");
read(RD,"scidacChecksum",scidacChecksum_);
found_scidacChecksum = 1;
}
if ( !strncmp(limeReaderType(LimeR), GRID_FIELD_NORM,strlen(GRID_FIELD_NORM)) ) {
XmlReader RD(xmlstring, true, "");
read(RD,GRID_FIELD_NORM,FieldNormMetaData_);
found_FieldNormMetaData = 1;
}
} else {
/////////////////////////////////
// Binary data
/////////////////////////////////
std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl;
off_t offset= ftell(File);
// std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl;
uint64_t offset= ftello(File);
if ( format == std::string("IEEE64BIG") ) {
GaugeSimpleMunger<dobj, sobj> munge;
BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
@@ -633,6 +851,7 @@ class IldgReader : public GridLimeReader {
// Minimally must find binary segment and checksum
// Since this is an ILDG reader require ILDG format
//////////////////////////////////////////////////////
assert(found_ildgLFN);
assert(found_ildgBinary);
assert(found_ildgFormat);
assert(found_scidacChecksum);
@@ -681,6 +900,13 @@ class IldgReader : public GridLimeReader {
////////////////////////////////////////////////////////////
// Really really want to mandate a scidac checksum
////////////////////////////////////////////////////////////
if ( found_FieldNormMetaData ) {
RealD nn = norm2(Umu);
GRID_FIELD_NORM_CHECK(FieldNormMetaData_,nn);
std::cout << GridLogMessage<<"FieldNormMetaData matches " << std::endl;
} else {
std::cout << GridLogWarning<<"FieldNormMetaData not found. " << std::endl;
}
if ( found_scidacChecksum ) {
FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
@@ -703,9 +929,9 @@ class IldgReader : public GridLimeReader {
}
};
}}
NAMESPACE_END(Grid);
//HAVE_LIME
#endif
#endif

View File

@@ -32,7 +32,7 @@ extern "C" { // for linkage
#include "lime.h"
}
namespace Grid {
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// Data representation of records that enter ILDG and SciDac formats
@@ -64,6 +64,11 @@ namespace Grid {
// file compatability, so should be correct to assume the undocumented but defacto file structure.
/////////////////////////////////////////////////////////////////////////////////
struct emptyUserRecord : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
emptyUserRecord() { dummy=0; };
};
////////////////////////
// Scidac private file xml
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
@@ -86,7 +91,7 @@ struct scidacFile : Serializable {
return dimensions;
}
void setDimensions(std::vector<int> dimensions) {
void setDimensions(Coordinate dimensions) {
char delimiter = ' ';
std::stringstream stream;
for(int i=0;i<dimensions.size();i++){
@@ -131,8 +136,9 @@ struct scidacRecord : Serializable {
int, typesize,
int, datacount);
scidacRecord() { version =1.0; }
scidacRecord()
: version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0)
{}
};
////////////////////////
@@ -226,6 +232,6 @@ struct usqcdPropInfo : Serializable {
};
#endif
}
NAMESPACE_END(Grid);
#endif
#endif

330
Grid/parallelIO/MetaData.h Normal file
View File

@@ -0,0 +1,330 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/NerscIO.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <map>
#include <unistd.h>
#include <sys/utsname.h>
#include <pwd.h>
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////
// Precision mapping
///////////////////////////////////////////////////////
template<class vobj> static std::string getFormatString (void)
{
std::string format;
typedef typename getPrecision<vobj>::real_scalar_type stype;
if ( sizeof(stype) == sizeof(float) ) {
format = std::string("IEEE32BIG");
}
if ( sizeof(stype) == sizeof(double) ) {
format = std::string("IEEE64BIG");
}
return format;
};
////////////////////////////////////////////////////////////////////////////////
// header specification/interpretation
////////////////////////////////////////////////////////////////////////////////
class FieldNormMetaData : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(FieldNormMetaData, double, norm2);
};
class FieldMetaData : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
int, nd,
std::vector<int>, dimension,
std::vector<std::string>, boundary,
int, data_start,
std::string, hdr_version,
std::string, storage_format,
double, link_trace,
double, plaquette,
uint32_t, checksum,
uint32_t, scidac_checksuma,
uint32_t, scidac_checksumb,
unsigned int, sequence_number,
std::string, data_type,
std::string, ensemble_id,
std::string, ensemble_label,
std::string, ildg_lfn,
std::string, creator,
std::string, creator_hardware,
std::string, creation_date,
std::string, archive_date,
std::string, floating_point);
// WARNING: non-initialised values might lead to twisted parallel IO
// issues, std::string are fine because they initliase to size 0
// as per C++ standard.
FieldMetaData(void)
: nd(4), dimension(4,0), boundary(4, ""), data_start(0),
link_trace(0.), plaquette(0.), checksum(0),
scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
{}
};
// PB disable using namespace - this is a header and forces namesapce visibility for all
// including files
//using namespace Grid;
//////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data
//////////////////////////////////////////////////////////////////////
inline void GridMetaData(GridBase *grid,FieldMetaData &header)
{
int nd = grid->_ndimension;
header.nd = nd;
header.dimension.resize(nd);
header.boundary.resize(nd);
header.data_start = 0;
for(int d=0;d<nd;d++) {
header.dimension[d] = grid->_fdimensions[d];
}
for(int d=0;d<nd;d++) {
header.boundary[d] = std::string("PERIODIC");
}
}
inline void MachineCharacteristics(FieldMetaData &header)
{
// Who
struct passwd *pw = getpwuid (getuid());
if (pw) header.creator = std::string(pw->pw_name);
// When
std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t);
std::ostringstream oss;
// oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str();
header.archive_date = header.creation_date;
// What
struct utsname name; uname(&name);
header.creator_hardware = std::string(name.nodename)+"-";
header.creator_hardware+= std::string(name.machine)+"-";
header.creator_hardware+= std::string(name.sysname)+"-";
header.creator_hardware+= std::string(name.release);
}
#define dump_meta_data(field, s) \
s << "BEGIN_HEADER" << std::endl; \
s << "HDR_VERSION = " << field.hdr_version << std::endl; \
s << "DATATYPE = " << field.data_type << std::endl; \
s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \
for(int i=0;i<4;i++){ \
s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
} \
s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \
for(int i=0;i<4;i++){ \
s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl; \
} \
\
s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl; \
s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl; \
s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl; \
s << "CREATOR = " << field.creator << std::endl; \
s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl; \
s << "CREATION_DATE = " << field.creation_date << std::endl; \
s << "ARCHIVE_DATE = " << field.archive_date << std::endl; \
s << "FLOATING_POINT = " << field.floating_point << std::endl; \
s << "END_HEADER" << std::endl;
template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
{
GridBase *grid = field.Grid();
std::string format = getFormatString<vobj>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
MachineCharacteristics(header);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
{
// How to convert data precision etc...
header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
// How to convert data precision etc...
header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{
GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixD>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
//////////////////////////////////////////////////////////////////////
// Utilities ; these are QCD aware
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
for(int mu=0;mu<Nd;mu++){
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
}
}
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
/////////////////////////////////////////////////////////////////////////////////
// Simple classes for precision conversion
/////////////////////////////////////////////////////////////////////////////////
template <class fobj, class sobj>
struct BinarySimpleUnmunger {
typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
void operator()(sobj &in, fobj &out) {
// take word by word and transform accoding to the status
fobj_stype *out_buffer = (fobj_stype *)&out;
sobj_stype *in_buffer = (sobj_stype *)&in;
size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
assert(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
}
};
template <class fobj, class sobj>
struct BinarySimpleMunger {
typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
void operator()(fobj &in, sobj &out) {
// take word by word and transform accoding to the status
fobj_stype *in_buffer = (fobj_stype *)&in;
sobj_stype *out_buffer = (sobj_stype *)&out;
size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
assert(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
}
};
template<class fobj,class sobj>
struct GaugeSimpleMunger{
void operator()(fobj &in, sobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template<class fobj,class sobj>
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
reconstruct3(out);
}
};
template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}
}
};
NAMESPACE_END(Grid);

359
Grid/parallelIO/NerscIO.h Normal file
View File

@@ -0,0 +1,359 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/NerscIO.h
Copyright (C) 2015
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
NAMESPACE_BEGIN(Grid);
using namespace Grid;
////////////////////////////////////////////////////////////////////////////////
// Write and read from fstream; comput header offset for payload
////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO {
public:
static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out);
}
static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
{
std::ofstream fout(file,std::ios::out|std::ios::in);
fout.seekp(0,std::ios::beg);
dump_meta_data(field, fout);
field.data_start = fout.tellp();
return field.data_start;
}
// for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{
std::map<std::string,std::string> header;
std::string line;
//////////////////////////////////////////////////
// read the header
//////////////////////////////////////////////////
std::ifstream fin(file);
getline(fin,line); // read one line and insist is
removeWhitespace(line);
std::cout << GridLogMessage << "* " << line << std::endl;
assert(line==std::string("BEGIN_HEADER"));
do {
getline(fin,line); // read one line
std::cout << GridLogMessage << "* "<<line<< std::endl;
int eq = line.find("=");
if(eq >0) {
std::string key=line.substr(0,eq);
std::string val=line.substr(eq+1);
removeWhitespace(key);
removeWhitespace(val);
header[key] = val;
}
} while( line.find("END_HEADER") == std::string::npos );
field.data_start = fin.tellg();
//////////////////////////////////////////////////
// chomp the values
//////////////////////////////////////////////////
field.hdr_version = header["HDR_VERSION"];
field.data_type = header["DATATYPE"];
field.storage_format = header["STORAGE_FORMAT"];
field.dimension[0] = std::stol(header["DIMENSION_1"]);
field.dimension[1] = std::stol(header["DIMENSION_2"]);
field.dimension[2] = std::stol(header["DIMENSION_3"]);
field.dimension[3] = std::stol(header["DIMENSION_4"]);
assert(grid->_ndimension == 4);
for(int d=0;d<4;d++){
assert(grid->_fdimensions[d]==field.dimension[d]);
}
field.link_trace = std::stod(header["LINK_TRACE"]);
field.plaquette = std::stod(header["PLAQUETTE"]);
field.boundary[0] = header["BOUNDARY_1"];
field.boundary[1] = header["BOUNDARY_2"];
field.boundary[2] = header["BOUNDARY_3"];
field.boundary[3] = header["BOUNDARY_4"];
field.checksum = std::stoul(header["CHECKSUM"],0,16);
field.ensemble_id = header["ENSEMBLE_ID"];
field.ensemble_label = header["ENSEMBLE_LABEL"];
field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]);
field.creator = header["CREATOR"];
field.creator_hardware = header["CREATOR_HARDWARE"];
field.creation_date = header["CREATION_DATE"];
field.archive_date = header["ARCHIVE_DATE"];
field.floating_point = header["FLOATING_POINT"];
return field.data_start;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Now the meat: the object readers
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
FieldMetaData& header,
std::string file)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu.Grid();
uint64_t offset = readHeader(file,Umu.Grid(),header);
FieldMetaData clone(header);
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else {
assert(0);
}
GaugeStatistics(Umu,clone);
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
<<" header "<<header.plaquette<<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
<<" header "<<header.link_trace<<std::endl;
if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
std::cout << " Plaquette mismatch "<<std::endl;
}
if ( nersc_csum != header.checksum ) {
std::cerr << " checksum mismatch " << std::endl;
std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0);
}
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
}
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
std::string file,
int two_row,
int bits32)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
GridBase *grid = Umu.Grid();
GridMetaData(grid,header);
assert(header.nd==4);
GaugeStatistics(Umu,header);
MachineCharacteristics(header);
uint64_t offset;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
}
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
<<std::hex<<header.checksum
<<std::dec<<" plaq "<< header.plaquette <<std::endl;
}
///////////////////////////////
// RNG state
///////////////////////////////
static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
// Following should become arguments
FieldMetaData header;
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
GridBase *grid = parallel.Grid();
GridMetaData(grid,header);
assert(header.nd==4);
header.link_trace=0.0;
header.plaquette=0.0;
MachineCharacteristics(header);
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
#endif
#ifdef RNG_MT19937
header.floating_point = std::string("UINT32");
header.data_type = std::string("MT19937");
#endif
#ifdef RNG_SITMO
header.floating_point = std::string("UINT64");
header.data_type = std::string("SITMO");
#endif
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
offset = writeHeader(header,file);
}
std::cout<<GridLogMessage
<<"Written NERSC RNG STATE "<<file<< " checksum "
<<std::hex<<header.checksum
<<std::dec<<std::endl;
}
static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
GridBase *grid = parallel.Grid();
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);
std::string format(header.floating_point);
std::string data_type(header.data_type);
#ifdef RNG_RANLUX
assert(format == std::string("UINT64"));
assert(data_type == std::string("RANLUX48"));
#endif
#ifdef RNG_MT19937
assert(format == std::string("UINT32"));
assert(data_type == std::string("MT19937"));
#endif
#ifdef RNG_SITMO
assert(format == std::string("UINT64"));
assert(data_type == std::string("SITMO"));
#endif
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
if ( nersc_csum != header.checksum ) {
std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
exit(0);
}
assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
}
};
NAMESPACE_END(QCD);
#endif

View File

@@ -29,7 +29,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <Grid/perfmon/PerfCount.h>
namespace Grid {
NAMESPACE_BEGIN(Grid);
#define CacheControl(L,O,R) ((PERF_COUNT_HW_CACHE_##L)|(PERF_COUNT_HW_CACHE_OP_##O<<8)| (PERF_COUNT_HW_CACHE_RESULT_##R<<16))
#define RawConfig(A,B) (A<<8|B)
@@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
#ifdef AVX512
#ifdef KNL
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },
@@ -72,4 +72,5 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
// { PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "STALL_CYCLES" },
#endif
};
}
NAMESPACE_END(Grid);

Some files were not shown because too many files have changed in this diff Show More