mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 14:04:32 +00:00 
			
		
		
		
	Compare commits
	
		
			283 Commits
		
	
	
		
			feature/la
			...
			feature/fe
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					f4e6824f22 | ||
| 
						 | 
					ac5cfd33a6 | ||
| 
						 | 
					f605230bbb | ||
| 
						 | 
					d54807b8c0 | ||
| 
						 | 
					5625b47c7d | ||
| 
						 | 
					1edcf902b7 | ||
| 
						 | 
					e5c19e1fd7 | ||
| 
						 | 
					a11d0a33d1 | ||
| 
						 | 
					4f8b6f26b4 | ||
| 
						 | 
					073525c5b3 | ||
| 
						 | 
					f7072d1ac2 | ||
| 
						 | 
					fddeb29d6b | ||
| 
						 | 
					a9ec5cf564 | ||
| 
						 | 
					946a8671b9 | ||
| 
						 | 
					771a1b8e79 | ||
| 
						 | 
					bfb68e6f02 | ||
| 
						 | 
					18c335198a | ||
| 
						 | 
					17c5b0f152 | ||
| 
						 | 
					5918769f97 | ||
| 
						 | 
					bbaf1ada91 | ||
| 
						 | 
					1950ac9294 | ||
| 
						 | 
					13fa70ac1a | ||
| 
						 | 
					7cb2b11f26 | ||
| 
						 | 
					1184ed29ae | ||
| 
						 | 
					203c7bf6fa | ||
| 
						 | 
					c709883f3f | ||
| 
						 | 
					aed5de4d50 | ||
| 
						 | 
					ba27cc6571 | ||
| 
						 | 
					d856327250 | ||
| 
						 | 
					d75369cb56 | ||
| 
						 | 
					bf973d0d56 | ||
| 
						 | 
					837bf8a5be | ||
| 
						 | 
					c05b2199f6 | ||
| 
						 | 
					a5fe07c077 | ||
| 
						 | 
					b83b2b1415 | ||
| 
						 | 
					b331be9101 | ||
| 
						 | 
					49c20a9fa8 | ||
| 
						 | 
					7359df3501 | ||
| 
						 | 
					59bd1fe21b | ||
| 
						 | 
					4e907fef2c | ||
| 
						 | 
					67888b657f | ||
| 
						 | 
					74af885d4e | ||
| 
						 | 
					d36d2fb40d | ||
| 
						 | 
					5b9267e88d | ||
| 
						 | 
					15fd4003ef | ||
| 
						 | 
					4b4c2a715b | ||
| 
						 | 
					54a5e6c1d0 | ||
| 
						 | 
					73aeca7dea | ||
| 
						 | 
					ad89abb018 | ||
| 
						 | 
					80c5bce5bb | ||
| 
						 | 
					f68b5de9c8 | ||
| 
						 | 
					d0f3d525d5 | ||
| 
						 | 
					f365a83fae | ||
| 
						 | 
					3a58217405 | ||
| 
						 | 
					c289699d9a | ||
| 
						 | 
					c3b1263e75 | ||
| 
						 | 
					34a9aeb331 | ||
| 102ea9ae66 | |||
| 
						 | 
					5fa386ddc9 | ||
| 
						 | 
					edabb3577f | ||
| 
						 | 
					ce5df177ee | ||
| 
						 | 
					a0bb8e5b46 | ||
| 
						 | 
					46f88e6d72 | ||
| 
						 | 
					dd8f1ea189 | ||
| 
						 | 
					b61835c1a5 | ||
| 
						 | 
					d9cd4f0273 | ||
| 
						 | 
					459f70e8d4 | ||
| 
						 | 
					061e48fd73 | ||
| 
						 | 
					ab50145001 | ||
| 
						 | 
					b49bec0cec | ||
| 
						 | 
					ae56e556c6 | ||
| 
						 | 
					1cdf999668 | ||
| 
						 | 
					11062fb686 | ||
| 
						 | 
					383ca7d392 | ||
| 
						 | 
					a446d95c33 | ||
| 
						 | 
					be66e7dd95 | ||
| 
						 | 
					6d0d064a6c | ||
| 
						 | 
					bfef525ed2 | ||
| 
						 | 
					0b0cf62193 | ||
| 
						 | 
					7d88198387 | ||
| 
						 | 
					2f619482b8 | ||
| 
						 | 
					d6472eda8d | ||
| 
						 | 
					9e658de238 | ||
| 
						 | 
					bcefdd7c4e | ||
| 
						 | 
					9d45fca8bc | ||
| 
						 | 
					ac9e6b63c0 | ||
| 
						 | 
					e140b3f802 | ||
| 
						 | 
					d9d3d30cc7 | ||
| 
						 | 
					47a12ec7b5 | ||
| 
						 | 
					ec1e2f7a40 | ||
| 
						 | 
					41f73ec083 | ||
| 
						 | 
					fd367d8bfd | ||
| 
						 | 
					6d0786ff9d | ||
| 
						 | 
					b7f93aeb4d | ||
| 
						 | 
					202a7fe900 | ||
| 
						 | 
					8a3fe60a27 | ||
| 
						 | 
					44051aecd1 | ||
| 
						 | 
					06e6f8de00 | ||
| 
						 | 
					dbe4d7850c | ||
| 
						 | 
					4fe182e5a7 | ||
| 
						 | 
					175f393f9d | ||
| 
						 | 
					7d867a8134 | ||
| 
						 | 
					9939b267d2 | ||
| 
						 | 
					14d53e1c9e | ||
| 
						 | 
					8bd869da37 | ||
| 
						 | 
					c7036f6717 | ||
| 
						 | 
					c0485d799d | ||
| 
						 | 
					7abc5613bd | ||
| 
						 | 
					237cfd11ab | ||
| 
						 | 
					a4b7dddb67 | ||
| 
						 | 
					5696781862 | ||
| 
						 | 
					8f4b3049cd | ||
| 
						 | 
					2a6e673a91 | ||
| 
						 | 
					9b6cde173f | ||
| 
						 | 
					9f280b82c4 | ||
| c3f0889eda | |||
| 
						 | 
					7a53dc3715 | ||
| 
						 | 
					0f214ad427 | ||
| 
						 | 
					fe4912880d | ||
| 
						 | 
					f038c6babe | ||
| 
						 | 
					169f4b2711 | ||
| 
						 | 
					2d8aff36fe | ||
| 
						 | 
					9fa07eecde | ||
| 
						 | 
					659d7d1a40 | ||
| 
						 | 
					f64fb7bd77 | ||
| 
						 | 
					2a35449b91 | ||
| 
						 | 
					184af5bd05 | ||
| 
						 | 
					097c9637ee | ||
| 
						 | 
					dc6f078246 | ||
| 
						 | 
					8a4714a4a6 | ||
| 
						 | 
					40e119c61c | ||
| 
						 | 
					d9593c4b81 | ||
| 
						 | 
					ac740f73ce | ||
| 
						 | 
					75dc7794b9 | ||
| 
						 | 
					dee68fc728 | ||
| 
						 | 
					a2d3643634 | ||
| 
						 | 
					57002924bc | ||
| 
						 | 
					7b0237b081 | ||
| 
						 | 
					b68ad0cc0b | ||
| 
						 | 
					37263fd9b1 | ||
| 
						 | 
					3d09e3e9e0 | ||
| 
						 | 
					1354b46338 | ||
| 
						 | 
					251a97fe1b | ||
| 
						 | 
					e18929eaa0 | ||
| 
						 | 
					f3b0a92e71 | ||
| 
						 | 
					a0be3f7330 | ||
| 
						 | 
					b5a6e4f1fd | ||
| 
						 | 
					7a788db3dc | ||
| 
						 | 
					f20eceb6cd | ||
| 
						 | 
					38325ebbc6 | ||
| 
						 | 
					b73bd151bb | ||
| 
						 | 
					694b305cab | ||
| 
						 | 
					2d3737a133 | ||
| 
						 | 
					ac1f1838bc | ||
| 
						 | 
					09d09d0fe5 | ||
| 
						 | 
					bf630a6821 | ||
| 
						 | 
					8859a151cc | ||
| 
						 | 
					688a39cfd9 | ||
| 
						 | 
					6f5a5cd9b3 | ||
| 
						 | 
					0933aeefd4 | ||
| 
						 | 
					322f61acee | ||
| 
						 | 
					08e04b9676 | ||
| feaa2ac947 | |||
| 07de925127 | |||
| 
						 | 
					a9c816a268 | ||
| 
						 | 
					e43a8b6b8a | ||
| 
						 | 
					bf729766dd | ||
| 
						 | 
					dafb351d38 | ||
| 0b707b861c | |||
| 15e87a4607 | |||
| 7d7220cbd7 | |||
| 
						 | 
					54e94360ad | ||
| 0af740dc15 | |||
| d2e8372df3 | |||
| 
						 | 
					869b99ec1e | ||
| 
						 | 
					4a29ab0d0a | ||
| 
						 | 
					0165bcb58e | ||
| 4372d04ad4 | |||
| 
						 | 
					349d75e483 | ||
| 
						 | 
					56abbdf4c2 | ||
| 
						 | 
					af71c63f4c | ||
| 
						 | 
					e51475703a | ||
| 
						 | 
					1feddf4ba6 | ||
| 
						 | 
					600d7ddc2e | ||
| 
						 | 
					e504260f3d | ||
| 
						 | 
					0440d4ce66 | ||
| 
						 | 
					5e4bea8f20 | ||
| 
						 | 
					6ebf9f15b7 | ||
| 
						 | 
					1d7aa673a4 | ||
| 
						 | 
					b9104f3072 | ||
| b22eab8c8b | |||
| 
						 | 
					a7d56523ab | ||
| 1e8a2e1621 | |||
| 7587df831a | |||
| b672717096 | |||
| 284ee194b1 | |||
| 81b18f843a | |||
| 
						 | 
					a833f88c32 | ||
| 
						 | 
					07b2c1b253 | ||
| 
						 | 
					735cbdb983 | ||
| 
						 | 
					2ad54c5a02 | ||
| 
						 | 
					3d04dc33c6 | ||
| 2490816297 | |||
| 5f55bca378 | |||
| f6aa82b7f2 | |||
| 22749699a3 | |||
| 0503c028be | |||
| 22f4feee7b | |||
| 3f858d6755 | |||
| 35fa3d1dfd | |||
| 
						 | 
					c4435e6beb | ||
| d1ece74137 | |||
| 43c817cc67 | |||
| 
						 | 
					51bf1501fc | ||
| 
						 | 
					741bc836f6 | ||
| 
						 | 
					8546d01a4c | ||
| 1407418755 | |||
| a6a0da873f | |||
| 
						 | 
					7b03d8d087 | ||
| 
						 | 
					4b759b8f2a | ||
| 
						 | 
					038b6ee9cd | ||
| 
						 | 
					38806343a8 | ||
| 
						 | 
					831ca4e3bf | ||
| eedcaf6470 | |||
| b39f0d1fb6 | |||
| 9f1267dfe6 | |||
| 2e90285232 | |||
| e254de982e | |||
| 28d99b5297 | |||
| 
						 | 
					ee93f0218b | ||
| 161ed102a5 | |||
| 
						 | 
					f65a585236 | ||
| 
						 | 
					ae99e99da2 | ||
| f3ca29af6c | |||
| 37988221a8 | |||
| 7a327a3f28 | |||
| 92f8950a56 | |||
| 65987a8a58 | |||
| 889d828bc2 | |||
| ad98b6193d | |||
| fc760016b3 | |||
| 2da86f7dae | |||
| 97843e2b58 | |||
| 82b3f54697 | |||
| 673994b281 | |||
| bbc0eff078 | |||
| 4c60e31070 | |||
| afbf7d4c37 | |||
| 8c3cc32364 | |||
| 4c3fd9fa3f | |||
| 17b3a10d46 | |||
| 149a46b92c | |||
| db9c28a773 | |||
| 9ac3ac41df | |||
| 2af9ab9034 | |||
| 6f1ea96293 | |||
| 2e3c5890b6 | |||
| bc6678732f | |||
| b10ae00c8a | |||
| 
						 | 
					6ad73145bc | ||
| f7293f2ddb | |||
| 
						 | 
					6b8ee7bae0 | ||
| 
						 | 
					739c2308b5 | ||
| 
						 | 
					a71b69389b | ||
| 
						 | 
					d49e502f53 | ||
| 
						 | 
					92ec3404f8 | ||
| 
						 | 
					f4ebea3381 | ||
| 
						 | 
					cf167d0cd1 | ||
| 
						 | 
					c363bdd784 | ||
| 
						 | 
					c30d96ea50 | ||
| 
						 | 
					7ffe17ada1 | ||
| 330a9b3f4c | |||
| 
						 | 
					28ff66a381 | ||
| 
						 | 
					78c7bcee36 | ||
| 00a7b95631 | |||
| 94d8321d01 | |||
| 
						 | 
					ac24cc9f99 | ||
| 
						 | 
					3ab4c8c0bb | ||
| 26d124283e | |||
| 0d889b7041 | |||
| ab31ad006a | |||
| 6e4a06e180 | |||
| 
						 | 
					446c768cd3 | 
							
								
								
									
										68
									
								
								.travis.yml
									
									
									
									
									
								
							
							
						
						
									
										68
									
								
								.travis.yml
									
									
									
									
									
								
							@@ -9,68 +9,6 @@ matrix:
 | 
			
		||||
    - os:        osx
 | 
			
		||||
      osx_image: xcode8.3
 | 
			
		||||
      compiler: clang
 | 
			
		||||
    - compiler: gcc
 | 
			
		||||
      dist: trusty
 | 
			
		||||
      sudo: required
 | 
			
		||||
      addons:
 | 
			
		||||
        apt:
 | 
			
		||||
          sources:
 | 
			
		||||
            - ubuntu-toolchain-r-test
 | 
			
		||||
          packages:
 | 
			
		||||
            - g++-4.9
 | 
			
		||||
            - libmpfr-dev
 | 
			
		||||
            - libgmp-dev
 | 
			
		||||
            - libmpc-dev
 | 
			
		||||
            - libopenmpi-dev
 | 
			
		||||
            - openmpi-bin
 | 
			
		||||
            - binutils-dev
 | 
			
		||||
      env: VERSION=-4.9
 | 
			
		||||
    - compiler: gcc
 | 
			
		||||
      dist: trusty
 | 
			
		||||
      sudo: required
 | 
			
		||||
      addons:
 | 
			
		||||
        apt:
 | 
			
		||||
          sources:
 | 
			
		||||
            - ubuntu-toolchain-r-test
 | 
			
		||||
          packages:
 | 
			
		||||
            - g++-5
 | 
			
		||||
            - libmpfr-dev
 | 
			
		||||
            - libgmp-dev
 | 
			
		||||
            - libmpc-dev
 | 
			
		||||
            - libopenmpi-dev
 | 
			
		||||
            - openmpi-bin
 | 
			
		||||
            - binutils-dev
 | 
			
		||||
      env: VERSION=-5
 | 
			
		||||
    - compiler: clang
 | 
			
		||||
      dist: trusty
 | 
			
		||||
      addons:
 | 
			
		||||
        apt:
 | 
			
		||||
          sources:
 | 
			
		||||
            - ubuntu-toolchain-r-test
 | 
			
		||||
          packages:
 | 
			
		||||
            - g++-4.8
 | 
			
		||||
            - libmpfr-dev
 | 
			
		||||
            - libgmp-dev
 | 
			
		||||
            - libmpc-dev
 | 
			
		||||
            - libopenmpi-dev
 | 
			
		||||
            - openmpi-bin
 | 
			
		||||
            - binutils-dev
 | 
			
		||||
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 | 
			
		||||
    - compiler: clang
 | 
			
		||||
      dist: trusty
 | 
			
		||||
      addons:
 | 
			
		||||
        apt:
 | 
			
		||||
          sources:
 | 
			
		||||
            - ubuntu-toolchain-r-test
 | 
			
		||||
          packages:
 | 
			
		||||
            - g++-4.8
 | 
			
		||||
            - libmpfr-dev
 | 
			
		||||
            - libgmp-dev
 | 
			
		||||
            - libmpc-dev
 | 
			
		||||
            - libopenmpi-dev
 | 
			
		||||
            - openmpi-bin
 | 
			
		||||
            - binutils-dev
 | 
			
		||||
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 | 
			
		||||
      
 | 
			
		||||
before_install:
 | 
			
		||||
    - export GRIDDIR=`pwd`
 | 
			
		||||
@@ -106,9 +44,3 @@ script:
 | 
			
		||||
    - make -j4
 | 
			
		||||
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
 | 
			
		||||
    - make check
 | 
			
		||||
    - echo make clean
 | 
			
		||||
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
 | 
			
		||||
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
 | 
			
		||||
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										281
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										281
									
								
								README.md
									
									
									
									
									
								
							@@ -1,27 +1,44 @@
 | 
			
		||||
# Grid
 | 
			
		||||
<table>
 | 
			
		||||
<tr>
 | 
			
		||||
    <td>Last stable release</td>
 | 
			
		||||
    <td><a href="https://travis-ci.org/paboyle/Grid">
 | 
			
		||||
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
 | 
			
		||||
    </td>
 | 
			
		||||
</tr>
 | 
			
		||||
<tr>
 | 
			
		||||
    <td>Development branch</td>
 | 
			
		||||
    <td><a href="https://travis-ci.org/paboyle/Grid">
 | 
			
		||||
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
 | 
			
		||||
    </td>
 | 
			
		||||
</tr>
 | 
			
		||||
</table>
 | 
			
		||||
# Grid [),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [](https://travis-ci.org/paboyle/Grid)
 | 
			
		||||
 | 
			
		||||
**Data parallel C++ mathematical object library.**
 | 
			
		||||
 | 
			
		||||
License: GPL v2.
 | 
			
		||||
 | 
			
		||||
Last update Nov 2016.
 | 
			
		||||
Last update June 2017.
 | 
			
		||||
 | 
			
		||||
_Please do not send pull requests to the `master` branch which is reserved for releases._
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Description
 | 
			
		||||
This library provides data parallel C++ container classes with internal memory layout
 | 
			
		||||
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 | 
			
		||||
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
 | 
			
		||||
array indices to both MPI tasks and SIMD processing elements.
 | 
			
		||||
 | 
			
		||||
* Identically shaped arrays then be processed with perfect data parallelisation.
 | 
			
		||||
* Such identically shaped arrays are called conformable arrays.
 | 
			
		||||
 | 
			
		||||
The transformation is based on the observation that Cartesian array processing involves
 | 
			
		||||
identical processing to be performed on different regions of the Cartesian array.
 | 
			
		||||
 | 
			
		||||
The library will both geometrically decompose into MPI tasks and across SIMD lanes.
 | 
			
		||||
Local vector loops are parallelised with OpenMP pragmas.
 | 
			
		||||
 | 
			
		||||
Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
 | 
			
		||||
optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
 | 
			
		||||
for most programmers.
 | 
			
		||||
 | 
			
		||||
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
 | 
			
		||||
Presently SSE4, ARM NEON (128 bits) AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
 | 
			
		||||
 | 
			
		||||
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. 
 | 
			
		||||
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 | 
			
		||||
 | 
			
		||||
MPI, OpenMP, and SIMD parallelism are present in the library.
 | 
			
		||||
Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Compilers
 | 
			
		||||
 | 
			
		||||
Intel ICPC v16.0.3 and later
 | 
			
		||||
@@ -56,35 +73,25 @@ When you file an issue, please go though the following checklist:
 | 
			
		||||
6. Attach the output of `make V=1`.
 | 
			
		||||
7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
 | 
			
		||||
 | 
			
		||||
### Required libraries
 | 
			
		||||
Grid requires:
 | 
			
		||||
 | 
			
		||||
[GMP](https://gmplib.org/), 
 | 
			
		||||
 | 
			
		||||
### Description
 | 
			
		||||
This library provides data parallel C++ container classes with internal memory layout
 | 
			
		||||
that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
 | 
			
		||||
are provided, similar to HPF and cmfortran, and user control is given over the mapping of
 | 
			
		||||
array indices to both MPI tasks and SIMD processing elements.
 | 
			
		||||
[MPFR](http://www.mpfr.org/) 
 | 
			
		||||
 | 
			
		||||
* Identically shaped arrays then be processed with perfect data parallelisation.
 | 
			
		||||
* Such identically shaped arrays are called conformable arrays.
 | 
			
		||||
Bootstrapping grid downloads and uses for internal dense matrix (non-QCD operations) the Eigen library.
 | 
			
		||||
 | 
			
		||||
The transformation is based on the observation that Cartesian array processing involves
 | 
			
		||||
identical processing to be performed on different regions of the Cartesian array.
 | 
			
		||||
Grid optionally uses:
 | 
			
		||||
 | 
			
		||||
The library will both geometrically decompose into MPI tasks and across SIMD lanes.
 | 
			
		||||
Local vector loops are parallelised with OpenMP pragmas.
 | 
			
		||||
[HDF5](https://support.hdfgroup.org/HDF5/)  
 | 
			
		||||
 | 
			
		||||
Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
 | 
			
		||||
optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
 | 
			
		||||
for most programmers.
 | 
			
		||||
[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support. 
 | 
			
		||||
 | 
			
		||||
The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
 | 
			
		||||
Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
 | 
			
		||||
[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
 | 
			
		||||
 | 
			
		||||
These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
 | 
			
		||||
The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
 | 
			
		||||
LAPACK either generic version or Intel MKL library.
 | 
			
		||||
 | 
			
		||||
MPI, OpenMP, and SIMD parallelism are present in the library.
 | 
			
		||||
Please see https://arxiv.org/abs/1512.03487 for more detail.
 | 
			
		||||
 | 
			
		||||
### Quick start
 | 
			
		||||
First, start by cloning the repository:
 | 
			
		||||
@@ -155,7 +162,6 @@ The following options can be use with the `--enable-comms=` option to target dif
 | 
			
		||||
| `none`         | no communications                                             |
 | 
			
		||||
| `mpi[-auto]`   | MPI communications                                            |
 | 
			
		||||
| `mpi3[-auto]`  | MPI communications using MPI 3 shared memory                  |
 | 
			
		||||
| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | 
			
		||||
| `shmem `       | Cray SHMEM communications                                     |
 | 
			
		||||
 | 
			
		||||
For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.  
 | 
			
		||||
@@ -173,7 +179,8 @@ The following options can be use with the `--enable-simd=` option to target diff
 | 
			
		||||
| `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | 
			
		||||
| `AVX2`      | AVX 2 (256 bit)                        |
 | 
			
		||||
| `AVX512`    | AVX 512 bit                            |
 | 
			
		||||
| `QPX`       | QPX (256 bit)                          |
 | 
			
		||||
| `NEONv8`    | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit)                     |
 | 
			
		||||
| `QPX`       | IBM QPX (256 bit)                      |
 | 
			
		||||
 | 
			
		||||
Alternatively, some CPU codenames can be directly used:
 | 
			
		||||
 | 
			
		||||
@@ -195,21 +202,205 @@ The following configuration is recommended for the Intel Knights Landing platfor
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=KNL        \
 | 
			
		||||
             --enable-comms=mpi-auto \
 | 
			
		||||
             --with-gmp=<path>        \
 | 
			
		||||
             --with-mpfr=<path>       \
 | 
			
		||||
             --enable-comms=mpi-auto  \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=icpc MPICXX=mpiicpc
 | 
			
		||||
```
 | 
			
		||||
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 | 
			
		||||
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 | 
			
		||||
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=KNL        \
 | 
			
		||||
             --enable-comms=mpi       \
 | 
			
		||||
             --with-gmp=<path>        \
 | 
			
		||||
             --with-mpfr=<path>       \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=CC CC=cc
 | 
			
		||||
```
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 | 
			
		||||
``` bash
 | 
			
		||||
               --with-gmp=<path>        \
 | 
			
		||||
               --with-mpfr=<path>       \
 | 
			
		||||
```
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 | 
			
		||||
 | 
			
		||||
Knight's Landing with Intel Omnipath adapters with two adapters per node 
 | 
			
		||||
presently performs better with use of more than one rank per node, using shared memory 
 | 
			
		||||
for interior communication. This is the mpi3 communications implementation. 
 | 
			
		||||
We recommend four ranks per node for best performance, but optimum is local volume dependent.
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=KNL        \
 | 
			
		||||
             --enable-comms=mpi3-auto \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CC=icpc MPICXX=mpiicpc 
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
### Build setup for Intel Haswell Xeon platform
 | 
			
		||||
 | 
			
		||||
The following configuration is recommended for the Intel Haswell platform:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX2       \
 | 
			
		||||
             --enable-comms=mpi3-auto \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=icpc MPICXX=mpiicpc
 | 
			
		||||
```
 | 
			
		||||
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 | 
			
		||||
 | 
			
		||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 | 
			
		||||
``` bash
 | 
			
		||||
               --with-gmp=<path>        \
 | 
			
		||||
               --with-mpfr=<path>       \
 | 
			
		||||
```
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 | 
			
		||||
 | 
			
		||||
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX2       \
 | 
			
		||||
             --enable-comms=mpi3      \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=CC CC=cc
 | 
			
		||||
```
 | 
			
		||||
Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
 | 
			
		||||
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
 | 
			
		||||
```
 | 
			
		||||
        export I_MPI_PIN=1
 | 
			
		||||
```
 | 
			
		||||
This is the default.
 | 
			
		||||
 | 
			
		||||
### Build setup for Intel Skylake Xeon platform
 | 
			
		||||
 | 
			
		||||
The following configuration is recommended for the Intel Skylake platform:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX512     \
 | 
			
		||||
             --enable-comms=mpi3      \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=mpiicpc
 | 
			
		||||
```
 | 
			
		||||
The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 | 
			
		||||
 | 
			
		||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 | 
			
		||||
``` bash
 | 
			
		||||
               --with-gmp=<path>        \
 | 
			
		||||
               --with-mpfr=<path>       \
 | 
			
		||||
```
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 | 
			
		||||
 | 
			
		||||
If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX512     \
 | 
			
		||||
             --enable-comms=mpi3      \
 | 
			
		||||
             --enable-mkl             \
 | 
			
		||||
             CXX=CC CC=cc
 | 
			
		||||
```
 | 
			
		||||
Since Dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of 
 | 
			
		||||
one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
 | 
			
		||||
``` 
 | 
			
		||||
        export I_MPI_PIN=1
 | 
			
		||||
```
 | 
			
		||||
This is the default. 
 | 
			
		||||
 | 
			
		||||
#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping): 
 | 
			
		||||
 | 
			
		||||
mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18 
 | 
			
		||||
 | 
			
		||||
TBA
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Build setup for AMD EPYC / RYZEN
 | 
			
		||||
 | 
			
		||||
The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips each with 8 cores.
 | 
			
		||||
So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
 | 
			
		||||
are common. Each chip within the module exposes a separate NUMA domain.
 | 
			
		||||
There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
 | 
			
		||||
MPI-3 is recommended with the use of four ranks per socket,
 | 
			
		||||
and 8 threads per rank. 
 | 
			
		||||
 | 
			
		||||
The following configuration is recommended for the AMD EPYC platform.
 | 
			
		||||
 | 
			
		||||
``` bash
 | 
			
		||||
../configure --enable-precision=double\
 | 
			
		||||
             --enable-simd=AVX2       \
 | 
			
		||||
             --enable-comms=mpi3 \
 | 
			
		||||
             CXX=mpicxx 
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
If gmp and mpfr are NOT in standard places (/usr/) these flags may be needed:
 | 
			
		||||
``` bash
 | 
			
		||||
               --with-gmp=<path>        \
 | 
			
		||||
               --with-mpfr=<path>       \
 | 
			
		||||
```
 | 
			
		||||
where `<path>` is the UNIX prefix where GMP and MPFR are installed. 
 | 
			
		||||
 | 
			
		||||
Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
 | 
			
		||||
This can be done by invoking MPI on a wrapper script omp_bind.sh to handle this. 
 | 
			
		||||
 | 
			
		||||
It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank using MPI3 and
 | 
			
		||||
shared memory to communicate within this node:
 | 
			
		||||
 | 
			
		||||
mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4 
 | 
			
		||||
 | 
			
		||||
Where omp_bind.sh does the following:
 | 
			
		||||
```
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
numanode=` expr $PMI_RANK % 8 `
 | 
			
		||||
basecore=`expr $numanode \* 16`
 | 
			
		||||
core0=`expr $basecore + 0 `
 | 
			
		||||
core1=`expr $basecore + 2 `
 | 
			
		||||
core2=`expr $basecore + 4 `
 | 
			
		||||
core3=`expr $basecore + 6 `
 | 
			
		||||
core4=`expr $basecore + 8 `
 | 
			
		||||
core5=`expr $basecore + 10 `
 | 
			
		||||
core6=`expr $basecore + 12 `
 | 
			
		||||
core7=`expr $basecore + 14 `
 | 
			
		||||
 | 
			
		||||
export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
 | 
			
		||||
echo GOMP_CUP_AFFINITY $GOMP_CPU_AFFINITY
 | 
			
		||||
 | 
			
		||||
$@
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
Performance:
 | 
			
		||||
 | 
			
		||||
#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping): 
 | 
			
		||||
 | 
			
		||||
mpirun  -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
 | 
			
		||||
 | 
			
		||||
TBA
 | 
			
		||||
 | 
			
		||||
### Build setup for BlueGene/Q
 | 
			
		||||
 | 
			
		||||
To be written...
 | 
			
		||||
 | 
			
		||||
### Build setup for ARM Neon
 | 
			
		||||
 | 
			
		||||
To be written...
 | 
			
		||||
 | 
			
		||||
### Build setup for laptops, other compilers, non-cluster builds
 | 
			
		||||
 | 
			
		||||
Many versions of g++ and clang++ work with Grid, and involve merely replacing CXX (and MPICXX),
 | 
			
		||||
and omit the enable-mkl flag. 
 | 
			
		||||
 | 
			
		||||
Single node builds are enabled with 
 | 
			
		||||
```
 | 
			
		||||
            --enable-comms=none
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
FFTW support that is not in the default search path may then enabled with
 | 
			
		||||
```
 | 
			
		||||
    --with-fftw=<installpath>
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										18
									
								
								TODO
									
									
									
									
									
								
							
							
						
						
									
										18
									
								
								TODO
									
									
									
									
									
								
							@@ -2,18 +2,20 @@ TODO:
 | 
			
		||||
---------------
 | 
			
		||||
 | 
			
		||||
Large item work list:
 | 
			
		||||
1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
 | 
			
		||||
 | 
			
		||||
1)- BG/Q port and check ; Andrew says ok.
 | 
			
		||||
2)- Christoph's local basis expansion Lanczos
 | 
			
		||||
3)- BG/Q port and check
 | 
			
		||||
4)- Precision conversion and sort out localConvert      <-- partial
 | 
			
		||||
  - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
 | 
			
		||||
5)- Physical propagator interface
 | 
			
		||||
6)- Conserved currents
 | 
			
		||||
7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
 | 
			
		||||
8)- HDCR resume
 | 
			
		||||
--
 | 
			
		||||
3a)- RNG I/O in ILDG/SciDAC (minor)
 | 
			
		||||
3b)- Precision conversion and sort out localConvert      <-- partial/easy
 | 
			
		||||
3c)- Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
 | 
			
		||||
4)- Physical propagator interface
 | 
			
		||||
5)- Conserved currents
 | 
			
		||||
6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
 | 
			
		||||
7)- HDCR resume
 | 
			
		||||
 | 
			
		||||
Recent DONE 
 | 
			
		||||
-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O ; <-- DONE ; bmark cori
 | 
			
		||||
-- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 | 
			
		||||
-- GaugeFix into central location                      <-- DONE
 | 
			
		||||
-- Scidac and Ildg metadata handling                   <-- DONE
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										800
									
								
								benchmarks/Benchmark_ITT.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										800
									
								
								benchmarks/Benchmark_ITT.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,800 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./benchmarks/Benchmark_memory_bandwidth.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
using namespace std;
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Grid::QCD;
 | 
			
		||||
 | 
			
		||||
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 | 
			
		||||
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
 | 
			
		||||
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
std::vector<int> L_list;
 | 
			
		||||
std::vector<int> Ls_list;
 | 
			
		||||
std::vector<double> mflop_list;
 | 
			
		||||
 | 
			
		||||
double mflop_ref;
 | 
			
		||||
double mflop_ref_err;
 | 
			
		||||
 | 
			
		||||
int NN_global;
 | 
			
		||||
 | 
			
		||||
struct time_statistics{
 | 
			
		||||
  double mean;
 | 
			
		||||
  double err;
 | 
			
		||||
  double min;
 | 
			
		||||
  double max;
 | 
			
		||||
 | 
			
		||||
  void statistics(std::vector<double> v){
 | 
			
		||||
      double sum = std::accumulate(v.begin(), v.end(), 0.0);
 | 
			
		||||
      mean = sum / v.size();
 | 
			
		||||
 | 
			
		||||
      std::vector<double> diff(v.size());
 | 
			
		||||
      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
 | 
			
		||||
      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
 | 
			
		||||
      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));
 | 
			
		||||
 | 
			
		||||
      auto result = std::minmax_element(v.begin(), v.end());
 | 
			
		||||
      min = *result.first;
 | 
			
		||||
      max = *result.second;
 | 
			
		||||
}
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void comms_header(){
 | 
			
		||||
  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
 | 
			
		||||
            <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
Gamma::Algebra Gmu [] = {
 | 
			
		||||
  Gamma::Algebra::GammaX,
 | 
			
		||||
  Gamma::Algebra::GammaY,
 | 
			
		||||
  Gamma::Algebra::GammaZ,
 | 
			
		||||
  Gamma::Algebra::GammaT
 | 
			
		||||
};
 | 
			
		||||
struct controls {
 | 
			
		||||
  int Opt;
 | 
			
		||||
  int CommsOverlap;
 | 
			
		||||
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
 | 
			
		||||
  //  int HugePages;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class Benchmark {
 | 
			
		||||
public:
 | 
			
		||||
  static void Decomposition (void ) {
 | 
			
		||||
 | 
			
		||||
    int threads = GridThread::GetThreads();
 | 
			
		||||
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvReal          : "<<sizeof(vReal )*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvComplex       : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static void Comms(void)
 | 
			
		||||
  {
 | 
			
		||||
    int Nloop=200;
 | 
			
		||||
    int nmu=0;
 | 
			
		||||
    int maxlat=32;
 | 
			
		||||
 | 
			
		||||
    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
 | 
			
		||||
    std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
 | 
			
		||||
    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 | 
			
		||||
 | 
			
		||||
    std::vector<double> t_time(Nloop);
 | 
			
		||||
    time_statistics timestat;
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
    comms_header();
 | 
			
		||||
 | 
			
		||||
    for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
      for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
	std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
	      lat*mpi_layout[1],
 | 
			
		||||
	      lat*mpi_layout[2],
 | 
			
		||||
	      lat*mpi_layout[3]});
 | 
			
		||||
 | 
			
		||||
	GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
	RealD Nrank = Grid._Nprocessors;
 | 
			
		||||
	RealD Nnode = Grid.NodeCount();
 | 
			
		||||
	RealD ppn = Nrank/Nnode;
 | 
			
		||||
 | 
			
		||||
	std::vector<HalfSpinColourVectorD *> xbuf(8);
 | 
			
		||||
	std::vector<HalfSpinColourVectorD *> rbuf(8);
 | 
			
		||||
	Grid.ShmBufferFreeAll();
 | 
			
		||||
	for(int d=0;d<8;d++){
 | 
			
		||||
	  xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	  rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	  bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 | 
			
		||||
	int ncomm;
 | 
			
		||||
	double dbytes;
 | 
			
		||||
	std::vector<double> times(Nloop);
 | 
			
		||||
	for(int i=0;i<Nloop;i++){
 | 
			
		||||
 | 
			
		||||
	  double start=usecond();
 | 
			
		||||
 | 
			
		||||
	  dbytes=0;
 | 
			
		||||
	  ncomm=0;
 | 
			
		||||
 | 
			
		||||
	  parallel_for(int dir=0;dir<8;dir++){
 | 
			
		||||
 | 
			
		||||
	    double tbytes;
 | 
			
		||||
	    int mu =dir % 4;
 | 
			
		||||
 | 
			
		||||
	    if (mpi_layout[mu]>1 ) {
 | 
			
		||||
	        
 | 
			
		||||
	      int xmit_to_rank;
 | 
			
		||||
	      int recv_from_rank;
 | 
			
		||||
	      if ( dir == mu ) { 
 | 
			
		||||
		int comm_proc=1;
 | 
			
		||||
		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	      } else { 
 | 
			
		||||
		int comm_proc = mpi_layout[mu]-1;
 | 
			
		||||
		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	      }
 | 
			
		||||
	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 | 
			
		||||
						 (void *)&rbuf[dir][0], recv_from_rank,
 | 
			
		||||
						 bytes,dir);
 | 
			
		||||
	  
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#pragma omp atomic
 | 
			
		||||
#endif
 | 
			
		||||
	      ncomm++;
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_OMP
 | 
			
		||||
#pragma omp atomic
 | 
			
		||||
#endif
 | 
			
		||||
	      dbytes+=tbytes;
 | 
			
		||||
	    }
 | 
			
		||||
	  }
 | 
			
		||||
	  Grid.Barrier();
 | 
			
		||||
	  double stop=usecond();
 | 
			
		||||
	  t_time[i] = stop-start; // microseconds
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	timestat.statistics(t_time);
 | 
			
		||||
	//	for(int i=0;i<t_time.size();i++){
 | 
			
		||||
	//	  std::cout << i<<" "<<t_time[i]<<std::endl;
 | 
			
		||||
	//	}
 | 
			
		||||
 | 
			
		||||
	dbytes=dbytes*ppn;
 | 
			
		||||
	double xbytes    = dbytes*0.5;
 | 
			
		||||
	double rbytes    = dbytes*0.5;
 | 
			
		||||
	double bidibytes = dbytes;
 | 
			
		||||
 | 
			
		||||
	std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
 | 
			
		||||
		 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
 | 
			
		||||
		 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
 | 
			
		||||
		 <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
 | 
			
		||||
		 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 | 
			
		||||
		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
	
 | 
			
		||||
	    }
 | 
			
		||||
    }    
 | 
			
		||||
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  static void Memory(void)
 | 
			
		||||
  {
 | 
			
		||||
    const int Nvec=8;
 | 
			
		||||
    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
 | 
			
		||||
    typedef iVector<vReal,Nvec> Vec;
 | 
			
		||||
 | 
			
		||||
    std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
 | 
			
		||||
    std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<< "\t\tGB/s / node"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 | 
			
		||||
  
 | 
			
		||||
    uint64_t NP;
 | 
			
		||||
    uint64_t NN;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  uint64_t lmax=48;
 | 
			
		||||
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
 | 
			
		||||
 | 
			
		||||
    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
    for(int lat=8;lat<=lmax;lat+=4){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      NP= Grid.RankCount();
 | 
			
		||||
      NN =Grid.NodeCount();
 | 
			
		||||
 | 
			
		||||
      Vec rn ; random(sRNG,rn);
 | 
			
		||||
 | 
			
		||||
      LatticeVec z(&Grid); z=rn;
 | 
			
		||||
      LatticeVec x(&Grid); x=rn;
 | 
			
		||||
      LatticeVec y(&Grid); y=rn;
 | 
			
		||||
      double a=2.0;
 | 
			
		||||
 | 
			
		||||
      uint64_t Nloop=NLOOP;
 | 
			
		||||
 | 
			
		||||
      double start=usecond();
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
	z=a*x-y;
 | 
			
		||||
        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
 | 
			
		||||
        y._odata[4]=z._odata[4];
 | 
			
		||||
      }
 | 
			
		||||
      double stop=usecond();
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
     
 | 
			
		||||
      double flops=vol*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=3.0*vol*Nvec*sizeof(Real);
 | 
			
		||||
      std::cout<<GridLogMessage<<std::setprecision(3) 
 | 
			
		||||
	       << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.
 | 
			
		||||
	       << "\t\t"<< bytes/time/NN <<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static double DWF5(int Ls,int L)
 | 
			
		||||
  {
 | 
			
		||||
    RealD mass=0.1;
 | 
			
		||||
    RealD M5  =1.8;
 | 
			
		||||
 | 
			
		||||
    double mflops;
 | 
			
		||||
    double mflops_best = 0;
 | 
			
		||||
    double mflops_worst= 0;
 | 
			
		||||
    std::vector<double> mflops_all;
 | 
			
		||||
 | 
			
		||||
    ///////////////////////////////////////////////////////
 | 
			
		||||
    // Set/Get the layout & grid size
 | 
			
		||||
    ///////////////////////////////////////////////////////
 | 
			
		||||
    int threads = GridThread::GetThreads();
 | 
			
		||||
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
 | 
			
		||||
    std::vector<int> local({L,L,L,L});
 | 
			
		||||
 | 
			
		||||
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}), 
 | 
			
		||||
								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
    uint64_t NP = TmpGrid->RankCount();
 | 
			
		||||
    uint64_t NN = TmpGrid->NodeCount();
 | 
			
		||||
    NN_global=NN;
 | 
			
		||||
    uint64_t SHM=NP/NN;
 | 
			
		||||
 | 
			
		||||
    std::vector<int> internal;
 | 
			
		||||
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
 | 
			
		||||
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
 | 
			
		||||
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
 | 
			
		||||
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
 | 
			
		||||
    else assert(0);
 | 
			
		||||
 | 
			
		||||
    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
 | 
			
		||||
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
 | 
			
		||||
 | 
			
		||||
    ///////// Welcome message ////////////
 | 
			
		||||
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 | 
			
		||||
 | 
			
		||||
    ///////// Lattice Init ////////////
 | 
			
		||||
    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
 | 
			
		||||
    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
 | 
			
		||||
    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
 | 
			
		||||
    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
 | 
			
		||||
    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
 | 
			
		||||
    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
 | 
			
		||||
 | 
			
		||||
    ///////// RNG Init ////////////
 | 
			
		||||
    std::vector<int> seeds4({1,2,3,4});
 | 
			
		||||
    std::vector<int> seeds5({5,6,7,8});
 | 
			
		||||
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
 | 
			
		||||
    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
 | 
			
		||||
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
 | 
			
		||||
 | 
			
		||||
    ///////// Source preparation ////////////
 | 
			
		||||
    LatticeFermion src   (sFGrid); random(RNG5,src);
 | 
			
		||||
    LatticeFermion tmp   (sFGrid);
 | 
			
		||||
 | 
			
		||||
    RealD N2 = 1.0/::sqrt(norm2(src));
 | 
			
		||||
    src = src*N2;
 | 
			
		||||
    
 | 
			
		||||
    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
 | 
			
		||||
 | 
			
		||||
    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
 | 
			
		||||
    LatticeFermion src_e (sFrbGrid);
 | 
			
		||||
    LatticeFermion src_o (sFrbGrid);
 | 
			
		||||
    LatticeFermion r_e   (sFrbGrid);
 | 
			
		||||
    LatticeFermion r_o   (sFrbGrid);
 | 
			
		||||
    LatticeFermion r_eo  (sFGrid);
 | 
			
		||||
    LatticeFermion err   (sFGrid);
 | 
			
		||||
    {
 | 
			
		||||
 | 
			
		||||
      pickCheckerboard(Even,src_e,src);
 | 
			
		||||
      pickCheckerboard(Odd,src_o,src);
 | 
			
		||||
 | 
			
		||||
#if defined(AVX512) 
 | 
			
		||||
      const int num_cases = 6;
 | 
			
		||||
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 | 
			
		||||
#else
 | 
			
		||||
      const int num_cases = 4;
 | 
			
		||||
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
 | 
			
		||||
#endif
 | 
			
		||||
      controls Cases [] = {
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 | 
			
		||||
	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 | 
			
		||||
#endif
 | 
			
		||||
	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 | 
			
		||||
	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 | 
			
		||||
	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 | 
			
		||||
	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
 | 
			
		||||
      }; 
 | 
			
		||||
 | 
			
		||||
      for(int c=0;c<num_cases;c++) {

	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

	int nwarm = 100;
	uint64_t ncall = 1000;

	double t0=usecond();
	sFGrid->Barrier();
	for(int i=0;i<nwarm;i++){
	  sDw.DhopEO(src_o,r_e,DaggerNo);
	}
	sFGrid->Barrier();
	double t1=usecond();

	sDw.ZeroCounters();
	time_statistics timestat;
	std::vector<double> t_time(ncall);
	for(uint64_t i=0;i<ncall;i++){
	  t0=usecond();
	  sDw.DhopEO(src_o,r_e,DaggerNo);
	  t1=usecond();
	  t_time[i] = t1-t0;
	}
	sFGrid->Barrier();

	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
	double flops=(1344.0*volume)/2;
	double mf_hi, mf_lo, mf_err;

	timestat.statistics(t_time);
	mf_hi = flops/timestat.min;
	mf_lo = flops/timestat.max;
	mf_err= flops/timestat.min * timestat.err/timestat.mean;

	mflops = flops/timestat.mean;
	mflops_all.push_back(mflops);
	if ( mflops_best == 0   ) mflops_best = mflops;
	if ( mflops_worst== 0   ) mflops_worst= mflops;
	if ( mflops>mflops_best ) mflops_best = mflops;
	if ( mflops<mflops_worst) mflops_worst= mflops;

	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;

	sDw.Report();

      }
      double robust = mflops_worst/mflops_best;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;

      std::cout<<GridLogMessage <<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage;

      for(int i=0;i<mflops_all.size();i++){
	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
    return mflops_best;
  }

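  // Illustration only (helper not used by the benchmark): the Mflop/s figures above
  // follow directly from the per-call timings in microseconds. The count is 1344
  // flops per 4D site for one dslash application (the figure used in the code),
  // halved because DhopEO touches a single checkerboard, and flop per microsecond
  // is already Mflop/s, so no extra scale factor appears.
  static double MflopsPerCall(double Ls,const std::vector<int> &latt4,double t_usec)
  {
    double volume = Ls;
    for(int mu=0;mu<Nd;mu++) volume = volume*latt4[mu];   // global 5D volume
    double flops  = (1344.0*volume)/2.0;                  // one parity per DhopEO call
    return flops/t_usec;                                  // flop / usec == Mflop/s
  }
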
  static double DWF(int Ls,int L, double & robust)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});

    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}), 
								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);

    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});

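    // Worked example of the decomposition above (numbers invented for illustration):
    // 32 ranks on 4 nodes gives SHM = 8, so internal = {2,2,2,1}; with an assumed
    // mpi layout of {4,2,2,2} this makes nodes = {2,1,1,2} and, for L = 16,
    // latt4 = {32,16,16,32}. In other words the benchmark always runs an L^4
    // volume per node, independent of how many ranks share each node.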
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;


    ///////// Lattice Init ////////////
    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    ///////// Source preparation ////////////
    LatticeFermion src   (FGrid); random(RNG5,src);
    LatticeFermion ref   (FGrid);
    LatticeFermion tmp   (FGrid);

    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;

    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 

    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

    ////////////////////////////////////
    // Naive wilson implementation
    ////////////////////////////////////
    {
      LatticeGaugeField Umu5d(FGrid); 
      std::vector<LatticeColourMatrix> U(4,FGrid);
      for(int ss=0;ss<Umu._grid->oSites();ss++){
	for(int s=0;s<Ls;s++){
	  Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
	}
      }
      ref = zero;
      for(int mu=0;mu<Nd;mu++){
	U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
      }
      for(int mu=0;mu<Nd;mu++){

	tmp = U[mu]*Cshift(src,mu+1,1);
	ref=ref + tmp - Gamma(Gmu[mu])*tmp;

	tmp =adj(U[mu])*src;
	tmp =Cshift(tmp,mu+1,-1);
	ref=ref + tmp + Gamma(Gmu[mu])*tmp;
      }
      ref = -0.5*ref;
    }

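    // In equation form, the reference built above is the bare Wilson hopping term,
    // applied slice by slice in s with the 4D links replicated along Ls (the shifts
    // use mu+1 because direction 0 of the 5D field is the s coordinate):
    //
    //   ref(x) = -1/2 sum_mu [ (1 - gamma_mu) U_mu(x)      psi(x+mu)
    //                        + (1 + gamma_mu) U_mu^dag(x-mu) psi(x-mu) ]
    //
    // which is what Dhop should reproduce, up to the checkerboard decomposition
    // tested below.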
    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);
    LatticeFermion r_eo  (FGrid);
    LatticeFermion err   (FGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

#if defined(AVX512) 
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
      controls Cases [] = {
#ifdef AVX512
	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
#endif
	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      };

      for(int c=0;c<num_cases;c++) {

	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

	int nwarm = 200;
	double t0=usecond();
	FGrid->Barrier();
	for(int i=0;i<nwarm;i++){
	  Dw.DhopEO(src_o,r_e,DaggerNo);
	}
	FGrid->Barrier();
	double t1=usecond();
	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
	//	if (ncall < 500) ncall = 500;
	uint64_t ncall = 1000;

	FGrid->Broadcast(0,&ncall,sizeof(ncall));

	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
	Dw.ZeroCounters();

	time_statistics timestat;
	std::vector<double> t_time(ncall);
	for(uint64_t i=0;i<ncall;i++){
	  t0=usecond();
	  Dw.DhopEO(src_o,r_e,DaggerNo);
	  t1=usecond();
	  t_time[i] = t1-t0;
	}
	FGrid->Barrier();

	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
	double flops=(1344.0*volume)/2;
	double mf_hi, mf_lo, mf_err;

	timestat.statistics(t_time);
	mf_hi = flops/timestat.min;
	mf_lo = flops/timestat.max;
	mf_err= flops/timestat.min * timestat.err/timestat.mean;

	mflops = flops/timestat.mean;
	mflops_all.push_back(mflops);
	if ( mflops_best == 0   ) mflops_best = mflops;
	if ( mflops_worst== 0   ) mflops_worst= mflops;
	if ( mflops>mflops_best ) mflops_best = mflops;
	if ( mflops<mflops_worst) mflops_worst= mflops;

	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;

	Dw.Report();

	Dw.DhopEO(src_o,r_e,DaggerNo);
	Dw.DhopOE(src_e,r_o,DaggerNo);
	setCheckerboard(r_eo,r_o);
	setCheckerboard(r_eo,r_e);
	err = r_eo-ref; 
	std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
	assert((norm2(err)<1.0e-4));

      }
      robust = mflops_worst/mflops_best;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << std::fixed<<std::setprecision(3)<< L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< robust  <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;

      for(int i=0;i<mflops_all.size();i++){
	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
    return mflops_best;
  }

};

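// The even/odd kernels timed by DWF() and DWF5() are the off-diagonal blocks of the
// full hopping term in checkerboard (red-black) layout,
//
//   Dhop [ psi_e ]   [  0   Deo ] [ psi_e ]   [ Deo psi_o ]
//        [ psi_o ] = [ Doe   0  ] [ psi_o ] = [ Doe psi_e ] ,
//
// which is why the correctness check above applies DhopEO to the odd source and
// DhopOE to the even source, reassembles the two halves with setCheckerboard, and
// compares against the naive 'ref' field to better than 1e-4.
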
int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
#else
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
#endif
  Benchmark::Decomposition();

  int do_memory=1;
  int do_comms =1;
  int do_su3   =0;
  int do_wilson=1;
  int do_dwf   =1;

  if ( do_su3 ) {
    // empty for now
  }
#if 1
  int sel=2;
  std::vector<int> L_list({8,12,16,24});
#else
  int sel=1;
  std::vector<int> L_list({8,12});
#endif
  int selm1=sel-1;
  std::vector<double> robust_list;

  std::vector<double> wilson;
  std::vector<double> dwf4;
  std::vector<double> dwf5;

  if ( do_wilson ) {
    int Ls=1;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      double robust;
      wilson.push_back(Benchmark::DWF(1,L_list[l],robust));
    }
  }

  int Ls=16;
  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      double robust;
      double result = Benchmark::DWF(Ls,L_list[l],robust) ;
      dwf4.push_back(result);
      robust_list.push_back(robust);
    }
  }

  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
    }

  }

  if ( do_dwf ) {

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
  for(int l=0;l<L_list.size();l++){
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }

  int NN=NN_global;
  if ( do_memory ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Memory();
  }

  if ( do_comms && (NN>1) ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Comms();
  }

  if ( do_dwf ) {
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl;
  for(int l=0;l<L_list.size();l++){
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Comparison point     result: "  << 0.5*(dwf4[sel]+dwf4[selm1])/NN << " Mflop/s per node"<<std::endl;
  std::cout<<GridLogMessage << " Comparison point is 0.5*("<<dwf4[sel]/NN<<"+"<<dwf4[selm1]/NN << ") "<<std::endl;
  std::cout<<std::setprecision(3);
  std::cout<<GridLogMessage << " Comparison point robustness: "  << robust_list[sel] <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  }


  Grid_finalize();
}

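// Illustration only (helper not used above): the "comparison point" printed at the
// end averages two entries of the per-node DWF4 table; with sel=2 and
// L_list={8,12,16,24} it is the mean of the 16^4 and 12^4 per-node figures.
static double ComparisonPoint(const std::vector<double> &dwf4,int sel,double NN)
{
  return 0.5*(dwf4[sel]+dwf4[sel-1])/NN;   // Mflop/s per node
}
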
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
 | 
			
		||||
  int Nloop=100;
 | 
			
		||||
  int nmu=0;
 | 
			
		||||
  int maxlat=24;
 | 
			
		||||
  int maxlat=32;
 | 
			
		||||
  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
 | 
			
		||||
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  header();
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=32;Ls*=2){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
      				    lat*mpi_layout[1],
 | 
			
		||||
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
 | 
			
		||||
      RealD Nnode = Grid.NodeCount();
 | 
			
		||||
      RealD ppn = Nrank/Nnode;
 | 
			
		||||
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	
 | 
			
		||||
      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
 | 
			
		||||
 | 
			
		||||
      int ncomm;
 | 
			
		||||
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 | 
			
		||||
      for(int mu=0;mu<8;mu++){
 | 
			
		||||
	xbuf[mu].resize(lat*lat*lat*Ls);
 | 
			
		||||
	rbuf[mu].resize(lat*lat*lat*Ls);
 | 
			
		||||
	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
      double start=usecond();
 | 
			
		||||
@@ -112,7 +117,6 @@ int main (int argc, char ** argv)
 | 
			
		||||
	    int comm_proc=1;
 | 
			
		||||
	    int xmit_to_rank;
 | 
			
		||||
	    int recv_from_rank;
 | 
			
		||||
	    
 | 
			
		||||
	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	    Grid.SendToRecvFromBegin(requests,
 | 
			
		||||
				   (void *)&xbuf[mu][0],
 | 
			
		||||
@@ -163,7 +167,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=32;Ls*=2){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat,lat,lat,lat});
 | 
			
		||||
 | 
			
		||||
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
 | 
			
		||||
      RealD Nnode = Grid.NodeCount();
 | 
			
		||||
      RealD ppn = Nrank/Nnode;
 | 
			
		||||
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
 | 
			
		||||
      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
 | 
			
		||||
      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
 | 
			
		||||
 | 
			
		||||
      for(int mu=0;mu<8;mu++){
 | 
			
		||||
	xbuf[mu].resize(lat*lat*lat*Ls);
 | 
			
		||||
	rbuf[mu].resize(lat*lat*lat*Ls);
 | 
			
		||||
	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      int ncomm;
 | 
			
		||||
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 | 
			
		||||
@@ -249,7 +258,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=32;Ls*=2){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
      				    lat*mpi_layout[1],
 | 
			
		||||
@@ -299,7 +308,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
					      xmit_to_rank,
 | 
			
		||||
					      (void *)&rbuf[mu][0],
 | 
			
		||||
					      recv_from_rank,
 | 
			
		||||
					      bytes);
 | 
			
		||||
					      bytes,mu);
 | 
			
		||||
	
 | 
			
		||||
	    comm_proc = mpi_layout[mu]-1;
 | 
			
		||||
	  
 | 
			
		||||
@@ -310,11 +319,11 @@ int main (int argc, char ** argv)
 | 
			
		||||
					      xmit_to_rank,
 | 
			
		||||
					      (void *)&rbuf[mu+4][0],
 | 
			
		||||
					      recv_from_rank,
 | 
			
		||||
					      bytes);
 | 
			
		||||
					      bytes,mu+4);
 | 
			
		||||
	  
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
	Grid.StencilSendToRecvFromComplete(requests);
 | 
			
		||||
	Grid.StencilSendToRecvFromComplete(requests,0);
 | 
			
		||||
	Grid.Barrier();
 | 
			
		||||
	double stop=usecond();
 | 
			
		||||
	t_time[i] = stop-start; // microseconds
 | 
			
		||||
@@ -346,7 +355,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=32;Ls*=2){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
      				    lat*mpi_layout[1],
 | 
			
		||||
@@ -393,8 +402,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
					      xmit_to_rank,
 | 
			
		||||
					      (void *)&rbuf[mu][0],
 | 
			
		||||
					      recv_from_rank,
 | 
			
		||||
					      bytes);
 | 
			
		||||
	    Grid.StencilSendToRecvFromComplete(requests);
 | 
			
		||||
					      bytes,mu);
 | 
			
		||||
	    Grid.StencilSendToRecvFromComplete(requests,mu);
 | 
			
		||||
	    requests.resize(0);
 | 
			
		||||
 | 
			
		||||
	    comm_proc = mpi_layout[mu]-1;
 | 
			
		||||
@@ -406,8 +415,8 @@ int main (int argc, char ** argv)
 | 
			
		||||
					      xmit_to_rank,
 | 
			
		||||
					      (void *)&rbuf[mu+4][0],
 | 
			
		||||
					      recv_from_rank,
 | 
			
		||||
					      bytes);
 | 
			
		||||
	    Grid.StencilSendToRecvFromComplete(requests);
 | 
			
		||||
					      bytes,mu+4);
 | 
			
		||||
	    Grid.StencilSendToRecvFromComplete(requests,mu+4);
 | 
			
		||||
	    requests.resize(0);
 | 
			
		||||
	  
 | 
			
		||||
	  }
 | 
			
		||||
@@ -436,5 +445,97 @@ int main (int argc, char ** argv)
 | 
			
		||||
    }
 | 
			
		||||
  }    
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  header();
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=maxlat;lat+=4){
 | 
			
		||||
    for(int Ls=8;Ls<=8;Ls*=2){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],
 | 
			
		||||
      				    lat*mpi_layout[1],
 | 
			
		||||
      				    lat*mpi_layout[2],
 | 
			
		||||
      				    lat*mpi_layout[3]});
 | 
			
		||||
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
      RealD Nrank = Grid._Nprocessors;
 | 
			
		||||
      RealD Nnode = Grid.NodeCount();
 | 
			
		||||
      RealD ppn = Nrank/Nnode;
 | 
			
		||||
 | 
			
		||||
      std::vector<HalfSpinColourVectorD *> xbuf(8);
 | 
			
		||||
      std::vector<HalfSpinColourVectorD *> rbuf(8);
 | 
			
		||||
      Grid.ShmBufferFreeAll();
 | 
			
		||||
      for(int d=0;d<8;d++){
 | 
			
		||||
	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      int ncomm;
 | 
			
		||||
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 | 
			
		||||
      double dbytes;
 | 
			
		||||
      for(int i=0;i<Nloop;i++){
 | 
			
		||||
	double start=usecond();
 | 
			
		||||
 | 
			
		||||
	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 | 
			
		||||
	dbytes=0;
 | 
			
		||||
	ncomm=0;
 | 
			
		||||
 | 
			
		||||
	parallel_for(int dir=0;dir<8;dir++){
 | 
			
		||||
 | 
			
		||||
	  double tbytes;
 | 
			
		||||
	  int mu =dir % 4;
 | 
			
		||||
 | 
			
		||||
	  if (mpi_layout[mu]>1 ) {
 | 
			
		||||
	  
 | 
			
		||||
	    ncomm++;
 | 
			
		||||
	    int xmit_to_rank;
 | 
			
		||||
	    int recv_from_rank;
 | 
			
		||||
	    if ( dir == mu ) { 
 | 
			
		||||
	      int comm_proc=1;
 | 
			
		||||
	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	    } else { 
 | 
			
		||||
	      int comm_proc = mpi_layout[mu]-1;
 | 
			
		||||
	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 | 
			
		||||
	    }
 | 
			
		||||
 | 
			
		||||
	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 | 
			
		||||
					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
 | 
			
		||||
 | 
			
		||||
#pragma omp atomic
 | 
			
		||||
	    dbytes+=tbytes;
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
	Grid.Barrier();
 | 
			
		||||
	double stop=usecond();
 | 
			
		||||
	t_time[i] = stop-start; // microseconds
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      timestat.statistics(t_time);
 | 
			
		||||
 | 
			
		||||
      dbytes=dbytes*ppn;
 | 
			
		||||
      double xbytes    = dbytes*0.5;
 | 
			
		||||
      double rbytes    = dbytes*0.5;
 | 
			
		||||
      double bidibytes = dbytes;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
 | 
			
		||||
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
 | 
			
		||||
               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
 | 
			
		||||
               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
 | 
			
		||||
               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
 | 
			
		||||
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
 | 
			
		||||
 
 | 
			
		||||
    }
 | 
			
		||||
  }    
 | 
			
		||||
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
@@ -51,7 +51,13 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
  const int Ls=16;
  int Ls=16;
  for(int i=0;i<argc;i++)
    if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
    }


  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
@@ -165,7 +171,7 @@ int main (int argc, char ** argv)
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  int ncall =1000;
  int ncall =500;
  if (1) {
    FGrid->Barrier();
    Dw.ZeroCounters();
@@ -302,6 +308,7 @@ int main (int argc, char ** argv)
      std::cout<< "sD ERR   \n " << err  <<std::endl;
    }
    assert(sum < 1.0e-4);

    if(1){
      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
@@ -381,8 +388,23 @@ int main (int argc, char ** argv)
      }
      assert(error<1.0e-4);
    }

  if(0){
    std::cout << "Single cache warm call to sDw.Dhop " <<std::endl;
    for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
      sDw.Dhop(ssrc,sresult,0);
      PerformanceCounter Counter(i);
      Counter.Start();
      sDw.Dhop(ssrc,sresult,0);
      Counter.Stop();
      Counter.Report();
    }
  }

  }


  if (1)
  { // Naive wilson dag implementation
    ref = zero;
@@ -487,9 +509,9 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;

  //assert(norm2(src_e)<1.0e-4);
  //assert(norm2(src_o)<1.0e-4);

  assert(norm2(src_e)<1.0e-4);
  assert(norm2(src_o)<1.0e-4);
  Grid_finalize();
  exit(0);
}

benchmarks/Benchmark_gparity.cc (new file, 190 lines)
@@ -0,0 +1,190 @@
#include <Grid/Grid.h>
#include <sstream>
using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template<class d>
struct scal {
  d internal;
};

  Gamma::Algebra Gmu [] = {
    Gamma::Algebra::GammaX,
    Gamma::Algebra::GammaY,
    Gamma::Algebra::GammaZ,
    Gamma::Algebra::GammaT
  };

typedef typename GparityDomainWallFermionF::FermionField GparityLatticeFermionF;
typedef typename GparityDomainWallFermionD::FermionField GparityLatticeFermionD;


int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  int Ls=16;
  for(int i=0;i<argc;i++)
    if(std::string(argv[i]) == "-Ls"){
      std::stringstream ss(argv[i+1]); ss >> Ls;
    }


  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
  std::cout<<GridLogMessage << "Ls = " << Ls << std::endl;

  std::vector<int> latt4 = GridDefaultLatt();

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

  GparityLatticeFermionF src   (FGrid); random(RNG5,src);
  RealD N2 = 1.0/::sqrt(norm2(src));
  src = src*N2;

  GparityLatticeFermionF result(FGrid); result=zero;
  GparityLatticeFermionF    ref(FGrid);    ref=zero;
  GparityLatticeFermionF    tmp(FGrid);
  GparityLatticeFermionF    err(FGrid);

  std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
  LatticeGaugeFieldF Umu(UGrid); 
  SU3::HotConfiguration(RNG4,Umu); 
  std::cout << GridLogMessage << "Random gauge initialised " << std::endl;

  RealD mass=0.1;
  RealD M5  =1.8;

  RealD NP = UGrid->_Nprocessors;
  RealD NN = UGrid->NodeCount();

  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::Dhop                  "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
#ifdef GRID_OMP
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
  if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;


  std::cout << GridLogMessage<< "* SINGLE/SINGLE"<<std::endl;
  GparityDomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  int ncall =1000;
  if (1) {
    FGrid->Barrier();
    Dw.ZeroCounters();
    Dw.Dhop(src,result,0);
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
      Dw.Dhop(src,result,0);
      __SSC_STOP;
    }
    double t1=usecond();
    FGrid->Barrier();

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=2*1344*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
    Dw.Report();
  }

  std::cout << GridLogMessage<< "* SINGLE/HALF"<<std::endl;
  GparityDomainWallFermionFH DwH(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
  if (1) {
    FGrid->Barrier();
    DwH.ZeroCounters();
    DwH.Dhop(src,result,0);
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
      DwH.Dhop(src,result,0);
      __SSC_STOP;
    }
    double t1=usecond();
    FGrid->Barrier();

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=2*1344*volume*ncall;

    std::cout<<GridLogMessage << "Called half prec comms Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
    DwH.Report();
  }

  GridCartesian         * UGrid_d   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid_d = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_d);
  GridCartesian         * FGrid_d   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_d);
  GridRedBlackCartesian * FrbGrid_d = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_d);

  std::cout << GridLogMessage<< "* DOUBLE/DOUBLE"<<std::endl;
  GparityLatticeFermionD src_d(FGrid_d);
  precisionChange(src_d,src);

  LatticeGaugeFieldD Umu_d(UGrid_d); 
  precisionChange(Umu_d,Umu);

  GparityLatticeFermionD result_d(FGrid_d);

  GparityDomainWallFermionD DwD(Umu_d,*FGrid_d,*FrbGrid_d,*UGrid_d,*UrbGrid_d,mass,M5);
  if (1) {
    FGrid_d->Barrier();
    DwD.ZeroCounters();
    DwD.Dhop(src_d,result_d,0);
    std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
    double t0=usecond();
    for(int i=0;i<ncall;i++){
      __SSC_START;
      DwD.Dhop(src_d,result_d,0);
      __SSC_STOP;
    }
    double t1=usecond();
    FGrid_d->Barrier();

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=2*1344*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    //    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    //    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NN<<std::endl;
    DwD.Report();
  }

  Grid_finalize();
}

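// Note on the flop count used throughout this file: flops = 2*1344*volume*ncall,
// i.e. the usual 1344 flops per 4D site for a Wilson dslash, doubled (presumably
// for the two-flavour G-parity doublet) and summed over all ncall applications;
// dividing by (t1-t0) in microseconds therefore gives Mflop/s directly. A small
// standalone sketch of that arithmetic (helper not used by the benchmark):
static double GparityMflops(double Ls,const std::vector<int> &latt4,int ncall,double t_usec)
{
  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
  return 2.0*1344.0*volume*ncall/t_usec;   // flop per microsecond == Mflop/s
}
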
@@ -55,21 +55,21 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 | 
			
		||||
  uint64_t lmax=64;
 | 
			
		||||
#define NLOOP (100*lmax*lmax*lmax*lmax/vol)
 | 
			
		||||
  for(int lat=4;lat<=lmax;lat+=4){
 | 
			
		||||
  uint64_t lmax=96;
 | 
			
		||||
#define NLOOP (10*lmax*lmax*lmax*lmax/vol)
 | 
			
		||||
  for(int lat=8;lat<=lmax;lat+=8){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      uint64_t Nloop=NLOOP;
 | 
			
		||||
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
 | 
			
		||||
      LatticeVec z(&Grid); //random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid); //random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid); //random(pRNG,y);
 | 
			
		||||
      LatticeVec z(&Grid);// random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid);// random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid);// random(pRNG,y);
 | 
			
		||||
      double a=2.0;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -83,7 +83,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
      
 | 
			
		||||
      double flops=vol*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=3*vol*Nvec*sizeof(Real);
 | 
			
		||||
      double bytes=3.0*vol*Nvec*sizeof(Real);
 | 
			
		||||
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -94,17 +94,17 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
 | 
			
		||||
  
 | 
			
		||||
  for(int lat=4;lat<=lmax;lat+=4){
 | 
			
		||||
  for(int lat=8;lat<=lmax;lat+=8){
 | 
			
		||||
 | 
			
		||||
      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
 | 
			
		||||
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
 | 
			
		||||
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
 | 
			
		||||
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
 | 
			
		||||
 | 
			
		||||
      LatticeVec z(&Grid); //random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid); //random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid); //random(pRNG,y);
 | 
			
		||||
      LatticeVec z(&Grid);// random(pRNG,z);
 | 
			
		||||
      LatticeVec x(&Grid);// random(pRNG,x);
 | 
			
		||||
      LatticeVec y(&Grid);// random(pRNG,y);
 | 
			
		||||
      double a=2.0;
 | 
			
		||||
 | 
			
		||||
      uint64_t Nloop=NLOOP;
 | 
			
		||||
@@ -119,7 +119,7 @@ int main (int argc, char ** argv)
 | 
			
		||||
      double time = (stop-start)/Nloop*1000;
 | 
			
		||||
     
 | 
			
		||||
      double flops=vol*Nvec*2;// mul,add
 | 
			
		||||
      double bytes=3*vol*Nvec*sizeof(Real);
 | 
			
		||||
      double bytes=3.0*vol*Nvec*sizeof(Real);
 | 
			
		||||
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
@@ -129,20 +129,20 @@ int main (int argc, char ** argv)
 | 
			
		||||
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
 | 
			
		||||
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int lat=4;lat<=lmax;lat+=4){
 | 
			
		||||
  for(int lat=8;lat<=lmax;lat+=8){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      uint64_t Nloop=NLOOP;

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
      LatticeVec y(&Grid); //random(pRNG,y);
      LatticeVec z(&Grid);// random(pRNG,z);
      LatticeVec x(&Grid);// random(pRNG,x);
      LatticeVec y(&Grid);// random(pRNG,y);
      RealD a=2.0;

@@ -154,7 +154,7 @@ int main (int argc, char ** argv)
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double bytes=2*vol*Nvec*sizeof(Real);
      double bytes=2.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*1;// mul
      std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

@@ -166,17 +166,17 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
  std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

  for(int lat=4;lat<=lmax;lat+=4){
  for(int lat=8;lat<=lmax;lat+=8){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      uint64_t Nloop=NLOOP;
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
      LatticeVec z(&Grid); //random(pRNG,z);
      LatticeVec x(&Grid); //random(pRNG,x);
      LatticeVec y(&Grid); //random(pRNG,y);
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
      LatticeVec z(&Grid);// random(pRNG,z);
      LatticeVec x(&Grid);// random(pRNG,x);
      LatticeVec y(&Grid);// random(pRNG,y);
      RealD a=2.0;
      Real nn;
      double start=usecond();

@@ -187,7 +187,7 @@ int main (int argc, char ** argv)
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double bytes=vol*Nvec*sizeof(Real);
      double bytes=1.0*vol*Nvec*sizeof(Real);
      double flops=vol*Nvec*2;// mul,add
      std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl;

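A note on units in the bandwidth loops above (an editorial gloss, not text from the diff): usecond() reports wall-clock time in microseconds, so time = (stop-start)/Nloop*1000 is the per-iteration time in nanoseconds, and the printed ratios land directly in the advertised units,

    bytes/time  [bytes per ns]  = GB/s
    flops/time  [flops per ns]  = Gflop/s

which is why the header printed in the second hunk labels the columns "GB/s" and "Gflop/s".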
@@ -40,7 +40,7 @@ int main (int argc, char ** argv)
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

@@ -37,12 +37,12 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);
#define LMAX (64)

  int Nloop=20;
  int64_t Nloop=20;

  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();

  int threads = GridThread::GetThreads();
  int64_t threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;

@@ -54,16 +54,16 @@ int main (int argc, char ** argv)
  for(int lat=2;lat<=LMAX;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

      LatticeColourMatrix z(&Grid);// random(pRNG,z);
      LatticeColourMatrix x(&Grid);// random(pRNG,x);
      LatticeColourMatrix y(&Grid);// random(pRNG,y);
      LatticeColourMatrix z(&Grid); random(pRNG,z);
      LatticeColourMatrix x(&Grid); random(pRNG,x);
      LatticeColourMatrix y(&Grid); random(pRNG,y);

      double start=usecond();
      for(int i=0;i<Nloop;i++){
      for(int64_t i=0;i<Nloop;i++){
	x=x*y;
      }
      double stop=usecond();

@@ -86,17 +86,17 @@ int main (int argc, char ** argv)
  for(int lat=2;lat<=LMAX;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

      LatticeColourMatrix z(&Grid); //random(pRNG,z);
      LatticeColourMatrix x(&Grid); //random(pRNG,x);
      LatticeColourMatrix y(&Grid); //random(pRNG,y);
      LatticeColourMatrix z(&Grid); random(pRNG,z);
      LatticeColourMatrix x(&Grid); random(pRNG,x);
      LatticeColourMatrix y(&Grid); random(pRNG,y);

      double start=usecond();
      for(int i=0;i<Nloop;i++){
      for(int64_t i=0;i<Nloop;i++){
	z=x*y;
      }
      double stop=usecond();

@@ -117,17 +117,17 @@ int main (int argc, char ** argv)
  for(int lat=2;lat<=LMAX;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

      LatticeColourMatrix z(&Grid); //random(pRNG,z);
      LatticeColourMatrix x(&Grid); //random(pRNG,x);
      LatticeColourMatrix y(&Grid); //random(pRNG,y);
      LatticeColourMatrix z(&Grid); random(pRNG,z);
      LatticeColourMatrix x(&Grid); random(pRNG,x);
      LatticeColourMatrix y(&Grid); random(pRNG,y);

      double start=usecond();
      for(int i=0;i<Nloop;i++){
      for(int64_t i=0;i<Nloop;i++){
	mult(z,x,y);
      }
      double stop=usecond();

@@ -148,17 +148,17 @@ int main (int argc, char ** argv)
  for(int lat=2;lat<=LMAX;lat+=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9});
      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));

      LatticeColourMatrix z(&Grid); //random(pRNG,z);
      LatticeColourMatrix x(&Grid); //random(pRNG,x);
      LatticeColourMatrix y(&Grid); //random(pRNG,y);
      LatticeColourMatrix z(&Grid); random(pRNG,z);
      LatticeColourMatrix x(&Grid); random(pRNG,x);
      LatticeColourMatrix y(&Grid); random(pRNG,y);

      double start=usecond();
      for(int i=0;i<Nloop;i++){
      for(int64_t i=0;i<Nloop;i++){
	mac(z,x,y);
      }
      double stop=usecond();

@@ -58,7 +58,7 @@ int main (int argc, char ** argv)
  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian               Grid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(latt_size,simd_layout,mpi_layout);
  GridRedBlackCartesian     RBGrid(&Grid);

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

@@ -93,7 +93,7 @@ int main (int argc, char ** argv)
	  std::cout << latt_size.back() << "\t\t";

	  GridCartesian           Grid(latt_size,simd_layout,mpi_layout);
	  GridRedBlackCartesian RBGrid(latt_size,simd_layout,mpi_layout);
	  GridRedBlackCartesian RBGrid(&Grid);

	  GridParallelRNG  pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
	  LatticeGaugeField Umu(&Grid); random(pRNG,Umu);

67  configure.ac
@@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
################ Get git info
#AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])

################ Set flags
# do not move!
CXXFLAGS="-O3 $CXXFLAGS"

############### Checks for programs
AC_PROG_CXX
AC_PROG_RANLIB

@@ -27,7 +31,6 @@ AX_GXX_VERSION
AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
      [version of g++ that will compile the code])

CXXFLAGS="-g $CXXFLAGS"

############### Checks for typedefs, structures, and compiler characteristics

@@ -51,9 +54,14 @@ AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(numaif.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])

############## Standard libraries
AC_CHECK_LIB([m],[cos])
AC_CHECK_LIB([stdc++],[abort])

############### GMP and MPFR
AC_ARG_WITH([gmp],
    [AS_HELP_STRING([--with-gmp=prefix],

@@ -186,9 +194,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])

AC_SEARCH_LIBS([crc32], [z],
               [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
               [have_zlib=true],
               [have_zlib=true] [LIBS="${LIBS} -lz"],
	       [AC_MSG_ERROR(zlib library was not found in your system.)])

AC_SEARCH_LIBS([move_pages], [numa],
               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
	       [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])

AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
               [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
               [have_hdf5=true]

@@ -241,6 +254,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        SIMD_FLAGS='';;
      KNL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
        AC_DEFINE([KNL],[1],[Knights landing processor])
        SIMD_FLAGS='-march=knl';;
      GEN)
        AC_DEFINE([GEN],[1],[generic vector code])

@@ -248,6 +262,9 @@ case ${ax_cv_cxx_compiler_vendor} in
                           [generic SIMD vector width (in bytes)])
        SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)"
        SIMD_FLAGS='';;
      NEONv8)
        AC_DEFINE([NEONV8],[1],[ARMv8 NEON])
        SIMD_FLAGS='-march=armv8-a';;
      QPX|BGQ)
        AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q])
        SIMD_FLAGS='';;

@@ -276,6 +293,7 @@ case ${ax_cv_cxx_compiler_vendor} in
        SIMD_FLAGS='';;
      KNL)
        AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing])
        AC_DEFINE([KNL],[1],[Knights landing processor])
        SIMD_FLAGS='-xmic-avx512';;
      GEN)
        AC_DEFINE([GEN],[1],[generic vector code])

@@ -313,8 +331,41 @@ case ${ac_PRECISION} in
     double)
       AC_DEFINE([GRID_DEFAULT_PRECISION_DOUBLE],[1],[GRID_DEFAULT_PRECISION is DOUBLE] )
     ;;
     *)
     AC_MSG_ERROR([${ac_PRECISION} unsupported --enable-precision option]);
     ;;
esac

######################  Shared memory allocation technique under MPI3
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmget|shmopen|hugetlbfs],
              [Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])

case ${ac_SHM} in

     shmget)
     AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
     ;;

     shmopen)
     AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
     ;;

     hugetlbfs)
     AC_DEFINE([GRID_MPI3_SHMMMAP],[1],[GRID_MPI3_SHMMMAP] )
     ;;

     *)
     AC_MSG_ERROR([${ac_SHM} unsupported --enable-shm option]);
     ;;
esac

######################  Shared base path for SHMMMAP
AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
              [Select SHM mmap base path for hugetlbfs])],
	      [ac_SHMPATH=${enable_shmpath}],
	      [ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])

############### communication type selection
AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi3-auto|shmem],
              [Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

@@ -324,14 +375,14 @@ case ${ac_COMMS} in
        AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
        comms_type='none'
     ;;
     mpi3l*)
       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
       comms_type='mpi3l'
     ;;
     mpi3*)
        AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
        comms_type='mpi3'
     ;;
     mpit)
        AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
        comms_type='mpit'
     ;;
     mpi*)
        AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
        comms_type='mpi'

@@ -359,7 +410,7 @@ esac
AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] )
AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] )
AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])

############### RNG selection

@@ -464,6 +515,8 @@ compiler version            : ${ax_cv_gxx_version}
SIMD                        : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading                   : ${ac_openmp}
Communications type         : ${comms_type}
Shared memory allocator     : ${ac_SHM}
Shared memory mmap path     : ${ac_SHMPATH}
Default precision           : ${ac_PRECISION}
Software FP16 conversion    : ${ac_SFW_FP16}
RNG choice                  : ${ac_RNG}

@@ -41,9 +41,10 @@ using namespace Hadrons;
// constructor /////////////////////////////////////////////////////////////////
Environment::Environment(void)
{
    nd_ = GridDefaultLatt().size();
    dim_ = GridDefaultLatt();
    nd_  = dim_.size();
    grid4d_.reset(SpaceTimeGrid::makeFourDimGrid(
        GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()),
        dim_, GridDefaultSimd(nd_, vComplex::Nsimd()),
        GridDefaultMpi()));
    gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get()));
    auto loc = getGrid()->LocalDimensions();

@@ -132,6 +133,16 @@ unsigned int Environment::getNd(void) const
    return nd_;
}

std::vector<int> Environment::getDim(void) const
{
    return dim_;
}

int Environment::getDim(const unsigned int mu) const
{
    return dim_[mu];
}

// random number generator /////////////////////////////////////////////////////
void Environment::setSeed(const std::vector<int> &seed)
{

@@ -271,6 +282,21 @@ std::string Environment::getModuleType(const std::string name) const
    return getModuleType(getModuleAddress(name));
}

std::string Environment::getModuleNamespace(const unsigned int address) const
{
    std::string type = getModuleType(address), ns;

    auto pos2 = type.rfind("::");
    auto pos1 = type.rfind("::", pos2 - 2);

    return type.substr(pos1 + 2, pos2 - pos1 - 2);
}

std::string Environment::getModuleNamespace(const std::string name) const
{
    return getModuleNamespace(getModuleAddress(name));
}

bool Environment::hasModule(const unsigned int address) const
{
    return (address < module_.size());

@@ -492,7 +518,14 @@ std::string Environment::getObjectType(const unsigned int address) const
{
    if (hasRegisteredObject(address))
    {
        return typeName(object_[address].type);
        if (object_[address].type)
        {
            return typeName(object_[address].type);
        }
        else
        {
            return "<no type>";
        }
    }
    else if (hasObject(address))
    {

@@ -532,6 +565,23 @@ Environment::Size Environment::getObjectSize(const std::string name) const
    return getObjectSize(getObjectAddress(name));
}

unsigned int Environment::getObjectModule(const unsigned int address) const
{
    if (hasObject(address))
    {
        return object_[address].module;
    }
    else
    {
        HADRON_ERROR("no object with address " + std::to_string(address));
    }
}

unsigned int Environment::getObjectModule(const std::string name) const
{
    return getObjectModule(getObjectAddress(name));
}

unsigned int Environment::getObjectLs(const unsigned int address) const
{
    if (hasRegisteredObject(address))

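A minimal sketch of the string handling inside the new Environment::getModuleNamespace() above, assuming module types are fully qualified names such as "Hadrons::MSink::Point" (the example value is illustrative, not taken from the diff):

    std::string type = "Hadrons::MSink::Point";
    auto pos2 = type.rfind("::");            // last "::", just before "Point"
    auto pos1 = type.rfind("::", pos2 - 2);  // preceding "::", just before "MSink"
    std::string ns = type.substr(pos1 + 2, pos2 - pos1 - 2);  // ns == "MSink"

The Meson module further down relies on exactly this to decide whether par().sink names an MSource or an MSink object.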
@@ -106,6 +106,8 @@ public:
    void                    createGrid(const unsigned int Ls);
    GridCartesian *         getGrid(const unsigned int Ls = 1) const;
    GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const;
    std::vector<int>        getDim(void) const;
    int                     getDim(const unsigned int mu) const;
    unsigned int            getNd(void) const;
    // random number generator
    void                    setSeed(const std::vector<int> &seed);

@@ -131,6 +133,8 @@ public:
    std::string             getModuleName(const unsigned int address) const;
    std::string             getModuleType(const unsigned int address) const;
    std::string             getModuleType(const std::string name) const;
    std::string             getModuleNamespace(const unsigned int address) const;
    std::string             getModuleNamespace(const std::string name) const;
    bool                    hasModule(const unsigned int address) const;
    bool                    hasModule(const std::string name) const;
    Graph<unsigned int>     makeModuleGraph(void) const;

@@ -171,6 +175,8 @@ public:
    std::string             getObjectType(const std::string name) const;
    Size                    getObjectSize(const unsigned int address) const;
    Size                    getObjectSize(const std::string name) const;
    unsigned int            getObjectModule(const unsigned int address) const;
    unsigned int            getObjectModule(const std::string name) const;
    unsigned int            getObjectLs(const unsigned int address) const;
    unsigned int            getObjectLs(const std::string name) const;
    bool                    hasObject(const unsigned int address) const;

@@ -181,6 +187,10 @@ public:
    bool                    hasCreatedObject(const std::string name) const;
    bool                    isObject5d(const unsigned int address) const;
    bool                    isObject5d(const std::string name) const;
    template <typename T>
    bool                    isObjectOfType(const unsigned int address) const;
    template <typename T>
    bool                    isObjectOfType(const std::string name) const;
    Environment::Size       getTotalSize(void) const;
    void                    addOwnership(const unsigned int owner,
                                         const unsigned int property);

@@ -197,6 +207,7 @@ private:
    bool                                   dryRun_{false};
    unsigned int                           traj_, locVol_;
    // grids
    std::vector<int>                       dim_;
    GridPt                                 grid4d_;
    std::map<unsigned int, GridPt>         grid5d_;
    GridRbPt                               gridRb4d_;

@@ -343,7 +354,7 @@ T * Environment::getObject(const unsigned int address) const
        else
        {
            HADRON_ERROR("object with address " + std::to_string(address) +
                         " does not have type '" + typeid(T).name() +
                         " does not have type '" + typeName(&typeid(T)) +
                         "' (has type '" + getObjectType(address) + "')");
        }
    }

@@ -380,6 +391,37 @@ T * Environment::createLattice(const std::string name)
    return createLattice<T>(getObjectAddress(name));
}

template <typename T>
bool Environment::isObjectOfType(const unsigned int address) const
{
    if (hasRegisteredObject(address))
    {
        if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get()))
        {
            return true;
        }
        else
        {
            return false;
        }
    }
    else if (hasObject(address))
    {
        HADRON_ERROR("object with address " + std::to_string(address) +
                     " exists but is not registered");
    }
    else
    {
        HADRON_ERROR("no object with address " + std::to_string(address));
    }
}

template <typename T>
bool Environment::isObjectOfType(const std::string name) const
{
    return isObjectOfType<T>(getObjectAddress(name));
}

END_HADRONS_NAMESPACE

#endif // Hadrons_Environment_hpp_

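A short usage sketch for the accessors declared above (illustrative only; the variable names are invented, the calls mirror what the Meson module below does):

    // inside a module's execute(), via the usual env() accessor
    std::vector<int> dims = env().getDim();    // full lattice dimensions
    int              nt   = env().getDim(Tp);  // extent in the time direction
    bool sliced = env().template isObjectOfType<SlicedPropagator1>(par().q1);
    unsigned int mod = env().getObjectModule(par().sink);  // module that produced the sink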
@@ -51,23 +51,43 @@ using Grid::operator<<;
 * error with GCC 5 (clang & GCC 6 compile fine without it).
 */

// FIXME: find a way to do that in a more general fashion
#ifndef FIMPL
#define FIMPL WilsonImplR
#endif
#ifndef SIMPL
#define SIMPL ScalarImplCR
#endif

BEGIN_HADRONS_NAMESPACE

// type aliases
#define TYPE_ALIASES(FImpl, suffix)\
#define FERM_TYPE_ALIASES(FImpl, suffix)\
typedef FermionOperator<FImpl>                       FMat##suffix;             \
typedef typename FImpl::FermionField                 FermionField##suffix;     \
typedef typename FImpl::PropagatorField              PropagatorField##suffix;  \
typedef typename FImpl::SitePropagator               SitePropagator##suffix;   \
typedef typename FImpl::DoubledGaugeField            DoubledGaugeField##suffix;\
typedef std::function<void(FermionField##suffix &,                             \
typedef std::vector<typename FImpl::SitePropagator::scalar_object>             \
                                                     SlicedPropagator##suffix;

#define GAUGE_TYPE_ALIASES(FImpl, suffix)\
typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix;

#define SCALAR_TYPE_ALIASES(SImpl, suffix)\
typedef typename SImpl::Field ScalarField##suffix;\
typedef typename SImpl::Field PropagatorField##suffix;

#define SOLVER_TYPE_ALIASES(FImpl, suffix)\
typedef std::function<void(FermionField##suffix &,\
                      const FermionField##suffix &)> SolverFn##suffix;

#define SINK_TYPE_ALIASES(suffix)\
typedef std::function<SlicedPropagator##suffix(const PropagatorField##suffix &)> SinkFn##suffix;

#define FGS_TYPE_ALIASES(FImpl, suffix)\
FERM_TYPE_ALIASES(FImpl, suffix)\
GAUGE_TYPE_ALIASES(FImpl, suffix)\
SOLVER_TYPE_ALIASES(FImpl, suffix)

// logger
class HadronsLogger: public Logger
{

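Read together, the macros above mean that a module invoking FGS_TYPE_ALIASES(FImpl,) gets the fermion, gauge and solver aliases in one go. A sketch of the expansion for an empty suffix (hand-written for readability, not verbatim preprocessor output):

    typedef FermionOperator<FImpl>                                     FMat;
    typedef typename FImpl::FermionField                               FermionField;
    typedef typename FImpl::PropagatorField                            PropagatorField;
    typedef typename FImpl::SitePropagator                             SitePropagator;
    typedef std::vector<typename FImpl::SitePropagator::scalar_object> SlicedPropagator;
    typedef typename FImpl::DoubledGaugeField                          DoubledGaugeField;
    typedef std::function<void(FermionField &, const FermionField &)>  SolverFn;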
@@ -1,48 +1,26 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules.hpp

Copyright (C) 2015
Copyright (C) 2016

Author: Antonin Portelli <antonin.portelli@me.com>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
#include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
#include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>
#include <Grid/Hadrons/Modules/MContraction/Meson.hpp>
#include <Grid/Hadrons/Modules/MContraction/DiscLoop.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp>
#include <Grid/Hadrons/Modules/MContraction/Baryon.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp>
#include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp>
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
#include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp>
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
#include <Grid/Hadrons/Modules/MContraction/Gamma3pt.hpp>
#include <Grid/Hadrons/Modules/MSource/Z2.hpp>
#include <Grid/Hadrons/Modules/Quark.hpp>
#include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp>
#include <Grid/Hadrons/Modules/MSource/Point.hpp>
#include <Grid/Hadrons/Modules/MSource/Wall.hpp>
#include <Grid/Hadrons/Modules/MSource/Laplacian.hpp>
#include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp>
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
#include <Grid/Hadrons/Modules/MAction/DWF.hpp>
#include <Grid/Hadrons/Modules/MAction/Wilson.hpp>
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
#include <Grid/Hadrons/Modules/MGauge/Unit.hpp>
#include <Grid/Hadrons/Modules/MGauge/Random.hpp>
#include <Grid/Hadrons/Modules/MGauge/Load.hpp>
#include <Grid/Hadrons/Modules/MSink/Point.hpp>

@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_DWF_hpp_
#define Hadrons_DWF_hpp_
#ifndef Hadrons_MAction_DWF_hpp_
#define Hadrons_MAction_DWF_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>

@@ -56,7 +56,7 @@ template <typename FImpl>
class TDWF: public Module<DWFPar>
{
public:
    TYPE_ALIASES(FImpl,);
    FGS_TYPE_ALIASES(FImpl,);
public:
    // constructor
    TDWF(const std::string name);

@@ -137,4 +137,4 @@ END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_DWF_hpp_
#endif // Hadrons_MAction_DWF_hpp_

@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_Wilson_hpp_
#define Hadrons_Wilson_hpp_
#ifndef Hadrons_MAction_Wilson_hpp_
#define Hadrons_MAction_Wilson_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>

@@ -54,7 +54,7 @@ template <typename FImpl>
class TWilson: public Module<WilsonPar>
{
public:
    TYPE_ALIASES(FImpl,);
    FGS_TYPE_ALIASES(FImpl,);
public:
    // constructor
    TWilson(const std::string name);

@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_Baryon_hpp_
#define Hadrons_Baryon_hpp_
#ifndef Hadrons_MContraction_Baryon_hpp_
#define Hadrons_MContraction_Baryon_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>

@@ -55,9 +55,9 @@ template <typename FImpl1, typename FImpl2, typename FImpl3>
class TBaryon: public Module<BaryonPar>
{
public:
    TYPE_ALIASES(FImpl1, 1);
    TYPE_ALIASES(FImpl2, 2);
    TYPE_ALIASES(FImpl3, 3);
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
    FERM_TYPE_ALIASES(FImpl3, 3);
    class Result: Serializable
    {
    public:

@@ -121,11 +121,11 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void)

    // FIXME: do contractions

    write(writer, "meson", result);
    // write(writer, "meson", result);
}

END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_Baryon_hpp_
#endif // Hadrons_MContraction_Baryon_hpp_

@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_DiscLoop_hpp_
#define Hadrons_DiscLoop_hpp_
#ifndef Hadrons_MContraction_DiscLoop_hpp_
#define Hadrons_MContraction_DiscLoop_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>

@@ -52,7 +52,7 @@ public:
template <typename FImpl>
class TDiscLoop: public Module<DiscLoopPar>
{
    TYPE_ALIASES(FImpl,);
    FERM_TYPE_ALIASES(FImpl,);
    class Result: Serializable
    {
    public:

@@ -141,4 +141,4 @@ END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_DiscLoop_hpp_
#endif // Hadrons_MContraction_DiscLoop_hpp_

@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_Gamma3pt_hpp_
#define Hadrons_Gamma3pt_hpp_
#ifndef Hadrons_MContraction_Gamma3pt_hpp_
#define Hadrons_MContraction_Gamma3pt_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>

@@ -72,9 +72,9 @@ public:
template <typename FImpl1, typename FImpl2, typename FImpl3>
class TGamma3pt: public Module<Gamma3ptPar>
{
    TYPE_ALIASES(FImpl1, 1);
    TYPE_ALIASES(FImpl2, 2);
    TYPE_ALIASES(FImpl3, 3);
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
    FERM_TYPE_ALIASES(FImpl3, 3);
    class Result: Serializable
    {
    public:

@@ -167,4 +167,4 @@ END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_Gamma3pt_hpp_
#endif // Hadrons_MContraction_Gamma3pt_hpp_

@@ -29,8 +29,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_Meson_hpp_
#define Hadrons_Meson_hpp_
#ifndef Hadrons_MContraction_Meson_hpp_
#define Hadrons_MContraction_Meson_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>

@@ -69,7 +69,7 @@ public:
                                    std::string, q1,
                                    std::string, q2,
                                    std::string, gammas,
                                    std::string, mom,
                                    std::string, sink,
                                    std::string, output);
};

@@ -77,8 +77,10 @@ template <typename FImpl1, typename FImpl2>
class TMeson: public Module<MesonPar>
{
public:
    TYPE_ALIASES(FImpl1, 1);
    TYPE_ALIASES(FImpl2, 2);
    FERM_TYPE_ALIASES(FImpl1, 1);
    FERM_TYPE_ALIASES(FImpl2, 2);
    FERM_TYPE_ALIASES(ScalarImplCR, Scalar);
    SINK_TYPE_ALIASES(Scalar);
    class Result: Serializable
    {
    public:

@@ -115,7 +117,7 @@ TMeson<FImpl1, FImpl2>::TMeson(const std::string name)
template <typename FImpl1, typename FImpl2>
std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void)
{
    std::vector<std::string> input = {par().q1, par().q2};
    std::vector<std::string> input = {par().q1, par().q2, par().sink};

    return input;
}

@@ -154,6 +156,9 @@ void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList)

// execution ///////////////////////////////////////////////////////////////////
#define mesonConnected(q1, q2, gSnk, gSrc) \
(g5*(gSnk))*(q1)*(adj(gSrc)*g5)*adj(q2)

template <typename FImpl1, typename FImpl2>
void TMeson<FImpl1, FImpl2>::execute(void)
{

@@ -161,43 +166,72 @@ void TMeson<FImpl1, FImpl2>::execute(void)
                 << " quarks '" << par().q1 << "' and '" << par().q2 << "'"
                 << std::endl;

    CorrWriter              writer(par().output);
    PropagatorField1       &q1 = *env().template getObject<PropagatorField1>(par().q1);
    PropagatorField2       &q2 = *env().template getObject<PropagatorField2>(par().q2);
    LatticeComplex         c(env().getGrid());
    Gamma                  g5(Gamma::Algebra::Gamma5);
    std::vector<GammaPair> gammaList;
    CorrWriter             writer(par().output);
    std::vector<TComplex>  buf;
    std::vector<Result>    result;
    std::vector<Real>      p;

    p  = strToVec<Real>(par().mom);
    LatticeComplex         ph(env().getGrid()), coor(env().getGrid());
    Complex                i(0.0,1.0);
    ph = zero;
    for(unsigned int mu = 0; mu < env().getNd(); mu++)
    {
        LatticeCoordinate(coor, mu);
        ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu])));
    }
    ph = exp((Real)(2*M_PI)*i*ph);
    Gamma                  g5(Gamma::Algebra::Gamma5);
    std::vector<GammaPair> gammaList;
    int                    nt = env().getDim(Tp);

    parseGammaString(gammaList);

    result.resize(gammaList.size());
    for (unsigned int i = 0; i < result.size(); ++i)
    {
        Gamma gSnk(gammaList[i].first);
        Gamma gSrc(gammaList[i].second);
        c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph;
        sliceSum(c, buf, Tp);

        result[i].gamma_snk = gammaList[i].first;
        result[i].gamma_src = gammaList[i].second;
        result[i].corr.resize(buf.size());
        for (unsigned int t = 0; t < buf.size(); ++t)
        result[i].corr.resize(nt);
    }
    if (env().template isObjectOfType<SlicedPropagator1>(par().q1) and
        env().template isObjectOfType<SlicedPropagator2>(par().q2))
    {
        SlicedPropagator1 &q1 = *env().template getObject<SlicedPropagator1>(par().q1);
        SlicedPropagator2 &q2 = *env().template getObject<SlicedPropagator2>(par().q2);

        LOG(Message) << "(propagator already sinked)" << std::endl;
        for (unsigned int i = 0; i < result.size(); ++i)
        {
            result[i].corr[t] = TensorRemove(buf[t]);
            Gamma gSnk(gammaList[i].first);
            Gamma gSrc(gammaList[i].second);

            for (unsigned int t = 0; t < buf.size(); ++t)
            {
                result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc)));
            }
        }
    }
    else
    {
        PropagatorField1 &q1   = *env().template getObject<PropagatorField1>(par().q1);
        PropagatorField2 &q2   = *env().template getObject<PropagatorField2>(par().q2);
        LatticeComplex   c(env().getGrid());

        LOG(Message) << "(using sink '" << par().sink << "')" << std::endl;
        for (unsigned int i = 0; i < result.size(); ++i)
        {
            Gamma       gSnk(gammaList[i].first);
            Gamma       gSrc(gammaList[i].second);
            std::string ns;

            ns = env().getModuleNamespace(env().getObjectModule(par().sink));
            if (ns == "MSource")
            {
                PropagatorField1 &sink =
                    *env().template getObject<PropagatorField1>(par().sink);

                c = trace(mesonConnected(q1, q2, gSnk, gSrc)*sink);
                sliceSum(c, buf, Tp);
            }
            else if (ns == "MSink")
            {
                SinkFnScalar &sink = *env().template getObject<SinkFnScalar>(par().sink);

                c   = trace(mesonConnected(q1, q2, gSnk, gSrc));
                buf = sink(c);
            }
            for (unsigned int t = 0; t < buf.size(); ++t)
            {
                result[i].corr[t] = TensorRemove(buf[t]);
            }
        }
    }
    write(writer, "meson", result);

@@ -207,4 +241,4 @@ END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_Meson_hpp_
#endif // Hadrons_MContraction_Meson_hpp_

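For reference, the mesonConnected macro combined with the momentum phase ph built in the execute() body amounts to the usual connected meson two-point function (an editorial paraphrase of the code above, not text from the diff):

    C_{\Gamma_{snk}\Gamma_{src}}(t) = \sum_{\vec{x}} e^{2\pi i \sum_\mu p_\mu x_\mu / L_\mu}\,
        \mathrm{tr}\left[ (\gamma_5 \Gamma_{snk})\, S_1(x,0)\, (\Gamma_{src}^\dagger \gamma_5)\, S_2(x,0)^\dagger \right]

with the trace over spin and colour, and the restriction to a single time slice t performed by sliceSum along Tp.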
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_WeakHamiltonian_hpp_
#define Hadrons_WeakHamiltonian_hpp_
#ifndef Hadrons_MContraction_WeakHamiltonian_hpp_
#define Hadrons_MContraction_WeakHamiltonian_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>

@@ -83,7 +83,7 @@ public:
class T##modname: public Module<WeakHamiltonianPar>\
{\
public:\
    TYPE_ALIASES(FIMPL,)\
    FERM_TYPE_ALIASES(FIMPL,)\
    class Result: Serializable\
    {\
    public:\

@@ -111,4 +111,4 @@ END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_WeakHamiltonian_hpp_
#endif // Hadrons_MContraction_WeakHamiltonian_hpp_

@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_WeakHamiltonianEye_hpp_
#define Hadrons_WeakHamiltonianEye_hpp_
#ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_
#define Hadrons_MContraction_WeakHamiltonianEye_hpp_

#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>

@@ -55,4 +55,4 @@ END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_WeakHamiltonianEye_hpp_
#endif // Hadrons_MContraction_WeakHamiltonianEye_hpp_

@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_WeakHamiltonianNonEye_hpp_
#define Hadrons_WeakHamiltonianNonEye_hpp_
#ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_
#define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_

#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>

@@ -54,4 +54,4 @@ END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_WeakHamiltonianNonEye_hpp_
#endif // Hadrons_MContraction_WeakHamiltonianNonEye_hpp_

@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_WeakNeutral4ptDisc_hpp_
#define Hadrons_WeakNeutral4ptDisc_hpp_
#ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_
#define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_

#include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp>

@@ -56,4 +56,4 @@ END_MODULE_NAMESPACE

END_HADRONS_NAMESPACE

#endif // Hadrons_WeakNeutral4ptDisc_hpp_
#endif // Hadrons_MContraction_WeakNeutral4ptDisc_hpp_

@@ -1,34 +1,5 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: extras/Hadrons/Modules/Quark.hpp

Copyright (C) 2015
Copyright (C) 2016

Author: Antonin Portelli <antonin.portelli@me.com>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#ifndef Hadrons_Quark_hpp_
#define Hadrons_Quark_hpp_
#ifndef Hadrons_MFermion_GaugeProp_hpp_
#define Hadrons_MFermion_GaugeProp_hpp_

#include <Grid/Hadrons/Global.hpp>
#include <Grid/Hadrons/Module.hpp>

@@ -37,27 +8,29 @@ See the full license in the file "LICENSE" in the top level distribution directo
BEGIN_HADRONS_NAMESPACE

/******************************************************************************
 *                               TQuark                                       *
 *                                GaugeProp                                   *
 ******************************************************************************/
class QuarkPar: Serializable
BEGIN_MODULE_NAMESPACE(MFermion)

class GaugePropPar: Serializable
{
public:
    GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar,
    GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar,
                                    std::string, source,
                                    std::string, solver);
};

template <typename FImpl>
class TQuark: public Module<QuarkPar>
class TGaugeProp: public Module<GaugePropPar>
{
public:
    TYPE_ALIASES(FImpl,);
    FGS_TYPE_ALIASES(FImpl,);
public:
    // constructor
    TQuark(const std::string name);
    TGaugeProp(const std::string name);
    // destructor
    virtual ~TQuark(void) = default;
    // dependencies/products
    virtual ~TGaugeProp(void) = default;
    // dependency relation
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    // setup

@@ -69,20 +42,20 @@ private:
    SolverFn     *solver_{nullptr};
};

MODULE_REGISTER(Quark, TQuark<FIMPL>);
MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion);

/******************************************************************************
 *                          TQuark implementation                             *
 *                      TGaugeProp implementation                             *
 ******************************************************************************/
// constructor /////////////////////////////////////////////////////////////////
template <typename FImpl>
TQuark<FImpl>::TQuark(const std::string name)
: Module(name)
TGaugeProp<FImpl>::TGaugeProp(const std::string name)
: Module<GaugePropPar>(name)
{}

// dependencies/products ///////////////////////////////////////////////////////
template <typename FImpl>
std::vector<std::string> TQuark<FImpl>::getInput(void)
std::vector<std::string> TGaugeProp<FImpl>::getInput(void)
{
    std::vector<std::string> in = {par().source, par().solver};

@@ -90,7 +63,7 @@ std::vector<std::string> TQuark<FImpl>::getInput(void)
}

template <typename FImpl>
std::vector<std::string> TQuark<FImpl>::getOutput(void)
std::vector<std::string> TGaugeProp<FImpl>::getOutput(void)
{
    std::vector<std::string> out = {getName(), getName() + "_5d"};

@@ -99,7 +72,7 @@ std::vector<std::string> TQuark<FImpl>::getOutput(void)

// setup ///////////////////////////////////////////////////////////////////////
template <typename FImpl>
void TQuark<FImpl>::setup(void)
void TGaugeProp<FImpl>::setup(void)
{
    Ls_ = env().getObjectLs(par().solver);
    env().template registerLattice<PropagatorField>(getName());

@@ -111,13 +84,13 @@ void TQuark<FImpl>::setup(void)

// execution ///////////////////////////////////////////////////////////////////
template <typename FImpl>
void TQuark<FImpl>::execute(void)
void TGaugeProp<FImpl>::execute(void)
{
    LOG(Message) << "Computing quark propagator '" << getName() << "'"
                 << std::endl;
    << std::endl;

    FermionField    source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)),
                    tmp(env().getGrid());
    tmp(env().getGrid());
    std::string     propName = (Ls_ == 1) ? getName() : (getName() + "_5d");
    PropagatorField &prop    = *env().template createLattice<PropagatorField>(propName);
    PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);

@@ -128,7 +101,7 @@ void TQuark<FImpl>::execute(void)
    }

    LOG(Message) << "Inverting using solver '" << par().solver
                 << "' on source '" << par().source << "'" << std::endl;
    << "' on source '" << par().source << "'" << std::endl;
    for (unsigned int s = 0; s < Ns; ++s)
    for (unsigned int c = 0; c < Nc; ++c)
    {

@@ -170,7 +143,7 @@ void TQuark<FImpl>::execute(void)
        if (Ls_ > 1)
        {
            PropagatorField &p4d =
                *env().template getObject<PropagatorField>(getName());
            *env().template getObject<PropagatorField>(getName());

            axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0);
            axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1);

@@ -180,6 +153,8 @@ void TQuark<FImpl>::execute(void)
    }
}

END_MODULE_NAMESPACE
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_Quark_hpp_
 | 
			
		||||
#endif // Hadrons_MFermion_GaugeProp_hpp_
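Editorial note on the Ls_ > 1 branch above: the two axpby_ssp calls rebuild the physical 4d propagator from the boundary slices of the 5d solution. Restated as a formula (explanatory only, with the chiral projectors P_± = (1 ± γ5)/2 as applied by axpby_ssp_pminus/pplus):

    q_{4d}(x) = P_- ψ_{5d}(x, s = 0) + P_+ ψ_{5d}(x, s = L_s - 1)

i.e. sol[0] is first overwritten with P_- sol[0], then P_+ sol[Ls_-1] is added to it.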
 | 
			
		||||
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_Load_hpp_
 | 
			
		||||
#define Hadrons_Load_hpp_
 | 
			
		||||
#ifndef Hadrons_MGauge_Load_hpp_
 | 
			
		||||
#define Hadrons_MGauge_Load_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -70,4 +70,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_Load_hpp_
 | 
			
		||||
#endif // Hadrons_MGauge_Load_hpp_
 | 
			
		||||
 
 | 
			
		||||
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_Random_hpp_
 | 
			
		||||
#define Hadrons_Random_hpp_
 | 
			
		||||
#ifndef Hadrons_MGauge_Random_hpp_
 | 
			
		||||
#define Hadrons_MGauge_Random_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_Random_hpp_
 | 
			
		||||
#endif // Hadrons_MGauge_Random_hpp_

extras/Hadrons/Modules/MGauge/StochEm.cc (new file, 88 lines)
@@ -0,0 +1,88 @@
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
Source file: extras/Hadrons/Modules/MGauge/StochEm.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2015
 | 
			
		||||
Copyright (C) 2016
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#include <Grid/Hadrons/Modules/MGauge/StochEm.hpp>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Hadrons;
 | 
			
		||||
using namespace MGauge;
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
*                  TStochEm implementation                             *
 | 
			
		||||
******************************************************************************/
 | 
			
		||||
// constructor /////////////////////////////////////////////////////////////////
 | 
			
		||||
TStochEm::TStochEm(const std::string name)
 | 
			
		||||
: Module<StochEmPar>(name)
 | 
			
		||||
{}
 | 
			
		||||
 | 
			
		||||
// dependencies/products ///////////////////////////////////////////////////////
 | 
			
		||||
std::vector<std::string> TStochEm::getInput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> in;
 | 
			
		||||
    
 | 
			
		||||
    return in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
std::vector<std::string> TStochEm::getOutput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> out = {getName()};
 | 
			
		||||
    
 | 
			
		||||
    return out;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// setup ///////////////////////////////////////////////////////////////////////
 | 
			
		||||
void TStochEm::setup(void)
 | 
			
		||||
{
 | 
			
		||||
    if (!env().hasRegisteredObject("_" + getName() + "_weight"))
 | 
			
		||||
    {
 | 
			
		||||
        env().registerLattice<EmComp>("_" + getName() + "_weight");
 | 
			
		||||
    }
 | 
			
		||||
    env().registerLattice<EmField>(getName());
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// execution ///////////////////////////////////////////////////////////////////
 | 
			
		||||
void TStochEm::execute(void)
 | 
			
		||||
{
 | 
			
		||||
    PhotonR photon(par().gauge, par().zmScheme);
 | 
			
		||||
    EmField &a = *env().createLattice<EmField>(getName());
 | 
			
		||||
    EmComp  *w;
 | 
			
		||||
    
 | 
			
		||||
    if (!env().hasCreatedObject("_" + getName() + "_weight"))
 | 
			
		||||
    {
 | 
			
		||||
        LOG(Message) << "Caching stochatic EM potential weight (gauge: "
 | 
			
		||||
                     << par().gauge << ", zero-mode scheme: "
 | 
			
		||||
                     << par().zmScheme << ")..." << std::endl;
 | 
			
		||||
        w = env().createLattice<EmComp>("_" + getName() + "_weight");
 | 
			
		||||
        photon.StochasticWeight(*w);
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        w = env().getObject<EmComp>("_" + getName() + "_weight");
 | 
			
		||||
    }
 | 
			
		||||
    LOG(Message) << "Generating stochatic EM potential..." << std::endl;
 | 
			
		||||
    photon.StochasticField(a, *env().get4dRng(), *w);
 | 
			
		||||
}

extras/Hadrons/Modules/MGauge/StochEm.hpp (new file, 75 lines)
@@ -0,0 +1,75 @@
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2015
 | 
			
		||||
Copyright (C) 2016
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#ifndef Hadrons_MGauge_StochEm_hpp_
 | 
			
		||||
#define Hadrons_MGauge_StochEm_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
#include <Grid/Hadrons/ModuleFactory.hpp>
 | 
			
		||||
 | 
			
		||||
BEGIN_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
 *                         StochEm                                 *
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
BEGIN_MODULE_NAMESPACE(MGauge)
 | 
			
		||||
 | 
			
		||||
class StochEmPar: Serializable
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar,
 | 
			
		||||
                                    PhotonR::Gauge,    gauge,
 | 
			
		||||
                                    PhotonR::ZmScheme, zmScheme);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class TStochEm: public Module<StochEmPar>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    typedef PhotonR::GaugeField     EmField;
 | 
			
		||||
    typedef PhotonR::GaugeLinkField EmComp;
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TStochEm(const std::string name);
 | 
			
		||||
    // destructor
 | 
			
		||||
    virtual ~TStochEm(void) = default;
 | 
			
		||||
    // dependency relation
 | 
			
		||||
    virtual std::vector<std::string> getInput(void);
 | 
			
		||||
    virtual std::vector<std::string> getOutput(void);
 | 
			
		||||
    // setup
 | 
			
		||||
    virtual void setup(void);
 | 
			
		||||
    // execution
 | 
			
		||||
    virtual void execute(void);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
MODULE_REGISTER_NS(StochEm, TStochEm, MGauge);
 | 
			
		||||
 | 
			
		||||
END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_MGauge_StochEm_hpp_
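A minimal usage sketch for the new StochEm module, assuming the standard Hadrons Application driver (application.createModule) and the StochEm::Par alias produced by MODULE_REGISTER_NS; the module name "photon" and the enum values are illustrative assumptions, not part of this patch:

    // hypothetical driver code, not part of this commit
    MGauge::StochEm::Par photonPar;
    photonPar.gauge    = PhotonR::Gauge::feynman;   // assumed gauge choice
    photonPar.zmScheme = PhotonR::ZmScheme::qedL;   // assumed zero-mode scheme
    application.createModule<MGauge::StochEm>("photon", photonPar);

The field registered under "photon" is the stochastic EM potential consumed, for example, by the ChargedProp module below through its emField parameter.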
 | 
			
		||||
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_Unit_hpp_
 | 
			
		||||
#define Hadrons_Unit_hpp_
 | 
			
		||||
#ifndef Hadrons_MGauge_Unit_hpp_
 | 
			
		||||
#define Hadrons_MGauge_Unit_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -63,4 +63,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_Unit_hpp_
 | 
			
		||||
#endif // Hadrons_MGauge_Unit_hpp_
 | 
			
		||||
 
 | 
			
		||||
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_NoiseLoop_hpp_
 | 
			
		||||
#define Hadrons_NoiseLoop_hpp_
 | 
			
		||||
#ifndef Hadrons_MLoop_NoiseLoop_hpp_
 | 
			
		||||
#define Hadrons_MLoop_NoiseLoop_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -65,7 +65,7 @@ template <typename FImpl>
 | 
			
		||||
class TNoiseLoop: public Module<NoiseLoopPar>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    TYPE_ALIASES(FImpl,);
 | 
			
		||||
    FERM_TYPE_ALIASES(FImpl,);
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TNoiseLoop(const std::string name);
 | 
			
		||||
@@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_NoiseLoop_hpp_
 | 
			
		||||
#endif // Hadrons_MLoop_NoiseLoop_hpp_

extras/Hadrons/Modules/MScalar/ChargedProp.cc (new file, 226 lines)
@@ -0,0 +1,226 @@
#include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Hadrons;
 | 
			
		||||
using namespace MScalar;
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
*                     TChargedProp implementation                             *
 | 
			
		||||
******************************************************************************/
 | 
			
		||||
// constructor /////////////////////////////////////////////////////////////////
 | 
			
		||||
TChargedProp::TChargedProp(const std::string name)
 | 
			
		||||
: Module<ChargedPropPar>(name)
 | 
			
		||||
{}
 | 
			
		||||
 | 
			
		||||
// dependencies/products ///////////////////////////////////////////////////////
 | 
			
		||||
std::vector<std::string> TChargedProp::getInput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> in = {par().source, par().emField};
 | 
			
		||||
    
 | 
			
		||||
    return in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
std::vector<std::string> TChargedProp::getOutput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> out = {getName()};
 | 
			
		||||
    
 | 
			
		||||
    return out;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// setup ///////////////////////////////////////////////////////////////////////
 | 
			
		||||
void TChargedProp::setup(void)
 | 
			
		||||
{
 | 
			
		||||
    freeMomPropName_ = FREEMOMPROP(par().mass);
 | 
			
		||||
    phaseName_.clear();
 | 
			
		||||
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
 | 
			
		||||
    {
 | 
			
		||||
        phaseName_.push_back("_shiftphase_" + std::to_string(mu));
 | 
			
		||||
    }
 | 
			
		||||
    GFSrcName_ = "_" + getName() + "_DinvSrc";
 | 
			
		||||
    if (!env().hasRegisteredObject(freeMomPropName_))
 | 
			
		||||
    {
 | 
			
		||||
        env().registerLattice<ScalarField>(freeMomPropName_);
 | 
			
		||||
    }
 | 
			
		||||
    if (!env().hasRegisteredObject(phaseName_[0]))
 | 
			
		||||
    {
 | 
			
		||||
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
 | 
			
		||||
        {
 | 
			
		||||
            env().registerLattice<ScalarField>(phaseName_[mu]);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    if (!env().hasRegisteredObject(GFSrcName_))
 | 
			
		||||
    {
 | 
			
		||||
        env().registerLattice<ScalarField>(GFSrcName_);
 | 
			
		||||
    }
 | 
			
		||||
    env().registerLattice<ScalarField>(getName());
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// execution ///////////////////////////////////////////////////////////////////
 | 
			
		||||
void TChargedProp::execute(void)
 | 
			
		||||
{
 | 
			
		||||
    // CACHING ANALYTIC EXPRESSIONS
 | 
			
		||||
    ScalarField &source = *env().getObject<ScalarField>(par().source);
 | 
			
		||||
    Complex     ci(0.0,1.0);
 | 
			
		||||
    FFT         fft(env().getGrid());
 | 
			
		||||
    
 | 
			
		||||
    // cache free scalar propagator
 | 
			
		||||
    if (!env().hasCreatedObject(freeMomPropName_))
 | 
			
		||||
    {
 | 
			
		||||
        LOG(Message) << "Caching momentum space free scalar propagator"
 | 
			
		||||
                     << " (mass= " << par().mass << ")..." << std::endl;
 | 
			
		||||
        freeMomProp_ = env().createLattice<ScalarField>(freeMomPropName_);
 | 
			
		||||
        SIMPL::MomentumSpacePropagator(*freeMomProp_, par().mass);
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_);
 | 
			
		||||
    }
 | 
			
		||||
    // cache G*F*src
 | 
			
		||||
    if (!env().hasCreatedObject(GFSrcName_))
 | 
			
		||||
        
 | 
			
		||||
    {
 | 
			
		||||
        GFSrc_ = env().createLattice<ScalarField>(GFSrcName_);
 | 
			
		||||
        fft.FFT_all_dim(*GFSrc_, source, FFT::forward);
 | 
			
		||||
        *GFSrc_ = (*freeMomProp_)*(*GFSrc_);
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        GFSrc_ = env().getObject<ScalarField>(GFSrcName_);
 | 
			
		||||
    }
 | 
			
		||||
    // cache phases
 | 
			
		||||
    if (!env().hasCreatedObject(phaseName_[0]))
 | 
			
		||||
    {
 | 
			
		||||
        std::vector<int> &l = env().getGrid()->_fdimensions;
 | 
			
		||||
        
 | 
			
		||||
        LOG(Message) << "Caching shift phases..." << std::endl;
 | 
			
		||||
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
 | 
			
		||||
        {
 | 
			
		||||
            Real    twoPiL = M_PI*2./l[mu];
 | 
			
		||||
            
 | 
			
		||||
            phase_.push_back(env().createLattice<ScalarField>(phaseName_[mu]));
 | 
			
		||||
            LatticeCoordinate(*(phase_[mu]), mu);
 | 
			
		||||
            *(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu])));
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        for (unsigned int mu = 0; mu < env().getNd(); ++mu)
 | 
			
		||||
        {
 | 
			
		||||
            phase_.push_back(env().getObject<ScalarField>(phaseName_[mu]));
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // PROPAGATOR CALCULATION
 | 
			
		||||
    LOG(Message) << "Computing charged scalar propagator"
 | 
			
		||||
                 << " (mass= " << par().mass
 | 
			
		||||
                 << ", charge= " << par().charge << ")..." << std::endl;
 | 
			
		||||
    
 | 
			
		||||
    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
 | 
			
		||||
    ScalarField buf(env().getGrid());
 | 
			
		||||
    ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_;
 | 
			
		||||
    double      q = par().charge;
 | 
			
		||||
    
 | 
			
		||||
    // G*F*Src
 | 
			
		||||
    prop = GFSrc;
 | 
			
		||||
 | 
			
		||||
    // - q*G*momD1*G*F*Src (momD1 = F*D1*Finv)
 | 
			
		||||
    buf = GFSrc;
 | 
			
		||||
    momD1(buf, fft);
 | 
			
		||||
    buf = G*buf;
 | 
			
		||||
    prop = prop - q*buf;
 | 
			
		||||
 | 
			
		||||
    // + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src)
 | 
			
		||||
    momD1(buf, fft);
 | 
			
		||||
    prop = prop + q*q*G*buf;
 | 
			
		||||
 | 
			
		||||
    // - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv)
 | 
			
		||||
    buf = GFSrc;
 | 
			
		||||
    momD2(buf, fft);
 | 
			
		||||
    prop = prop - q*q*G*buf;
 | 
			
		||||
 | 
			
		||||
    // final FT
 | 
			
		||||
    fft.FFT_all_dim(prop, prop, FFT::backward);
 | 
			
		||||
    
 | 
			
		||||
    // OUTPUT IF NECESSARY
 | 
			
		||||
    if (!par().output.empty())
 | 
			
		||||
    {
 | 
			
		||||
        std::string           filename = par().output + "." +
 | 
			
		||||
                                         std::to_string(env().getTrajectory());
 | 
			
		||||
        
 | 
			
		||||
        LOG(Message) << "Saving zero-momentum projection to '"
 | 
			
		||||
                     << filename << "'..." << std::endl;
 | 
			
		||||
        
 | 
			
		||||
        CorrWriter            writer(filename);
 | 
			
		||||
        std::vector<TComplex> vecBuf;
 | 
			
		||||
        std::vector<Complex>  result;
 | 
			
		||||
        
 | 
			
		||||
        sliceSum(prop, vecBuf, Tp);
 | 
			
		||||
        result.resize(vecBuf.size());
 | 
			
		||||
        for (unsigned int t = 0; t < vecBuf.size(); ++t)
 | 
			
		||||
        {
 | 
			
		||||
            result[t] = TensorRemove(vecBuf[t]);
 | 
			
		||||
        }
 | 
			
		||||
        write(writer, "charge", q);
 | 
			
		||||
        write(writer, "prop", result);
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void TChargedProp::momD1(ScalarField &s, FFT &fft)
 | 
			
		||||
{
 | 
			
		||||
    EmField     &A = *env().getObject<EmField>(par().emField);
 | 
			
		||||
    ScalarField buf(env().getGrid()), result(env().getGrid()),
 | 
			
		||||
                Amu(env().getGrid());
 | 
			
		||||
    Complex     ci(0.0,1.0);
 | 
			
		||||
 | 
			
		||||
    result = zero;
 | 
			
		||||
 | 
			
		||||
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
 | 
			
		||||
    {
 | 
			
		||||
        Amu = peekLorentz(A, mu);
 | 
			
		||||
        buf = (*phase_[mu])*s;
 | 
			
		||||
        fft.FFT_all_dim(buf, buf, FFT::backward);
 | 
			
		||||
        buf = Amu*buf;
 | 
			
		||||
        fft.FFT_all_dim(buf, buf, FFT::forward);
 | 
			
		||||
        result = result - ci*buf;
 | 
			
		||||
    }
 | 
			
		||||
    fft.FFT_all_dim(s, s, FFT::backward);
 | 
			
		||||
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
 | 
			
		||||
    {
 | 
			
		||||
        Amu = peekLorentz(A, mu);
 | 
			
		||||
        buf = Amu*s;
 | 
			
		||||
        fft.FFT_all_dim(buf, buf, FFT::forward);
 | 
			
		||||
        result = result + ci*adj(*phase_[mu])*buf;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    s = result;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void TChargedProp::momD2(ScalarField &s, FFT &fft)
 | 
			
		||||
{
 | 
			
		||||
    EmField     &A = *env().getObject<EmField>(par().emField);
 | 
			
		||||
    ScalarField buf(env().getGrid()), result(env().getGrid()),
 | 
			
		||||
                Amu(env().getGrid());
 | 
			
		||||
 | 
			
		||||
    result = zero;
 | 
			
		||||
    
 | 
			
		||||
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
 | 
			
		||||
    {
 | 
			
		||||
        Amu = peekLorentz(A, mu);
 | 
			
		||||
        buf = (*phase_[mu])*s;
 | 
			
		||||
        fft.FFT_all_dim(buf, buf, FFT::backward);
 | 
			
		||||
        buf = Amu*Amu*buf;
 | 
			
		||||
        fft.FFT_all_dim(buf, buf, FFT::forward);
 | 
			
		||||
        result = result + .5*buf;
 | 
			
		||||
    }
 | 
			
		||||
    fft.FFT_all_dim(s, s, FFT::backward);
 | 
			
		||||
    for (unsigned int mu = 0; mu < env().getNd(); ++mu)
 | 
			
		||||
    {
 | 
			
		||||
        Amu = peekLorentz(A, mu);        
 | 
			
		||||
        buf = Amu*Amu*s;
 | 
			
		||||
        fft.FFT_all_dim(buf, buf, FFT::forward);
 | 
			
		||||
        result = result + .5*adj(*phase_[mu])*buf;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    s = result;
 | 
			
		||||
}
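To summarise execute(), momD1() and momD2() in one formula: writing \tilde S for the momentum-space source, G for the free momentum-space propagator and \hat D_i = F D_i F^{-1} for the Fourier-conjugated one- and two-photon insertions (this restates the inline comments, it is not part of the patch), the module assembles the charged scalar propagator to second order in the charge q,

    \tilde\phi = G\tilde S - q\, G\hat D_1 G\tilde S + q^2\, G\hat D_1 G\hat D_1 G\tilde S - q^2\, G\hat D_2 G\tilde S,

then Fourier-transforms back to position space and, if output is set, writes the zero-momentum (time-slice summed) projection.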

extras/Hadrons/Modules/MScalar/ChargedProp.hpp (new file, 61 lines)
@@ -0,0 +1,61 @@
#ifndef Hadrons_MScalar_ChargedProp_hpp_
 | 
			
		||||
#define Hadrons_MScalar_ChargedProp_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
#include <Grid/Hadrons/ModuleFactory.hpp>
 | 
			
		||||
 | 
			
		||||
BEGIN_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
 *                       Charged scalar propagator                            *
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
BEGIN_MODULE_NAMESPACE(MScalar)
 | 
			
		||||
 | 
			
		||||
class ChargedPropPar: Serializable
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar,
 | 
			
		||||
                                    std::string, emField,
 | 
			
		||||
                                    std::string, source,
 | 
			
		||||
                                    double,      mass,
 | 
			
		||||
                                    double,      charge,
 | 
			
		||||
                                    std::string, output);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class TChargedProp: public Module<ChargedPropPar>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    SCALAR_TYPE_ALIASES(SIMPL,);
 | 
			
		||||
    typedef PhotonR::GaugeField     EmField;
 | 
			
		||||
    typedef PhotonR::GaugeLinkField EmComp;
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TChargedProp(const std::string name);
 | 
			
		||||
    // destructor
 | 
			
		||||
    virtual ~TChargedProp(void) = default;
 | 
			
		||||
    // dependency relation
 | 
			
		||||
    virtual std::vector<std::string> getInput(void);
 | 
			
		||||
    virtual std::vector<std::string> getOutput(void);
 | 
			
		||||
    // setup
 | 
			
		||||
    virtual void setup(void);
 | 
			
		||||
    // execution
 | 
			
		||||
    virtual void execute(void);
 | 
			
		||||
private:
 | 
			
		||||
    void momD1(ScalarField &s, FFT &fft);
 | 
			
		||||
    void momD2(ScalarField &s, FFT &fft);
 | 
			
		||||
private:
 | 
			
		||||
    std::string                freeMomPropName_, GFSrcName_;
 | 
			
		||||
    std::vector<std::string>   phaseName_;
 | 
			
		||||
    ScalarField                *freeMomProp_, *GFSrc_;
 | 
			
		||||
    std::vector<ScalarField *> phase_;
 | 
			
		||||
    EmField                    *A;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar);
 | 
			
		||||
 | 
			
		||||
END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_MScalar_ChargedProp_hpp_

extras/Hadrons/Modules/MScalar/FreeProp.cc (new file, 79 lines)
@@ -0,0 +1,79 @@
#include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Modules/MScalar/Scalar.hpp>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace Hadrons;
 | 
			
		||||
using namespace MScalar;
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
*                        TFreeProp implementation                             *
 | 
			
		||||
******************************************************************************/
 | 
			
		||||
// constructor /////////////////////////////////////////////////////////////////
 | 
			
		||||
TFreeProp::TFreeProp(const std::string name)
 | 
			
		||||
: Module<FreePropPar>(name)
 | 
			
		||||
{}
 | 
			
		||||
 | 
			
		||||
// dependencies/products ///////////////////////////////////////////////////////
 | 
			
		||||
std::vector<std::string> TFreeProp::getInput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> in = {par().source};
 | 
			
		||||
    
 | 
			
		||||
    return in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
std::vector<std::string> TFreeProp::getOutput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> out = {getName()};
 | 
			
		||||
    
 | 
			
		||||
    return out;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// setup ///////////////////////////////////////////////////////////////////////
 | 
			
		||||
void TFreeProp::setup(void)
 | 
			
		||||
{
 | 
			
		||||
    freeMomPropName_ = FREEMOMPROP(par().mass);
 | 
			
		||||
    
 | 
			
		||||
    if (!env().hasRegisteredObject(freeMomPropName_))
 | 
			
		||||
    {
 | 
			
		||||
        env().registerLattice<ScalarField>(freeMomPropName_);
 | 
			
		||||
    }
 | 
			
		||||
    env().registerLattice<ScalarField>(getName());
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// execution ///////////////////////////////////////////////////////////////////
 | 
			
		||||
void TFreeProp::execute(void)
 | 
			
		||||
{
 | 
			
		||||
    ScalarField &prop   = *env().createLattice<ScalarField>(getName());
 | 
			
		||||
    ScalarField &source = *env().getObject<ScalarField>(par().source);
 | 
			
		||||
    ScalarField *freeMomProp;
 | 
			
		||||
 | 
			
		||||
    if (!env().hasCreatedObject(freeMomPropName_))
 | 
			
		||||
    {
 | 
			
		||||
        LOG(Message) << "Caching momentum space free scalar propagator"
 | 
			
		||||
                     << " (mass= " << par().mass << ")..." << std::endl;
 | 
			
		||||
        freeMomProp = env().createLattice<ScalarField>(freeMomPropName_);
 | 
			
		||||
        SIMPL::MomentumSpacePropagator(*freeMomProp, par().mass);
 | 
			
		||||
    }
 | 
			
		||||
    else
 | 
			
		||||
    {
 | 
			
		||||
        freeMomProp = env().getObject<ScalarField>(freeMomPropName_);
 | 
			
		||||
    }
 | 
			
		||||
    LOG(Message) << "Computing free scalar propagator..." << std::endl;
 | 
			
		||||
    SIMPL::FreePropagator(source, prop, *freeMomProp);
 | 
			
		||||
    
 | 
			
		||||
    if (!par().output.empty())
 | 
			
		||||
    {
 | 
			
		||||
        TextWriter            writer(par().output + "." +
 | 
			
		||||
                                     std::to_string(env().getTrajectory()));
 | 
			
		||||
        std::vector<TComplex> buf;
 | 
			
		||||
        std::vector<Complex>  result;
 | 
			
		||||
        
 | 
			
		||||
        sliceSum(prop, buf, Tp);
 | 
			
		||||
        result.resize(buf.size());
 | 
			
		||||
        for (unsigned int t = 0; t < buf.size(); ++t)
 | 
			
		||||
        {
 | 
			
		||||
            result[t] = TensorRemove(buf[t]);
 | 
			
		||||
        }
 | 
			
		||||
        write(writer, "prop", result);
 | 
			
		||||
    }
 | 
			
		||||
}

extras/Hadrons/Modules/MScalar/FreeProp.hpp (new file, 50 lines)
@@ -0,0 +1,50 @@
#ifndef Hadrons_MScalar_FreeProp_hpp_
 | 
			
		||||
#define Hadrons_MScalar_FreeProp_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
#include <Grid/Hadrons/ModuleFactory.hpp>
 | 
			
		||||
 | 
			
		||||
BEGIN_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
 *                               FreeProp                                     *
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
BEGIN_MODULE_NAMESPACE(MScalar)
 | 
			
		||||
 | 
			
		||||
class FreePropPar: Serializable
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar,
 | 
			
		||||
                                    std::string, source,
 | 
			
		||||
                                    double,      mass,
 | 
			
		||||
                                    std::string, output);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class TFreeProp: public Module<FreePropPar>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    SCALAR_TYPE_ALIASES(SIMPL,);
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TFreeProp(const std::string name);
 | 
			
		||||
    // destructor
 | 
			
		||||
    virtual ~TFreeProp(void) = default;
 | 
			
		||||
    // dependency relation
 | 
			
		||||
    virtual std::vector<std::string> getInput(void);
 | 
			
		||||
    virtual std::vector<std::string> getOutput(void);
 | 
			
		||||
    // setup
 | 
			
		||||
    virtual void setup(void);
 | 
			
		||||
    // execution
 | 
			
		||||
    virtual void execute(void);
 | 
			
		||||
private:
 | 
			
		||||
    std::string freeMomPropName_;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar);
 | 
			
		||||
 | 
			
		||||
END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_MScalar_FreeProp_hpp_

extras/Hadrons/Modules/MScalar/Scalar.hpp (new file, 6 lines)
@@ -0,0 +1,6 @@
#ifndef Hadrons_Scalar_hpp_
#define Hadrons_Scalar_hpp_

#define FREEMOMPROP(m) "_scalar_mom_prop_" + std::to_string(m)

#endif // Hadrons_Scalar_hpp_
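A worked example of the cache-key macro above (the mass value is illustrative):

    std::string key = FREEMOMPROP(0.25); // "_scalar_mom_prop_" + std::to_string(0.25) == "_scalar_mom_prop_0.250000"

This is the name under which the FreeProp and ChargedProp modules register and reuse the momentum-space free propagator.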

extras/Hadrons/Modules/MSink/Point.hpp (new file, 114 lines)
@@ -0,0 +1,114 @@
#ifndef Hadrons_MSink_Point_hpp_
 | 
			
		||||
#define Hadrons_MSink_Point_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
#include <Grid/Hadrons/ModuleFactory.hpp>
 | 
			
		||||
 | 
			
		||||
BEGIN_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
 *                                   Point                                    *
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
BEGIN_MODULE_NAMESPACE(MSink)
 | 
			
		||||
 | 
			
		||||
class PointPar: Serializable
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(PointPar,
 | 
			
		||||
                                    std::string, mom);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
class TPoint: public Module<PointPar>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    FERM_TYPE_ALIASES(FImpl,);
 | 
			
		||||
    SINK_TYPE_ALIASES();
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TPoint(const std::string name);
 | 
			
		||||
    // destructor
 | 
			
		||||
    virtual ~TPoint(void) = default;
 | 
			
		||||
    // dependency relation
 | 
			
		||||
    virtual std::vector<std::string> getInput(void);
 | 
			
		||||
    virtual std::vector<std::string> getOutput(void);
 | 
			
		||||
    // setup
 | 
			
		||||
    virtual void setup(void);
 | 
			
		||||
    // execution
 | 
			
		||||
    virtual void execute(void);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSink);
 | 
			
		||||
MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSink);
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
 *                          TPoint implementation                             *
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
// constructor /////////////////////////////////////////////////////////////////
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
TPoint<FImpl>::TPoint(const std::string name)
 | 
			
		||||
: Module<PointPar>(name)
 | 
			
		||||
{}
 | 
			
		||||
 | 
			
		||||
// dependencies/products ///////////////////////////////////////////////////////
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
std::vector<std::string> TPoint<FImpl>::getInput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> in;
 | 
			
		||||
    
 | 
			
		||||
    return in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
std::vector<std::string> TPoint<FImpl>::getOutput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> out = {getName()};
 | 
			
		||||
    
 | 
			
		||||
    return out;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// setup ///////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
void TPoint<FImpl>::setup(void)
 | 
			
		||||
{
 | 
			
		||||
    unsigned int size;
 | 
			
		||||
    
 | 
			
		||||
    size = env().template lattice4dSize<LatticeComplex>();
 | 
			
		||||
    env().registerObject(getName(), size);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// execution ///////////////////////////////////////////////////////////////////
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
void TPoint<FImpl>::execute(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<Real> p = strToVec<Real>(par().mom);
 | 
			
		||||
    LatticeComplex    ph(env().getGrid()), coor(env().getGrid());
 | 
			
		||||
    Complex           i(0.0,1.0);
 | 
			
		||||
    
 | 
			
		||||
    LOG(Message) << "Setting up point sink function for momentum ["
 | 
			
		||||
                 << par().mom << "]" << std::endl;
 | 
			
		||||
    ph = zero;
 | 
			
		||||
    for(unsigned int mu = 0; mu < env().getNd(); mu++)
 | 
			
		||||
    {
 | 
			
		||||
        LatticeCoordinate(coor, mu);
 | 
			
		||||
        ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor;
 | 
			
		||||
    }
 | 
			
		||||
    ph = exp((Real)(2*M_PI)*i*ph);
 | 
			
		||||
    auto sink = [ph](const PropagatorField &field)
 | 
			
		||||
    {
 | 
			
		||||
        SlicedPropagator res;
 | 
			
		||||
        PropagatorField  tmp = ph*field;
 | 
			
		||||
        
 | 
			
		||||
        sliceSum(tmp, res, Tp);
 | 
			
		||||
        
 | 
			
		||||
        return res;
 | 
			
		||||
    };
 | 
			
		||||
    env().setObject(getName(), new SinkFn(sink));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_MSink_Point_hpp_
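The sink function built in TPoint::execute() above performs a momentum projection; restated as a formula (explanatory only),

    \Sigma(t) = \sum_{\vec x} e^{\,2\pi i \sum_\mu p_\mu x_\mu / L_\mu}\, \psi(\vec x, t),

where p is the vector parsed from the mom parameter, L_\mu are the global lattice extents (_fdimensions), and the time-slice sum is taken by sliceSum along Tp.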
 | 
			
		||||
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_RBPrecCG_hpp_
 | 
			
		||||
#define Hadrons_RBPrecCG_hpp_
 | 
			
		||||
#ifndef Hadrons_MSolver_RBPrecCG_hpp_
 | 
			
		||||
#define Hadrons_MSolver_RBPrecCG_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -53,7 +53,7 @@ template <typename FImpl>
 | 
			
		||||
class TRBPrecCG: public Module<RBPrecCGPar>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    TYPE_ALIASES(FImpl,);
 | 
			
		||||
    FGS_TYPE_ALIASES(FImpl,);
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TRBPrecCG(const std::string name);
 | 
			
		||||
@@ -129,4 +129,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_RBPrecCG_hpp_
 | 
			
		||||
#endif // Hadrons_MSolver_RBPrecCG_hpp_

extras/Hadrons/Modules/MSource/Laplacian.hpp (new file, 153 lines)
@@ -0,0 +1,153 @@
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
Source file: extras/Hadrons/Modules/MSource/Laplacian.hpp
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_MSource_Laplacian_hpp_
 | 
			
		||||
#define Hadrons_MSource_Laplacian_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
#include <Grid/Hadrons/ModuleFactory.hpp>
 | 
			
		||||
 | 
			
		||||
BEGIN_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 
 | 
			
		||||
 Laplacian smearing source
 | 
			
		||||
 -----------------------------
 | 
			
		||||
 
 | 
			
		||||
 * options:
 | 
			
		||||
 - source: name of source object to be smeared (string)
 | 
			
		||||
 - N: number of steps (integer)
 | 
			
		||||
 - alpha: smearing parameter (real)
 | 
			
		||||
 
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
 *                          Laplace smearing operator                         *
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
BEGIN_MODULE_NAMESPACE(MSource)
 | 
			
		||||
 | 
			
		||||
class LaplacianPar : Serializable
 | 
			
		||||
{
 | 
			
		||||
  public:
 | 
			
		||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(LaplacianPar,
 | 
			
		||||
                                    std::string, source,
 | 
			
		||||
                                    std::string, gauge,
 | 
			
		||||
                                    unsigned int, N,
 | 
			
		||||
                                    double, alpha);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
class TLaplacian : public Module<LaplacianPar>
 | 
			
		||||
{
 | 
			
		||||
  public:
 | 
			
		||||
    FERM_TYPE_ALIASES(FImpl, );
 | 
			
		||||
 | 
			
		||||
  public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TLaplacian(const std::string name);
 | 
			
		||||
    // destructor
 | 
			
		||||
    virtual ~TLaplacian(void) = default;
 | 
			
		||||
    // dependency relation
 | 
			
		||||
    virtual std::vector<std::string> getInput(void);
 | 
			
		||||
    virtual std::vector<std::string> getOutput(void);
 | 
			
		||||
    // setup
 | 
			
		||||
    virtual void setup(void);
 | 
			
		||||
    // execution
 | 
			
		||||
    virtual void execute(void);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
MODULE_REGISTER_NS(LaplaceSmearing, TLaplacian<FIMPL>, MSource);
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
 *                       TLaplacian template implementation                   *
 | 
			
		||||
 ******************************************************************************/
 | 
			
		||||
// constructor /////////////////////////////////////////////////////////////////
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
TLaplacian<FImpl>::TLaplacian(const std::string name)
 | 
			
		||||
    : Module<LaplacianPar>(name)
 | 
			
		||||
{
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// dependencies/products ///////////////////////////////////////////////////////
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
std::vector<std::string> TLaplacian<FImpl>::getInput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> in = {par().source, par().gauge};
 | 
			
		||||
 | 
			
		||||
    return in;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
std::vector<std::string> TLaplacian<FImpl>::getOutput(void)
 | 
			
		||||
{
 | 
			
		||||
    std::vector<std::string> out = {getName()};
 | 
			
		||||
 | 
			
		||||
    return out;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// setup ///////////////////////////////////////////////////////////////////////
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
void TLaplacian<FImpl>::setup(void)
 | 
			
		||||
{
 | 
			
		||||
    env().template registerLattice<PropagatorField>(getName());
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// execution ///////////////////////////////////////////////////////////////////
 | 
			
		||||
template <typename FImpl>
 | 
			
		||||
void TLaplacian<FImpl>::execute(void)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
    FermionField source(env().getGrid()), tmp(env().getGrid());
 | 
			
		||||
    PropagatorField &SmrSrc = *env().template createLattice<PropagatorField>(getName());
 | 
			
		||||
    PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source);
 | 
			
		||||
    auto &U      = *env().template getObject<LatticeGaugeField>(par().gauge);
 | 
			
		||||
    Laplacian<FImpl> LaplaceOperator(env().getGrid());
 | 
			
		||||
    LaplaceOperator.ImportGauge(U);
 | 
			
		||||
    double prefactor = par().alpha / (double)(par().N);
 | 
			
		||||
 | 
			
		||||
    for (unsigned int s = 0; s < Ns; ++s)
 | 
			
		||||
    {
 | 
			
		||||
        for (unsigned int c = 0; c < Nc; ++c)
 | 
			
		||||
        {
 | 
			
		||||
            PropToFerm(source, fullSrc, s, c);
 | 
			
		||||
            for (int smr = 0; smr < par().N; ++smr)
 | 
			
		||||
            {
 | 
			
		||||
                LaplaceOperator.M(source, tmp);
 | 
			
		||||
                source += prefactor * tmp;
 | 
			
		||||
            }
 | 
			
		||||
            FermToProp(SmrSrc, source, s, c);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_MSource_Laplacian_hpp_
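The smearing loop in TLaplacian::execute() applies, per spin-colour component, N iterations of source += (alpha/N) * M(source). With \Delta_U denoting whatever stencil Laplacian<FImpl>::M applies (the gauge-covariant Laplacian built from the imported field U), this is

    S' = (1 + (alpha/N) \Delta_U)^N S  ->  e^{alpha \Delta_U} S  as N grows,

i.e. the usual exponentiated-Laplacian ("Gaussian"-type) source smearing.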
 | 
			
		||||
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_Point_hpp_
 | 
			
		||||
#define Hadrons_Point_hpp_
 | 
			
		||||
#ifndef Hadrons_MSource_Point_hpp_
 | 
			
		||||
#define Hadrons_MSource_Point_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -63,7 +63,7 @@ template <typename FImpl>
 | 
			
		||||
class TPoint: public Module<PointPar>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    TYPE_ALIASES(FImpl,);
 | 
			
		||||
    FERM_TYPE_ALIASES(FImpl,);
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TPoint(const std::string name);
 | 
			
		||||
@@ -78,7 +78,8 @@ public:
 | 
			
		||||
    virtual void execute(void);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSource);
 | 
			
		||||
MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSource);
 | 
			
		||||
MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSource);
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
 *                       TPoint template implementation                       *
 | 
			
		||||
@@ -132,4 +133,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_Point_hpp_
 | 
			
		||||
#endif // Hadrons_MSource_Point_hpp_
 | 
			
		||||
 
 | 
			
		||||
@@ -28,8 +28,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_SeqGamma_hpp_
 | 
			
		||||
#define Hadrons_SeqGamma_hpp_
 | 
			
		||||
#ifndef Hadrons_MSource_SeqGamma_hpp_
 | 
			
		||||
#define Hadrons_MSource_SeqGamma_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -72,7 +72,7 @@ template <typename FImpl>
 | 
			
		||||
class TSeqGamma: public Module<SeqGammaPar>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    TYPE_ALIASES(FImpl,);
 | 
			
		||||
    FGS_TYPE_ALIASES(FImpl,);
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TSeqGamma(const std::string name);
 | 
			
		||||
@@ -161,4 +161,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_SeqGamma_hpp_
 | 
			
		||||
#endif // Hadrons_MSource_SeqGamma_hpp_
 | 
			
		||||
 
 | 
			
		||||
@@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_WallSource_hpp_
 | 
			
		||||
#define Hadrons_WallSource_hpp_
 | 
			
		||||
#ifndef Hadrons_MSource_WallSource_hpp_
 | 
			
		||||
#define Hadrons_MSource_WallSource_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -64,7 +64,7 @@ template <typename FImpl>
 | 
			
		||||
class TWall: public Module<WallPar>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    TYPE_ALIASES(FImpl,);
 | 
			
		||||
    FERM_TYPE_ALIASES(FImpl,);
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TWall(const std::string name);
 | 
			
		||||
@@ -144,4 +144,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_WallSource_hpp_
 | 
			
		||||
#endif // Hadrons_MSource_WallSource_hpp_
 | 
			
		||||
 
 | 
			
		||||
@@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef Hadrons_Z2_hpp_
 | 
			
		||||
#define Hadrons_Z2_hpp_
 | 
			
		||||
#ifndef Hadrons_MSource_Z2_hpp_
 | 
			
		||||
#define Hadrons_MSource_Z2_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -67,7 +67,7 @@ template <typename FImpl>
 | 
			
		||||
class TZ2: public Module<Z2Par>
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    TYPE_ALIASES(FImpl,);
 | 
			
		||||
    FERM_TYPE_ALIASES(FImpl,);
 | 
			
		||||
public:
 | 
			
		||||
    // constructor
 | 
			
		||||
    TZ2(const std::string name);
 | 
			
		||||
@@ -82,7 +82,8 @@ public:
 | 
			
		||||
    virtual void execute(void);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
MODULE_REGISTER_NS(Z2, TZ2<FIMPL>, MSource);
 | 
			
		||||
MODULE_REGISTER_NS(Z2,       TZ2<FIMPL>,        MSource);
 | 
			
		||||
MODULE_REGISTER_NS(ScalarZ2, TZ2<ScalarImplCR>, MSource);
 | 
			
		||||
 | 
			
		||||
/******************************************************************************
 | 
			
		||||
 *                       TZ2 template implementation                          *
 | 
			
		||||
@@ -148,4 +149,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons_Z2_hpp_
 | 
			
		||||
#endif // Hadrons_MSource_Z2_hpp_
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,5 @@
 | 
			
		||||
#ifndef Hadrons____FILEBASENAME____hpp_
 | 
			
		||||
#define Hadrons____FILEBASENAME____hpp_
 | 
			
		||||
#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 | 
			
		||||
#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -41,4 +41,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons____FILEBASENAME____hpp_
 | 
			
		||||
#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,5 @@
 | 
			
		||||
#ifndef Hadrons____FILEBASENAME____hpp_
 | 
			
		||||
#define Hadrons____FILEBASENAME____hpp_
 | 
			
		||||
#ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 | 
			
		||||
#define Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Hadrons/Global.hpp>
 | 
			
		||||
#include <Grid/Hadrons/Module.hpp>
 | 
			
		||||
@@ -82,4 +82,4 @@ END_MODULE_NAMESPACE
 | 
			
		||||
 | 
			
		||||
END_HADRONS_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // Hadrons____FILEBASENAME____hpp_
 | 
			
		||||
#endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_
 | 
			
		||||
 
 | 
			
		||||
@@ -1,30 +1,39 @@
 | 
			
		||||
modules_cc =\
 | 
			
		||||
  Modules/MContraction/WeakHamiltonianEye.cc \
 | 
			
		||||
  Modules/MContraction/WeakHamiltonianNonEye.cc \
 | 
			
		||||
  Modules/MContraction/WeakNeutral4ptDisc.cc \
 | 
			
		||||
  Modules/MGauge/Load.cc \
 | 
			
		||||
  Modules/MContraction/WeakHamiltonianEye.cc \
 | 
			
		||||
  Modules/MScalar/FreeProp.cc \
 | 
			
		||||
  Modules/MScalar/ChargedProp.cc \
 | 
			
		||||
  Modules/MGauge/Unit.cc \
 | 
			
		||||
  Modules/MGauge/Random.cc \
 | 
			
		||||
  Modules/MGauge/Unit.cc
 | 
			
		||||
  Modules/MGauge/StochEm.cc \
 | 
			
		||||
  Modules/MGauge/Load.cc
 | 
			
		||||
 | 
			
		||||
modules_hpp =\
 | 
			
		||||
  Modules/MAction/DWF.hpp \
 | 
			
		||||
  Modules/MAction/Wilson.hpp \
 | 
			
		||||
  Modules/MContraction/Baryon.hpp \
 | 
			
		||||
  Modules/MContraction/DiscLoop.hpp \
 | 
			
		||||
  Modules/MContraction/Gamma3pt.hpp \
 | 
			
		||||
  Modules/MContraction/Meson.hpp \
 | 
			
		||||
  Modules/MLoop/NoiseLoop.hpp \
 | 
			
		||||
  Modules/MFermion/GaugeProp.hpp \
 | 
			
		||||
  Modules/MContraction/WeakHamiltonian.hpp \
 | 
			
		||||
  Modules/MContraction/Meson.hpp \
 | 
			
		||||
  Modules/MContraction/DiscLoop.hpp \
 | 
			
		||||
  Modules/MContraction/WeakHamiltonianEye.hpp \
 | 
			
		||||
  Modules/MContraction/Baryon.hpp \
 | 
			
		||||
  Modules/MContraction/WeakHamiltonianNonEye.hpp \
 | 
			
		||||
  Modules/MContraction/WeakNeutral4ptDisc.hpp \
 | 
			
		||||
  Modules/MGauge/Load.hpp \
 | 
			
		||||
  Modules/MGauge/Random.hpp \
 | 
			
		||||
  Modules/MGauge/Unit.hpp \
 | 
			
		||||
  Modules/MLoop/NoiseLoop.hpp \
 | 
			
		||||
  Modules/MSolver/RBPrecCG.hpp \
 | 
			
		||||
  Modules/MSource/Point.hpp \
 | 
			
		||||
  Modules/MSource/SeqGamma.hpp \
 | 
			
		||||
  Modules/MSource/Wall.hpp \
 | 
			
		||||
  Modules/MContraction/Gamma3pt.hpp \
 | 
			
		||||
  Modules/MSource/Z2.hpp \
 | 
			
		||||
  Modules/Quark.hpp
 | 
			
		||||
  Modules/MSource/SeqGamma.hpp \
 | 
			
		||||
  Modules/MSource/Point.hpp \
 | 
			
		||||
  Modules/MSource/Wall.hpp \
 | 
			
		||||
  Modules/MSource/Laplacian.hpp \
 | 
			
		||||
  Modules/MSolver/RBPrecCG.hpp \
 | 
			
		||||
  Modules/MScalar/ChargedProp.hpp \
 | 
			
		||||
  Modules/MScalar/FreeProp.hpp \
 | 
			
		||||
  Modules/MScalar/Scalar.hpp \
 | 
			
		||||
  Modules/MAction/DWF.hpp \
 | 
			
		||||
  Modules/MAction/Wilson.hpp \
 | 
			
		||||
  Modules/MGauge/StochEm.hpp \
 | 
			
		||||
  Modules/MGauge/Unit.hpp \
 | 
			
		||||
  Modules/MGauge/Random.hpp \
 | 
			
		||||
  Modules/MGauge/Load.hpp \
 | 
			
		||||
  Modules/MSink/Point.hpp
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
extras/qed-fvol/Global.cc (new file, 11 lines)
@@ -0,0 +1,11 @@
 | 
			
		||||
#include <qed-fvol/Global.hpp>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace QCD;
 | 
			
		||||
using namespace QedFVol;
 | 
			
		||||
 | 
			
		||||
QedFVolLogger QedFVol::QedFVolLogError(1,"Error");
 | 
			
		||||
QedFVolLogger QedFVol::QedFVolLogWarning(1,"Warning");
 | 
			
		||||
QedFVolLogger QedFVol::QedFVolLogMessage(1,"Message");
 | 
			
		||||
QedFVolLogger QedFVol::QedFVolLogIterative(1,"Iterative");
 | 
			
		||||
QedFVolLogger QedFVol::QedFVolLogDebug(1,"Debug");
 | 
			
		||||
							
								
								
									
extras/qed-fvol/Global.hpp (new file, 42 lines)
@@ -0,0 +1,42 @@
 | 
			
		||||
#ifndef QedFVol_Global_hpp_
 | 
			
		||||
#define QedFVol_Global_hpp_
 | 
			
		||||
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
 | 
			
		||||
#define BEGIN_QEDFVOL_NAMESPACE \
 | 
			
		||||
namespace Grid {\
 | 
			
		||||
using namespace QCD;\
 | 
			
		||||
namespace QedFVol {\
 | 
			
		||||
using Grid::operator<<;
 | 
			
		||||
#define END_QEDFVOL_NAMESPACE }}
 | 
			
		||||
 | 
			
		||||
/* the 'using Grid::operator<<;' statement prevents a very nasty compilation
 | 
			
		||||
 * error with GCC (clang compiles fine without it).
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
BEGIN_QEDFVOL_NAMESPACE
 | 
			
		||||
 | 
			
		||||
class QedFVolLogger: public Logger
 | 
			
		||||
{
 | 
			
		||||
public:
 | 
			
		||||
    QedFVolLogger(int on, std::string nm): Logger("QedFVol", on, nm,
 | 
			
		||||
                                                  GridLogColours, "BLACK"){};
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#define LOG(channel) std::cout << QedFVolLog##channel
 | 
			
		||||
#define QEDFVOL_ERROR(msg)\
 | 
			
		||||
LOG(Error) << msg << " (" << __FUNCTION__ << " at " << __FILE__ << ":"\
 | 
			
		||||
           << __LINE__ << ")" << std::endl;\
 | 
			
		||||
abort();
 | 
			
		||||
 | 
			
		||||
#define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl;
 | 
			
		||||
 | 
			
		||||
extern QedFVolLogger QedFVolLogError;
 | 
			
		||||
extern QedFVolLogger QedFVolLogWarning;
 | 
			
		||||
extern QedFVolLogger QedFVolLogMessage;
 | 
			
		||||
extern QedFVolLogger QedFVolLogIterative;
 | 
			
		||||
extern QedFVolLogger QedFVolLogDebug;
 | 
			
		||||
 | 
			
		||||
END_QEDFVOL_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // QedFVol_Global_hpp_
 | 
			
		||||
							
								
								
									
extras/qed-fvol/Makefile.am (new file, 9 lines)
@@ -0,0 +1,9 @@
 | 
			
		||||
AM_CXXFLAGS += -I$(top_srcdir)/extras
 | 
			
		||||
 | 
			
		||||
bin_PROGRAMS = qed-fvol
 | 
			
		||||
 | 
			
		||||
qed_fvol_SOURCES =   \
 | 
			
		||||
    qed-fvol.cc      \
 | 
			
		||||
    Global.cc
 | 
			
		||||
 | 
			
		||||
qed_fvol_LDADD   = -lGrid
 | 
			
		||||
							
								
								
									
extras/qed-fvol/WilsonLoops.h (new file, 265 lines)
@@ -0,0 +1,265 @@
 | 
			
		||||
#ifndef QEDFVOL_WILSONLOOPS_H
 | 
			
		||||
#define QEDFVOL_WILSONLOOPS_H
 | 
			
		||||
 | 
			
		||||
#include <Global.hpp>
 | 
			
		||||
 | 
			
		||||
BEGIN_QEDFVOL_NAMESPACE
 | 
			
		||||
 | 
			
		||||
template <class Gimpl> class NewWilsonLoops : public Gimpl {
 | 
			
		||||
public:
 | 
			
		||||
  INHERIT_GIMPL_TYPES(Gimpl);
 | 
			
		||||
 | 
			
		||||
  typedef typename Gimpl::GaugeLinkField GaugeMat;
 | 
			
		||||
  typedef typename Gimpl::GaugeField GaugeLorentz;
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // directed plaquette oriented in mu,nu plane
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U,
 | 
			
		||||
                           const int mu, const int nu) {
 | 
			
		||||
    // Annoyingly, must use either scope resolution to find dependent base
 | 
			
		||||
    // class,
 | 
			
		||||
    // or this-> ; there is no "this" in a static method. This forces explicit
 | 
			
		||||
    // Gimpl scope
 | 
			
		||||
    // resolution throughout the usage in this file, and rather defeats the
 | 
			
		||||
    // purpose of deriving
 | 
			
		||||
    // from Gimpl.
 | 
			
		||||
    plaq = Gimpl::CovShiftBackward(
 | 
			
		||||
        U[mu], mu, Gimpl::CovShiftBackward(
 | 
			
		||||
                       U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu])));
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // trace of directed plaquette oriented in mu,nu plane
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static void traceDirPlaquette(LatticeComplex &plaq,
 | 
			
		||||
                                const std::vector<GaugeMat> &U, const int mu,
 | 
			
		||||
                                const int nu) {
 | 
			
		||||
    GaugeMat sp(U[0]._grid);
 | 
			
		||||
    dirPlaquette(sp, U, mu, nu);
 | 
			
		||||
    plaq = trace(sp);
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // sum over all planes of plaquette
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static void sitePlaquette(LatticeComplex &Plaq,
 | 
			
		||||
                            const std::vector<GaugeMat> &U) {
 | 
			
		||||
    LatticeComplex sitePlaq(U[0]._grid);
 | 
			
		||||
    Plaq = zero;
 | 
			
		||||
    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
 | 
			
		||||
      for (int nu = 0; nu < mu; nu++) {
 | 
			
		||||
        traceDirPlaquette(sitePlaq, U, mu, nu);
 | 
			
		||||
        Plaq = Plaq + sitePlaq;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // sum over all x,y,z,t and over all planes of plaquette
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static Real sumPlaquette(const GaugeLorentz &Umu) {
 | 
			
		||||
    std::vector<GaugeMat> U(4, Umu._grid);
 | 
			
		||||
 | 
			
		||||
    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
 | 
			
		||||
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LatticeComplex Plaq(Umu._grid);
 | 
			
		||||
 | 
			
		||||
    sitePlaquette(Plaq, U);
 | 
			
		||||
 | 
			
		||||
    TComplex Tp = sum(Plaq);
 | 
			
		||||
    Complex p = TensorRemove(Tp);
 | 
			
		||||
    return p.real();
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // average over all x,y,z,t and over all planes of plaquette
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static Real avgPlaquette(const GaugeLorentz &Umu) {
 | 
			
		||||
    int ndim = Umu._grid->_ndimension;
 | 
			
		||||
    Real sumplaq = sumPlaquette(Umu);
 | 
			
		||||
    Real vol = Umu._grid->gSites();
 | 
			
		||||
    Real faces = (1.0 * ndim * (ndim - 1)) / 2.0;
 | 
			
		||||
    return sumplaq / vol / faces / Nc; // Nc dependent... FIXME
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // Wilson loop of size (R1, R2), oriented in mu,nu plane
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U,
 | 
			
		||||
                           const int Rmu, const int Rnu,
 | 
			
		||||
                           const int mu, const int nu) {
 | 
			
		||||
    wl = U[nu];
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < Rnu-1; i++){
 | 
			
		||||
      wl = Gimpl::CovShiftForward(U[nu], nu, wl);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < Rmu; i++){
 | 
			
		||||
      wl = Gimpl::CovShiftForward(U[mu], mu, wl);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < Rnu; i++){
 | 
			
		||||
      wl = Gimpl::CovShiftBackward(U[nu], nu, wl);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < Rmu; i++){
 | 
			
		||||
      wl = Gimpl::CovShiftBackward(U[mu], mu, wl);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // trace of Wilson Loop oriented in mu,nu plane
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static void traceWilsonLoop(LatticeComplex &wl,
 | 
			
		||||
                                const std::vector<GaugeMat> &U,
 | 
			
		||||
                                const int Rmu, const int Rnu,
 | 
			
		||||
                                const int mu, const int nu) {
 | 
			
		||||
    GaugeMat sp(U[0]._grid);
 | 
			
		||||
    wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
 | 
			
		||||
    wl = trace(sp);
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // sum over all planes of Wilson loop
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static void siteWilsonLoop(LatticeComplex &Wl,
 | 
			
		||||
                            const std::vector<GaugeMat> &U,
 | 
			
		||||
                            const int R1, const int R2) {
 | 
			
		||||
    LatticeComplex siteWl(U[0]._grid);
 | 
			
		||||
    Wl = zero;
 | 
			
		||||
    for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
 | 
			
		||||
      for (int nu = 0; nu < mu; nu++) {
 | 
			
		||||
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
 | 
			
		||||
        Wl = Wl + siteWl;
 | 
			
		||||
        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
 | 
			
		||||
        Wl = Wl + siteWl;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // sum over planes of Wilson loop with length R1
 | 
			
		||||
  // in the time direction
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
 | 
			
		||||
                            const std::vector<GaugeMat> &U,
 | 
			
		||||
                            const int R1, const int R2) {
 | 
			
		||||
    LatticeComplex siteWl(U[0]._grid);
 | 
			
		||||
 | 
			
		||||
    int ndim = U[0]._grid->_ndimension;
 | 
			
		||||
 | 
			
		||||
    Wl = zero;
 | 
			
		||||
    for (int nu = 0; nu < ndim - 1; nu++) {
 | 
			
		||||
      traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
 | 
			
		||||
      Wl = Wl + siteWl;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // sum Wilson loop over all planes orthogonal to the time direction
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static void siteSpatialWilsonLoop(LatticeComplex &Wl,
 | 
			
		||||
                            const std::vector<GaugeMat> &U,
 | 
			
		||||
                            const int R1, const int R2) {
 | 
			
		||||
    LatticeComplex siteWl(U[0]._grid);
 | 
			
		||||
 | 
			
		||||
    Wl = zero;
 | 
			
		||||
    for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) {
 | 
			
		||||
      for (int nu = 0; nu < mu; nu++) {
 | 
			
		||||
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
 | 
			
		||||
        Wl = Wl + siteWl;
 | 
			
		||||
        traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
 | 
			
		||||
        Wl = Wl + siteWl;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // sum over all x,y,z,t and over all planes of Wilson loop
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static Real sumWilsonLoop(const GaugeLorentz &Umu,
 | 
			
		||||
                            const int R1, const int R2) {
 | 
			
		||||
    std::vector<GaugeMat> U(4, Umu._grid);
 | 
			
		||||
 | 
			
		||||
    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
 | 
			
		||||
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LatticeComplex Wl(Umu._grid);
 | 
			
		||||
 | 
			
		||||
    siteWilsonLoop(Wl, U, R1, R2);
 | 
			
		||||
 | 
			
		||||
    TComplex Tp = sum(Wl);
 | 
			
		||||
    Complex p = TensorRemove(Tp);
 | 
			
		||||
    return p.real();
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // sum over all x,y,z,t and over all planes of timelike Wilson loop
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
 | 
			
		||||
                            const int R1, const int R2) {
 | 
			
		||||
    std::vector<GaugeMat> U(4, Umu._grid);
 | 
			
		||||
 | 
			
		||||
    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
 | 
			
		||||
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LatticeComplex Wl(Umu._grid);
 | 
			
		||||
 | 
			
		||||
    siteTimelikeWilsonLoop(Wl, U, R1, R2);
 | 
			
		||||
 | 
			
		||||
    TComplex Tp = sum(Wl);
 | 
			
		||||
    Complex p = TensorRemove(Tp);
 | 
			
		||||
    return p.real();
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // sum over all x,y,z,t and over all planes of spatial Wilson loop
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
 | 
			
		||||
                            const int R1, const int R2) {
 | 
			
		||||
    std::vector<GaugeMat> U(4, Umu._grid);
 | 
			
		||||
 | 
			
		||||
    for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
 | 
			
		||||
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LatticeComplex Wl(Umu._grid);
 | 
			
		||||
 | 
			
		||||
    siteSpatialWilsonLoop(Wl, U, R1, R2);
 | 
			
		||||
 | 
			
		||||
    TComplex Tp = sum(Wl);
 | 
			
		||||
    Complex p = TensorRemove(Tp);
 | 
			
		||||
    return p.real();
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // average over all x,y,z,t and over all planes of Wilson loop
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static Real avgWilsonLoop(const GaugeLorentz &Umu,
 | 
			
		||||
                            const int R1, const int R2) {
 | 
			
		||||
    int ndim = Umu._grid->_ndimension;
 | 
			
		||||
    Real sumWl = sumWilsonLoop(Umu, R1, R2);
 | 
			
		||||
    Real vol = Umu._grid->gSites();
 | 
			
		||||
    Real faces = 1.0 * ndim * (ndim - 1);
 | 
			
		||||
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // average over all x,y,z,t and over all planes of timelike Wilson loop
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
 | 
			
		||||
                            const int R1, const int R2) {
 | 
			
		||||
    int ndim = Umu._grid->_ndimension;
 | 
			
		||||
    Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
 | 
			
		||||
    Real vol = Umu._grid->gSites();
 | 
			
		||||
    Real faces = 1.0 * (ndim - 1);
 | 
			
		||||
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
 | 
			
		||||
  }
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  // average over all x,y,z,t and over all planes of spatial Wilson loop
 | 
			
		||||
  //////////////////////////////////////////////////
 | 
			
		||||
  static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
 | 
			
		||||
                            const int R1, const int R2) {
 | 
			
		||||
    int ndim = Umu._grid->_ndimension;
 | 
			
		||||
    Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
 | 
			
		||||
    Real vol = Umu._grid->gSites();
 | 
			
		||||
    Real faces = 1.0 * (ndim - 1) * (ndim - 2);
 | 
			
		||||
    return sumWl / vol / faces / Nc; // Nc dependent... FIXME
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
END_QEDFVOL_NAMESPACE
 | 
			
		||||
 | 
			
		||||
#endif // QEDFVOL_WILSONLOOPS_H
 | 
			
		||||
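For orientation, the quantities computed by the new WilsonLoops.h helpers are, schematically (standard lattice conventions; the exact link ordering follows Gimpl::CovShiftForward/CovShiftBackward, and the 1/Nc factor is the normalisation flagged as a FIXME in the code):

    P_{\mu\nu}(x) \;=\; U_\mu(x)\, U_\nu(x+\hat\mu)\, U_\mu^\dagger(x+\hat\nu)\, U_\nu^\dagger(x)   \qquad \text{(dirPlaquette)}

    W_{\mu\nu}(x; R_1, R_2) \;=\; \text{ordered product of links around the } R_1 \times R_2 \text{ rectangle in the } (\mu,\nu) \text{ plane} \qquad \text{(wilsonLoop)}

    \text{avg\{Plaquette, WilsonLoop\}} \;=\; \frac{1}{N_c\, V\, N_{\text{planes}}} \sum_x \sum_{\text{planes}} \mathrm{Re}\,\mathrm{tr}\,(\cdots)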
							
								
								
									
extras/qed-fvol/qed-fvol.cc (new file, 88 lines)
@@ -0,0 +1,88 @@
 | 
			
		||||
#include <Global.hpp>
 | 
			
		||||
#include <WilsonLoops.h>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
using namespace QCD;
 | 
			
		||||
using namespace QedFVol;
 | 
			
		||||
 | 
			
		||||
typedef PeriodicGaugeImpl<QedGimplR>    QedPeriodicGimplR;
 | 
			
		||||
typedef PhotonR::GaugeField             EmField;
 | 
			
		||||
typedef PhotonR::GaugeLinkField         EmComp;
 | 
			
		||||
 | 
			
		||||
const int NCONFIGS = 10;
 | 
			
		||||
const int NWILSON = 10;
 | 
			
		||||
 | 
			
		||||
int main(int argc, char *argv[])
 | 
			
		||||
{
 | 
			
		||||
    // parse command line
 | 
			
		||||
    std::string parameterFileName;
 | 
			
		||||
    
 | 
			
		||||
    if (argc < 2)
 | 
			
		||||
    {
 | 
			
		||||
        std::cerr << "usage: " << argv[0] << " <parameter file> [Grid options]";
 | 
			
		||||
        std::cerr << std::endl;
 | 
			
		||||
        std::exit(EXIT_FAILURE);
 | 
			
		||||
    }
 | 
			
		||||
    parameterFileName = argv[1];
 | 
			
		||||
    
 | 
			
		||||
    // initialization
 | 
			
		||||
    Grid_init(&argc, &argv);
 | 
			
		||||
    QedFVolLogError.Active(GridLogError.isActive());
 | 
			
		||||
    QedFVolLogWarning.Active(GridLogWarning.isActive());
 | 
			
		||||
    QedFVolLogMessage.Active(GridLogMessage.isActive());
 | 
			
		||||
    QedFVolLogIterative.Active(GridLogIterative.isActive());
 | 
			
		||||
    QedFVolLogDebug.Active(GridLogDebug.isActive());
 | 
			
		||||
    LOG(Message) << "Grid initialized" << std::endl;
 | 
			
		||||
    
 | 
			
		||||
    // QED stuff
 | 
			
		||||
    std::vector<int> latt_size   = GridDefaultLatt();
 | 
			
		||||
    std::vector<int> simd_layout = GridDefaultSimd(4, vComplex::Nsimd());
 | 
			
		||||
    std::vector<int> mpi_layout  = GridDefaultMpi();
 | 
			
		||||
    GridCartesian    grid(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
    GridParallelRNG  pRNG(&grid);
 | 
			
		||||
    PhotonR          photon(PhotonR::Gauge::feynman,
 | 
			
		||||
                            PhotonR::ZmScheme::qedL);
 | 
			
		||||
    EmField          a(&grid);
 | 
			
		||||
    EmField          expA(&grid);
 | 
			
		||||
 | 
			
		||||
    Complex imag_unit(0, 1);
 | 
			
		||||
 | 
			
		||||
    Real wlA;
 | 
			
		||||
    std::vector<Real> logWlAvg(NWILSON, 0.0), logWlTime(NWILSON, 0.0), logWlSpace(NWILSON, 0.0);
 | 
			
		||||
 | 
			
		||||
    pRNG.SeedRandomDevice();
 | 
			
		||||
 | 
			
		||||
    LOG(Message) << "Wilson loop calculation beginning" << std::endl;
 | 
			
		||||
    for(int ic = 0; ic < NCONFIGS; ic++){
 | 
			
		||||
        LOG(Message) << "Configuration " << ic <<std::endl;
 | 
			
		||||
        photon.StochasticField(a, pRNG);
 | 
			
		||||
 | 
			
		||||
        // Exponentiate photon field
 | 
			
		||||
        expA = exp(imag_unit*a);
 | 
			
		||||
 | 
			
		||||
        // Calculate Wilson loops
 | 
			
		||||
        for(int iw=1; iw<=NWILSON; iw++){
 | 
			
		||||
            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, iw, iw) * 3;
 | 
			
		||||
            logWlAvg[iw-1] -= 2*log(wlA);
 | 
			
		||||
            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, iw, iw) * 3;
 | 
			
		||||
            logWlTime[iw-1] -= 2*log(wlA);
 | 
			
		||||
            wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, iw, iw) * 3;
 | 
			
		||||
            logWlSpace[iw-1] -= 2*log(wlA);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    LOG(Message) << "Wilson loop calculation completed" << std::endl;
 | 
			
		||||
    
 | 
			
		||||
    // Calculate Wilson loops
 | 
			
		||||
    for(int iw=1; iw<=10; iw++){
 | 
			
		||||
        LOG(Message) << iw << 'x' << iw << " Wilson loop" << std::endl;
 | 
			
		||||
        LOG(Message) << "-2log(W) average: " << logWlAvg[iw-1]/NCONFIGS << std::endl;
 | 
			
		||||
        LOG(Message) << "-2log(W) timelike: " << logWlTime[iw-1]/NCONFIGS << std::endl;
 | 
			
		||||
        LOG(Message) << "-2log(W) spatial: " << logWlSpace[iw-1]/NCONFIGS << std::endl;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // epilogue
 | 
			
		||||
    LOG(Message) << "Grid is finalizing now" << std::endl;
 | 
			
		||||
    Grid_finalize();
 | 
			
		||||
    
 | 
			
		||||
    return EXIT_SUCCESS;
 | 
			
		||||
}
 | 
			
		||||
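For each loop size R, the numbers printed at the end of the run are therefore (the factor of 3 applied to avgWilsonLoop presumably undoes the Nc = 3 normalisation flagged as a FIXME in WilsonLoops.h, the photon field being U(1)):

    \langle -2\log W(R,R)\rangle \;=\; -\frac{2}{N_{\text{configs}}} \sum_{c=1}^{N_{\text{configs}}} \log\!\big(3\,\overline{W}_c(R,R)\big),

where \overline{W}_c is the volume- and plane-averaged loop on stochastic photon configuration c; the timelike and spatial entries are defined analogously.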
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
 | 
			
		||||
  extra_sources+=communicator/Communicator_base.cc
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
if BUILD_COMMS_MPI3L
 | 
			
		||||
  extra_sources+=communicator/Communicator_mpi3_leader.cc
 | 
			
		||||
if BUILD_COMMS_MPIT
 | 
			
		||||
  extra_sources+=communicator/Communicator_mpit.cc
 | 
			
		||||
  extra_sources+=communicator/Communicator_base.cc
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/Algorithms.h
 | 
			
		||||
 | 
			
		||||
@@ -37,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <Grid/algorithms/approx/Chebyshev.h>
 | 
			
		||||
#include <Grid/algorithms/approx/Remez.h>
 | 
			
		||||
#include <Grid/algorithms/approx/MultiShiftFunction.h>
 | 
			
		||||
#include <Grid/algorithms/approx/Forecast.h>
 | 
			
		||||
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateResidual.h>
 | 
			
		||||
@@ -44,30 +45,16 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <Grid/algorithms/iterative/SchurRedBlack.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 | 
			
		||||
 | 
			
		||||
// Lanczos support
 | 
			
		||||
//#include <Grid/algorithms/iterative/MatrixUtils.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
 | 
			
		||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
 | 
			
		||||
#include <Grid/algorithms/CoarsenedMatrix.h>
 | 
			
		||||
#include <Grid/algorithms/FFT.h>
 | 
			
		||||
 | 
			
		||||
// Eigen/lanczos
 | 
			
		||||
// EigCg
 | 
			
		||||
// MCR
 | 
			
		||||
// Pcg
 | 
			
		||||
// Multishift CG
 | 
			
		||||
// Hdcg
 | 
			
		||||
// GCR
 | 
			
		||||
// etc..
 | 
			
		||||
 | 
			
		||||
// integrator/Leapfrog
 | 
			
		||||
// integrator/Omelyan
 | 
			
		||||
// integrator/ForceGradient
 | 
			
		||||
 | 
			
		||||
// montecarlo/hmc
 | 
			
		||||
// montecarlo/rhmc
 | 
			
		||||
// montecarlo/metropolis
 | 
			
		||||
// etc...
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -230,6 +230,7 @@ namespace Grid {
 | 
			
		||||
      // Barrel shift and collect global pencil
 | 
			
		||||
      std::vector<int> lcoor(Nd), gcoor(Nd);
 | 
			
		||||
      result = source;
 | 
			
		||||
      int pc = processor_coor[dim];
 | 
			
		||||
      for(int p=0;p<processors[dim];p++) {
 | 
			
		||||
        PARALLEL_REGION
 | 
			
		||||
        {
 | 
			
		||||
@@ -240,7 +241,8 @@ namespace Grid {
 | 
			
		||||
          for(int idx=0;idx<sgrid->lSites();idx++) {
 | 
			
		||||
            sgrid->LocalIndexToLocalCoor(idx,cbuf);
 | 
			
		||||
            peekLocalSite(s,result,cbuf);
 | 
			
		||||
            cbuf[dim]+=p*L;
 | 
			
		||||
	    cbuf[dim]+=((pc+p) % processors[dim])*L;
 | 
			
		||||
	    //            cbuf[dim]+=p*L;
 | 
			
		||||
            pokeLocalSite(s,pgbuf,cbuf);
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
@@ -278,7 +280,6 @@ namespace Grid {
 | 
			
		||||
      flops+= flops_call*NN;
 | 
			
		||||
      
 | 
			
		||||
      // writing out result
 | 
			
		||||
      int pc = processor_coor[dim];
 | 
			
		||||
      PARALLEL_REGION
 | 
			
		||||
      {
 | 
			
		||||
        std::vector<int> clbuf(Nd), cgbuf(Nd);
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
lib/algorithms/approx/Forecast.h (new file, 152 lines)
@@ -0,0 +1,152 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/algorithms/approx/Forecast.h
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#ifndef INCLUDED_FORECAST_H
 | 
			
		||||
#define INCLUDED_FORECAST_H
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  // Abstract base class.
 | 
			
		||||
  // Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
 | 
			
		||||
  // and returns a forecasted solution to the system D*psi = phi (psi).
 | 
			
		||||
  template<class Matrix, class Field>
 | 
			
		||||
  class Forecast
 | 
			
		||||
  {
 | 
			
		||||
    public:
 | 
			
		||||
      virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  // Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
 | 
			
		||||
  // used to forecast solutions across poles of the EOFA heatbath.
 | 
			
		||||
  //
 | 
			
		||||
  // Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
 | 
			
		||||
  template<class Matrix, class Field>
 | 
			
		||||
  class ChronoForecast : public Forecast<Matrix,Field>
 | 
			
		||||
  {
 | 
			
		||||
    public:
 | 
			
		||||
      Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
 | 
			
		||||
      {
 | 
			
		||||
        int degree = prev_solns.size();
 | 
			
		||||
        Field chi(phi); // forecasted solution
 | 
			
		||||
 | 
			
		||||
        // Trivial cases
 | 
			
		||||
        if(degree == 0){ chi = zero; return chi; }
 | 
			
		||||
        else if(degree == 1){ return prev_solns[0]; }
 | 
			
		||||
 | 
			
		||||
        RealD dot;
 | 
			
		||||
        ComplexD xp;
 | 
			
		||||
        Field r(phi); // residual
 | 
			
		||||
        Field Mv(phi);
 | 
			
		||||
        std::vector<Field> v(prev_solns); // orthonormalized previous solutions
 | 
			
		||||
        std::vector<Field> MdagMv(degree,phi);
 | 
			
		||||
 | 
			
		||||
        // Array to hold the matrix elements
 | 
			
		||||
        std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
 | 
			
		||||
 | 
			
		||||
        // Solution and source vectors
 | 
			
		||||
        std::vector<ComplexD> a(degree);
 | 
			
		||||
        std::vector<ComplexD> b(degree);
 | 
			
		||||
 | 
			
		||||
        // Orthonormalize the vector basis
 | 
			
		||||
        for(int i=0; i<degree; i++){
 | 
			
		||||
          v[i] *= 1.0/std::sqrt(norm2(v[i]));
 | 
			
		||||
          for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Perform sparse matrix multiplication and construct rhs
 | 
			
		||||
        for(int i=0; i<degree; i++){
 | 
			
		||||
          b[i] = innerProduct(v[i],phi);
 | 
			
		||||
          Mat.M(v[i],Mv);
 | 
			
		||||
          Mat.Mdag(Mv,MdagMv[i]);
 | 
			
		||||
          G[i][i] = innerProduct(v[i],MdagMv[i]);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Construct the matrix
 | 
			
		||||
        for(int j=0; j<degree; j++){
 | 
			
		||||
        for(int k=j+1; k<degree; k++){
 | 
			
		||||
          G[j][k] = innerProduct(v[j],MdagMv[k]);
 | 
			
		||||
          G[k][j] = std::conj(G[j][k]);
 | 
			
		||||
        }}
 | 
			
		||||
 | 
			
		||||
        // Gauss-Jordan elimination with partial pivoting
 | 
			
		||||
        for(int i=0; i<degree; i++){
 | 
			
		||||
 | 
			
		||||
          // Perform partial pivoting
 | 
			
		||||
          int k = i;
 | 
			
		||||
          for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
 | 
			
		||||
          if(k != i){
 | 
			
		||||
            xp = b[k];
 | 
			
		||||
            b[k] = b[i];
 | 
			
		||||
            b[i] = xp;
 | 
			
		||||
            for(int j=0; j<degree; j++){
 | 
			
		||||
              xp = G[k][j];
 | 
			
		||||
              G[k][j] = G[i][j];
 | 
			
		||||
              G[i][j] = xp;
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          // Convert matrix to upper triangular form
 | 
			
		||||
          for(int j=i+1; j<degree; j++){
 | 
			
		||||
            xp = G[j][i]/G[i][i];
 | 
			
		||||
            b[j] -= xp * b[i];
 | 
			
		||||
            for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // Use Gaussian elimination to solve equations and calculate initial guess
 | 
			
		||||
        chi = zero;
 | 
			
		||||
        r = phi;
 | 
			
		||||
        for(int i=degree-1; i>=0; i--){
 | 
			
		||||
          a[i] = 0.0;
 | 
			
		||||
          for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
 | 
			
		||||
          a[i] = (b[i]-a[i])/G[i][i];
 | 
			
		||||
          chi += a[i]*v[i];
 | 
			
		||||
          r -= a[i]*MdagMv[i];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        RealD true_r(0.0);
 | 
			
		||||
        ComplexD tmp;
 | 
			
		||||
        for(int i=0; i<degree; i++){
 | 
			
		||||
          tmp = -b[i];
 | 
			
		||||
          for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
 | 
			
		||||
          tmp = std::conj(tmp)*tmp;
 | 
			
		||||
          true_r += std::sqrt(tmp.real());
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        RealD error = std::sqrt(norm2(r)/norm2(phi));
 | 
			
		||||
        std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
 | 
			
		||||
 | 
			
		||||
        return chi;
 | 
			
		||||
      };
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
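In outline, the ChronoForecast defined above builds its guess as follows (a restatement of the code, not an independent derivation): the previous solutions are Gram-Schmidt orthonormalised into a basis {v_i}, the small dense system

    G_{ij}\, a_j = b_i, \qquad G_{ij} = \langle v_i,\, M^\dagger M\, v_j \rangle, \qquad b_i = \langle v_i,\, \phi \rangle,

is solved by Gaussian elimination with partial pivoting, and the forecasted solution of M^\dagger M \psi = \phi together with its residual are

    \chi = \sum_i a_i\, v_i, \qquad r = \phi - \sum_i a_i\, M^\dagger M\, v_i,

with \sqrt{\|r\|^2 / \|\phi\|^2} reported as "|res|/|src|".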
@@ -87,15 +87,22 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  sliceInnerProductMatrix(m_rr,R,R,Orthog);
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Cholesky from Eigen
 | 
			
		||||
  // There exists a ldlt that is documented as more stable
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
 | 
			
		||||
  // Force manifest hermitian to avoid rounding related
 | 
			
		||||
  m_rr = 0.5*(m_rr+m_rr.adjoint());
 | 
			
		||||
 | 
			
		||||
#if 0
 | 
			
		||||
  std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl;
 | 
			
		||||
  Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL(); 
 | 
			
		||||
  std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl;
 | 
			
		||||
  auto  D_ldlt = m_rr.ldlt().vectorD(); 
 | 
			
		||||
  std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
  //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl;
 | 
			
		||||
  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
 | 
			
		||||
  //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl;
 | 
			
		||||
  C    = L.adjoint();
 | 
			
		||||
  Cinv = C.inverse();
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Q = R C^{-1}
 | 
			
		||||
  //
 | 
			
		||||
@@ -103,7 +110,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
 | 
			
		||||
  //
 | 
			
		||||
  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // FIXME:: make a sliceMulMatrix to avoid zero vector
 | 
			
		||||
  sliceMulMatrix(Q,Cinv,R,Orthog);
 | 
			
		||||
}
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
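The ThinQRfact change above forces the Gram matrix to be exactly Hermitian before the Cholesky step; schematically

    m_{rr} \leftarrow \tfrac{1}{2}\big(m_{rr} + m_{rr}^\dagger\big), \qquad m_{rr} = R^\dagger R = L L^\dagger, \qquad C = L^\dagger, \qquad Q = R\, C^{-1},

so that Q^\dagger Q = C^{-\dagger}(R^\dagger R)\, C^{-1} = 1 up to rounding, while the ldlt variant (documented as more stable) is kept behind "#if 0" for reference.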
@@ -199,7 +205,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
 | 
			
		||||
 | 
			
		||||
  Linop.HermOp(X, AD);
 | 
			
		||||
  tmp = B - AD;  
 | 
			
		||||
  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
 | 
			
		||||
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
 | 
			
		||||
  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
 | 
			
		||||
  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
 | 
			
		||||
  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
 | 
			
		||||
  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
 | 
			
		||||
  D=Q;
 | 
			
		||||
 | 
			
		||||
  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
 | 
			
		||||
@@ -221,13 +232,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
 | 
			
		||||
    MatrixTimer.Start();
 | 
			
		||||
    Linop.HermOp(D, Z);      
 | 
			
		||||
    MatrixTimer.Stop();
 | 
			
		||||
    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
 | 
			
		||||
 | 
			
		||||
    //4. M  = [D^dag Z]^{-1}
 | 
			
		||||
    sliceInnerTimer.Start();
 | 
			
		||||
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
 | 
			
		||||
    sliceInnerTimer.Stop();
 | 
			
		||||
    m_M       = m_DZ.inverse();
 | 
			
		||||
 | 
			
		||||
    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    //5. X  = X + D MC
 | 
			
		||||
    m_tmp     = m_M * m_C;
 | 
			
		||||
    sliceMaddTimer.Start();
 | 
			
		||||
 
 | 
			
		||||
@@ -52,8 +52,8 @@ class ConjugateGradient : public OperatorFunction<Field> {
 | 
			
		||||
        MaxIterations(maxit),
 | 
			
		||||
        ErrorOnNoConverge(err_on_no_conv){};
 | 
			
		||||
 | 
			
		||||
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src,
 | 
			
		||||
                  Field &psi) {
 | 
			
		||||
  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
 | 
			
		||||
 | 
			
		||||
    psi.checkerboard = src.checkerboard;
 | 
			
		||||
    conformable(psi, src);
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
lib/algorithms/iterative/ConjugateGradientReliableUpdate.h (new file, 256 lines)
@@ -0,0 +1,256 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Christopher Kelly <ckelly@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
 | 
			
		||||
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
  template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> 
 | 
			
		||||
  class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
 | 
			
		||||
  public:
 | 
			
		||||
    bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
 | 
			
		||||
    // Defaults true.
 | 
			
		||||
    RealD Tolerance;
 | 
			
		||||
    Integer MaxIterations;
 | 
			
		||||
    Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
 | 
			
		||||
    Integer ReliableUpdatesPerformed;
 | 
			
		||||
 | 
			
		||||
    bool DoFinalCleanup; //Final DP cleanup, defaults to true
 | 
			
		||||
    Integer IterationsToCleanup; //Final DP cleanup step iterations
 | 
			
		||||
    
 | 
			
		||||
    LinearOperatorBase<FieldF> &Linop_f;
 | 
			
		||||
    LinearOperatorBase<FieldD> &Linop_d;
 | 
			
		||||
    GridBase* SinglePrecGrid;
 | 
			
		||||
    RealD Delta; //reliable update parameter
 | 
			
		||||
 | 
			
		||||
    //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
 | 
			
		||||
    LinearOperatorBase<FieldF> *Linop_fallback;
 | 
			
		||||
    RealD fallback_transition_tol;
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
 | 
			
		||||
      : Tolerance(tol),
 | 
			
		||||
        MaxIterations(maxit),
 | 
			
		||||
	Delta(_delta),
 | 
			
		||||
	Linop_f(_Linop_f),
 | 
			
		||||
	Linop_d(_Linop_d),
 | 
			
		||||
	SinglePrecGrid(_sp_grid),
 | 
			
		||||
        ErrorOnNoConverge(err_on_no_conv),
 | 
			
		||||
	DoFinalCleanup(true),
 | 
			
		||||
	Linop_fallback(NULL)
 | 
			
		||||
    {};
 | 
			
		||||
 | 
			
		||||
    void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
 | 
			
		||||
      Linop_fallback = &_Linop_fallback;
 | 
			
		||||
      fallback_transition_tol = _fallback_transition_tol;      
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    void operator()(const FieldD &src, FieldD &psi) {
 | 
			
		||||
      LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
 | 
			
		||||
      bool using_fallback = false;
 | 
			
		||||
      
 | 
			
		||||
      psi.checkerboard = src.checkerboard;
 | 
			
		||||
      conformable(psi, src);
 | 
			
		||||
 | 
			
		||||
      RealD cp, c, a, d, b, ssq, qq, b_pred;
 | 
			
		||||
 | 
			
		||||
      FieldD p(src);
 | 
			
		||||
      FieldD mmp(src);
 | 
			
		||||
      FieldD r(src);
 | 
			
		||||
 | 
			
		||||
      // Initial residual computation & set up
 | 
			
		||||
      RealD guess = norm2(psi);
 | 
			
		||||
      assert(std::isnan(guess) == 0);
 | 
			
		||||
    
 | 
			
		||||
      Linop_d.HermOpAndNorm(psi, mmp, d, b);
 | 
			
		||||
    
 | 
			
		||||
      r = src - mmp;
 | 
			
		||||
      p = r;
 | 
			
		||||
 | 
			
		||||
      a = norm2(p);
 | 
			
		||||
      cp = a;
 | 
			
		||||
      ssq = norm2(src);
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
 | 
			
		||||
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   src " << ssq << std::endl;
 | 
			
		||||
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:    mp " << d << std::endl;
 | 
			
		||||
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:   mmp " << b << std::endl;
 | 
			
		||||
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:  cp,r " << cp << std::endl;
 | 
			
		||||
      std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate:     p " << a << std::endl;
 | 
			
		||||
 | 
			
		||||
      RealD rsq = Tolerance * Tolerance * ssq;
 | 
			
		||||
 | 
			
		||||
      // Check if guess is really REALLY good :)
 | 
			
		||||
      if (cp <= rsq) {
 | 
			
		||||
	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
 | 
			
		||||
	std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
 | 
			
		||||
	return;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      //Single prec initialization
 | 
			
		||||
      FieldF r_f(SinglePrecGrid);
 | 
			
		||||
      r_f.checkerboard = r.checkerboard;
 | 
			
		||||
      precisionChange(r_f, r);
 | 
			
		||||
 | 
			
		||||
      FieldF psi_f(r_f);
 | 
			
		||||
      psi_f = zero;
 | 
			
		||||
 | 
			
		||||
      FieldF p_f(r_f);
 | 
			
		||||
      FieldF mmp_f(r_f);
 | 
			
		||||
 | 
			
		||||
      RealD MaxResidSinceLastRelUp = cp; //initial residual    
 | 
			
		||||
    
 | 
			
		||||
      std::cout << GridLogIterative << std::setprecision(4)
 | 
			
		||||
		<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
 | 
			
		||||
 | 
			
		||||
      GridStopWatch LinalgTimer;
 | 
			
		||||
      GridStopWatch MatrixTimer;
 | 
			
		||||
      GridStopWatch SolverTimer;
 | 
			
		||||
 | 
			
		||||
      SolverTimer.Start();
 | 
			
		||||
      int k = 0;
 | 
			
		||||
      int l = 0;
 | 
			
		||||
    
 | 
			
		||||
      for (k = 1; k <= MaxIterations; k++) {
 | 
			
		||||
	c = cp;
 | 
			
		||||
 | 
			
		||||
	MatrixTimer.Start();
 | 
			
		||||
	Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
 | 
			
		||||
	MatrixTimer.Stop();
 | 
			
		||||
 | 
			
		||||
	LinalgTimer.Start();
 | 
			
		||||
 | 
			
		||||
	a = c / d;
 | 
			
		||||
	b_pred = a * (a * qq - d) / c;
 | 
			
		||||
 | 
			
		||||
	cp = axpy_norm(r_f, -a, mmp_f, r_f);
 | 
			
		||||
	b = cp / c;
 | 
			
		||||
 | 
			
		||||
	// Fuse these loops ; should be really easy
 | 
			
		||||
	psi_f = a * p_f + psi_f;
 | 
			
		||||
	//p_f = p_f * b + r_f;
 | 
			
		||||
 | 
			
		||||
	LinalgTimer.Stop();
 | 
			
		||||
 | 
			
		||||
	std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
 | 
			
		||||
		  << " residual " << cp << " target " << rsq << std::endl;
 | 
			
		||||
	std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << "  b = "<< b << std::endl;
 | 
			
		||||
	std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << "  c = "<< c << std::endl;
 | 
			
		||||
 | 
			
		||||
	if(cp > MaxResidSinceLastRelUp){
 | 
			
		||||
	  std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
 | 
			
		||||
	  MaxResidSinceLastRelUp = cp;
 | 
			
		||||
	}
 | 
			
		||||
	  
 | 
			
		||||
	// Stopping condition
 | 
			
		||||
	if (cp <= rsq) {
 | 
			
		||||
	  //Although not written in the paper, I assume that I have to add on the final solution
 | 
			
		||||
	  precisionChange(mmp, psi_f);
 | 
			
		||||
	  psi = psi + mmp;
 | 
			
		||||
	
 | 
			
		||||
	
 | 
			
		||||
	  SolverTimer.Stop();
 | 
			
		||||
	  Linop_d.HermOpAndNorm(psi, mmp, d, qq);
 | 
			
		||||
	  p = mmp - src;
 | 
			
		||||
 | 
			
		||||
	  RealD srcnorm = sqrt(norm2(src));
	  RealD resnorm = sqrt(norm2(p));
	  RealD true_residual = resnorm / srcnorm;

	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
	  std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
	  std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
	  std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;

	  std::cout << GridLogMessage << "Time breakdown "<<std::endl;
	  std::cout << GridLogMessage << "\tElapsed    " << SolverTimer.Elapsed() <<std::endl;
	  std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
	  std::cout << GridLogMessage << "\tLinalg     " << LinalgTimer.Elapsed() <<std::endl;

	  IterationsToComplete = k;
	  ReliableUpdatesPerformed = l;

	  if(DoFinalCleanup){
	    //Do a final CG to cleanup
	    std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
	    ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
	    CG.ErrorOnNoConverge = ErrorOnNoConverge;
	    CG(Linop_d,src,psi);
	    IterationsToCleanup = CG.IterationsToComplete;
	  }
	  else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
	  return;
	}
	else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
		    << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
	  precisionChange(mmp, psi_f);
	  psi = psi + mmp;

	  Linop_d.HermOpAndNorm(psi, mmp, d, qq);
	  r = src - mmp;

	  psi_f = zero;
	  precisionChange(r_f, r);
	  cp = norm2(r);
	  MaxResidSinceLastRelUp = cp;

	  b = cp/c;

	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;

	  l = l+1;
	}

	p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence

	if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
	  std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
	  Linop_f_use = Linop_fallback;
	  using_fallback = true;
	}

      }
      std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
		<< std::endl;

      if (ErrorOnNoConverge) assert(0);
      IterationsToComplete = k;
      ReliableUpdatesPerformed = l;
    }
  };

};

#endif
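The loop above interleaves a single-precision CG iteration with occasional double-precision "reliable updates". As a standalone illustration of that control flow only (plain C++ on a diagonal test matrix, everything in double so just the bookkeeping is shown; none of these names come from Grid):

#include <cstdio>
#include <cmath>
#include <vector>

int main() {
  const int N = 64;
  std::vector<double> A(N), b(N, 1.0), x(N, 0.0);      // A: SPD diagonal test matrix
  for (int i = 0; i < N; i++) A[i] = 1.0 + i;

  std::vector<double> r(b), p(b), mmp(N);
  double cp = 0; for (int i = 0; i < N; i++) cp += r[i] * r[i];
  const double ssq = cp, Tolerance = 1e-10, Delta = 0.1;
  double MaxResidSinceLastRelUp = cp;
  int l = 0;                                           // reliable updates performed

  for (int k = 1; k <= 10000; k++) {
    double c = cp, d = 0;
    for (int i = 0; i < N; i++) { mmp[i] = A[i] * p[i]; d += p[i] * mmp[i]; }
    double a = c / d;                                  // standard CG step length
    for (int i = 0; i < N; i++) { x[i] += a * p[i]; r[i] -= a * mmp[i]; }
    cp = 0; for (int i = 0; i < N; i++) cp += r[i] * r[i];
    if (cp > MaxResidSinceLastRelUp) MaxResidSinceLastRelUp = cp;

    if (cp <= Tolerance * Tolerance * ssq) {           // converged on the iterated residual
      printf("converged: iter %d, %d reliable updates, residual %g\n", k, l, sqrt(cp / ssq));
      return 0;
    }
    if (cp < Delta * MaxResidSinceLastRelUp) {         // reliable update: recompute true residual
      for (int i = 0; i < N; i++) r[i] = b[i] - A[i] * x[i];
      cp = 0; for (int i = 0; i < N; i++) cp += r[i] * r[i];
      MaxResidSinceLastRelUp = cp;
      l++;
    }
    double beta = cp / c;
    for (int i = 0; i < N; i++) p[i] = r[i] + beta * p[i];
  }
  printf("did NOT converge\n");
  return 1;
}

In the mixed-precision solver the point of the update is that the iterated single-precision residual drifts away from the true residual; recomputing it in double and restarting the accumulation keeps the final answer at full precision.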
@@ -1,7 +1,5 @@

#include <Grid/GridCore.h>
#include <fcntl.h>

namespace Grid {

@@ -11,7 +9,7 @@ int PointerCache::victim;

void *PointerCache::Insert(void *ptr,size_t bytes) {

  if (bytes < 4096 ) return NULL;
  if (bytes < 4096 ) return ptr;

#ifdef GRID_OMP
  assert(omp_in_parallel()==0);
@@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
  return NULL;
}

void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);
  const int page_size = 4096;
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
  uint64_t pagedata[npages];
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
  n4ktotal = 0;
  nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      if (pageaddr != baseaddr + j * page_size)
        ++nnothuge;
    }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}

}
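check_huge_pages() above decides hugeness by reading /proc/self/pagemap: one 8-byte entry per 4 KiB virtual page, with the low 55 bits holding the physical frame number, so a 2 MiB huge page shows up as 512 consecutive frames. A hedged usage sketch, assuming the usual Grid/Grid.h entry points and the declaration of the helper in namespace Grid shown in the header hunk that follows:

#include <Grid/Grid.h>
#include <cstdlib>
#include <cstring>

int main(int argc, char **argv) {
  Grid::Grid_init(&argc, &argv);                 // sets up comms; RankWorld() is used in the report

  const uint64_t bytes = 64ULL * 1024 * 1024;    // 64 MiB test buffer
  void *buf = nullptr;
  posix_memalign(&buf, 2 * 1024 * 1024, bytes);  // 2 MiB alignment, cf. GRID_ALLOC_ALIGN below
  memset(buf, 1, bytes);                         // fault every page in before inspecting pagemap

  Grid::check_huge_pages(buf, bytes);            // prints how many 4k pages are not huge-backed

  free(buf);
  Grid::Grid_finalize();
  return 0;
}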

@@ -64,6 +64,8 @@ namespace Grid {

  };

  void check_huge_pages(void *Buf,uint64_t BYTES);

////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
@@ -92,18 +94,34 @@ public:
    size_type bytes = __n*sizeof(_Tp);

    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);

#ifdef HAVE_MM_MALLOC_H
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
#else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
#endif
    //    if ( ptr != NULL )
    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;

    //////////////////
    // Hack 2MB align; could make option probably doesn't need configurability
    //////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
    // First touch optimise in threaded loop
    uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
    for(size_type n=0;n<bytes;n+=4096){
      cp[n]=0;
    }
    return ptr;
  }

  void deallocate(pointer __p, size_type __n) {
    size_type bytes = __n * sizeof(_Tp);

    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);

#ifdef HAVE_MM_MALLOC_H
@@ -182,10 +200,19 @@ public:
  pointer allocate(size_type __n, const void* _p= 0)
  {
#ifdef HAVE_MM_MALLOC_H
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
#else
    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
#endif
    size_type bytes = __n*sizeof(_Tp);
    uint8_t *cp = (uint8_t *)ptr;
    if ( ptr ) {
    // One touch per 4k page, static OMP loop to catch same loop order
#pragma omp parallel for schedule(static)
      for(size_type n=0;n<bytes;n+=4096){
        cp[n]=0;
      }
    }
    return ptr;
  }
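The allocate() methods above combine two ideas: 2 MiB alignment (GRID_ALLOC_ALIGN) so the kernel can back the block with huge pages, and a "first touch" of one byte per 4 KiB page from a static OpenMP loop so each page is committed on the NUMA node of the thread that will later sweep it in the same loop order. A self-contained sketch of just those two ideas, not the Grid allocator itself:

#include <cstdlib>
#include <cstdint>
#include <cstddef>

void *alloc_first_touch(std::size_t bytes) {
  const std::size_t kAlign = 2 * 1024 * 1024;        // 2 MiB, cf. GRID_ALLOC_ALIGN
  void *ptr = nullptr;
  if (posix_memalign(&ptr, kAlign, bytes) != 0) return nullptr;
  uint8_t *cp = static_cast<uint8_t *>(ptr);
#pragma omp parallel for schedule(static)
  for (long n = 0; n < (long)bytes; n += 4096)       // one write per 4 KiB page
    cp[n] = 0;
  return ptr;
}

int main() {
  void *p = alloc_first_touch(1ULL << 26);           // 64 MiB
  std::free(p);
  return p ? 0 : 1;
}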
  void deallocate(pointer __p, size_type) {

@@ -49,6 +49,8 @@ public:
    template<class object> friend class Lattice;

    GridBase(const std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
    GridBase(const std::vector<int> & processor_grid,
	     const CartesianCommunicator &parent) : CartesianCommunicator(processor_grid,parent) {};

    // Physics Grid information.
    std::vector<int> _simd_layout;// Which dimensions get relayed out over simd lanes.
@@ -185,17 +187,18 @@ public:
    ////////////////////////////////////////////////////////////////

    void show_decomposition(){
      std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl;
      std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl;
      std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl;
      std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
      std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl;
      std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl;
      std::cout << GridLogMessage << "iSites             : " << _isites << std::endl;
      std::cout << GridLogMessage << "oSites             : " << _osites << std::endl;
      std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;
      std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl;
      std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;
      std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
      std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
      std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
      std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
      std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
      std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
      std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
      std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
      std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
      std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;
      std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;
      std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;
    }

    ////////////////////////////////////////////////////////////////
@@ -209,9 +212,6 @@ public:
      assert(lidx<lSites());
      Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
    }

    void GlobalCoorToGlobalIndex(const std::vector<int> & gcoor,int & gidx){
      gidx=0;
      int mult=1;
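The hunk stops as GlobalCoorToGlobalIndex() begins; the mapping it accumulates is the standard lexicographic one, sketched here generically (dimension 0 runs fastest). This is the textbook formula under that assumption, not a verbatim copy of the Grid body:

#include <vector>
#include <cassert>

int GlobalCoorToGlobalIndexSketch(const std::vector<int> &gcoor,
                                  const std::vector<int> &gdimensions) {
  assert(gcoor.size() == gdimensions.size());
  int gidx = 0;
  int mult = 1;
  for (std::size_t d = 0; d < gdimensions.size(); d++) {
    gidx += mult * gcoor[d];      // add this dimension's contribution
    mult *= gdimensions[d];       // stride for the next dimension
  }
  return gidx;  // e.g. coor {1,2,0,3} on an 8^4 grid -> 1 + 2*8 + 0*64 + 3*512 = 1553
}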
@@ -61,78 +61,102 @@ public:
 | 
			
		||||
    virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
 | 
			
		||||
      return shift;
 | 
			
		||||
    }
 | 
			
		||||
    /////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Constructor takes a parent grid and possibly subdivides communicator.
 | 
			
		||||
    /////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    GridCartesian(const std::vector<int> &dimensions,
 | 
			
		||||
		  const std::vector<int> &simd_layout,
 | 
			
		||||
		  const std::vector<int> &processor_grid
 | 
			
		||||
		  ) : GridBase(processor_grid)
 | 
			
		||||
		  const std::vector<int> &processor_grid,
 | 
			
		||||
		  const GridCartesian &parent) : GridBase(processor_grid,parent)
 | 
			
		||||
    {
 | 
			
		||||
        ///////////////////////
 | 
			
		||||
        // Grid information
 | 
			
		||||
        ///////////////////////
 | 
			
		||||
        _ndimension = dimensions.size();
 | 
			
		||||
            
 | 
			
		||||
        _fdimensions.resize(_ndimension);
 | 
			
		||||
        _gdimensions.resize(_ndimension);
 | 
			
		||||
        _ldimensions.resize(_ndimension);
 | 
			
		||||
        _rdimensions.resize(_ndimension);
 | 
			
		||||
        _simd_layout.resize(_ndimension);
 | 
			
		||||
	_lstart.resize(_ndimension);
 | 
			
		||||
	_lend.resize(_ndimension);
 | 
			
		||||
            
 | 
			
		||||
        _ostride.resize(_ndimension);
 | 
			
		||||
        _istride.resize(_ndimension);
 | 
			
		||||
            
 | 
			
		||||
        _fsites = _gsites = _osites = _isites = 1;
 | 
			
		||||
      Init(dimensions,simd_layout,processor_grid);
 | 
			
		||||
    }
 | 
			
		||||
    /////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Construct from comm world
 | 
			
		||||
    /////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    GridCartesian(const std::vector<int> &dimensions,
 | 
			
		||||
		  const std::vector<int> &simd_layout,
 | 
			
		||||
		  const std::vector<int> &processor_grid) : GridBase(processor_grid)
 | 
			
		||||
    {
 | 
			
		||||
      Init(dimensions,simd_layout,processor_grid);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
        for(int d=0;d<_ndimension;d++){
 | 
			
		||||
	  _fdimensions[d] = dimensions[d]; // Global dimensions
 | 
			
		||||
	  _gdimensions[d] = _fdimensions[d]; // Global dimensions
 | 
			
		||||
	  _simd_layout[d] = simd_layout[d];
 | 
			
		||||
	  _fsites = _fsites * _fdimensions[d];
 | 
			
		||||
	  _gsites = _gsites * _gdimensions[d];
 | 
			
		||||
    void Init(const std::vector<int> &dimensions,
 | 
			
		||||
	      const std::vector<int> &simd_layout,
 | 
			
		||||
	      const std::vector<int> &processor_grid)
 | 
			
		||||
    {
 | 
			
		||||
      ///////////////////////
 | 
			
		||||
      // Grid information
 | 
			
		||||
      ///////////////////////
 | 
			
		||||
      _ndimension = dimensions.size();
 | 
			
		||||
 | 
			
		||||
	  //FIXME check for exact division
 | 
			
		||||
      _fdimensions.resize(_ndimension);
 | 
			
		||||
      _gdimensions.resize(_ndimension);
 | 
			
		||||
      _ldimensions.resize(_ndimension);
 | 
			
		||||
      _rdimensions.resize(_ndimension);
 | 
			
		||||
      _simd_layout.resize(_ndimension);
 | 
			
		||||
      _lstart.resize(_ndimension);
 | 
			
		||||
      _lend.resize(_ndimension);
 | 
			
		||||
 | 
			
		||||
	  // Use a reduced simd grid
 | 
			
		||||
	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
 | 
			
		||||
	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
 | 
			
		||||
	  _lstart[d]     = _processor_coor[d]*_ldimensions[d];
 | 
			
		||||
	  _lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
 | 
			
		||||
	  _osites  *= _rdimensions[d];
 | 
			
		||||
	  _isites  *= _simd_layout[d];
 | 
			
		||||
                
 | 
			
		||||
	  // Addressing support
 | 
			
		||||
	  if ( d==0 ) {
 | 
			
		||||
	    _ostride[d] = 1;
 | 
			
		||||
	    _istride[d] = 1;
 | 
			
		||||
	  } else {
 | 
			
		||||
	    _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
 | 
			
		||||
	    _istride[d] = _istride[d-1]*_simd_layout[d-1];
 | 
			
		||||
	  }
 | 
			
		||||
      _ostride.resize(_ndimension);
 | 
			
		||||
      _istride.resize(_ndimension);
 | 
			
		||||
 | 
			
		||||
      _fsites = _gsites = _osites = _isites = 1;
 | 
			
		||||
 | 
			
		||||
      for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
      {
 | 
			
		||||
        _fdimensions[d] = dimensions[d];   // Global dimensions
 | 
			
		||||
        _gdimensions[d] = _fdimensions[d]; // Global dimensions
 | 
			
		||||
        _simd_layout[d] = simd_layout[d];
 | 
			
		||||
        _fsites = _fsites * _fdimensions[d];
 | 
			
		||||
        _gsites = _gsites * _gdimensions[d];
 | 
			
		||||
 | 
			
		||||
        // Use a reduced simd grid
 | 
			
		||||
        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
 | 
			
		||||
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
 | 
			
		||||
 | 
			
		||||
        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
 | 
			
		||||
        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
 | 
			
		||||
 | 
			
		||||
        _lstart[d] = _processor_coor[d] * _ldimensions[d];
 | 
			
		||||
        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
 | 
			
		||||
        _osites *= _rdimensions[d];
 | 
			
		||||
        _isites *= _simd_layout[d];
 | 
			
		||||
 | 
			
		||||
        // Addressing support
 | 
			
		||||
        if (d == 0)
 | 
			
		||||
        {
 | 
			
		||||
          _ostride[d] = 1;
 | 
			
		||||
          _istride[d] = 1;
 | 
			
		||||
        }
 | 
			
		||||
        
 | 
			
		||||
        ///////////////////////
 | 
			
		||||
        // subplane information
 | 
			
		||||
        ///////////////////////
 | 
			
		||||
        _slice_block.resize(_ndimension);
 | 
			
		||||
        _slice_stride.resize(_ndimension);
 | 
			
		||||
        _slice_nblock.resize(_ndimension);
 | 
			
		||||
            
 | 
			
		||||
        int block =1;
 | 
			
		||||
        int nblock=1;
 | 
			
		||||
        for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
 | 
			
		||||
            
 | 
			
		||||
        for(int d=0;d<_ndimension;d++){
 | 
			
		||||
            nblock/=_rdimensions[d];
 | 
			
		||||
            _slice_block[d] =block;
 | 
			
		||||
            _slice_stride[d]=_ostride[d]*_rdimensions[d];
 | 
			
		||||
            _slice_nblock[d]=nblock;
 | 
			
		||||
            block = block*_rdimensions[d];
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
 | 
			
		||||
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      ///////////////////////
 | 
			
		||||
      // subplane information
 | 
			
		||||
      ///////////////////////
 | 
			
		||||
      _slice_block.resize(_ndimension);
 | 
			
		||||
      _slice_stride.resize(_ndimension);
 | 
			
		||||
      _slice_nblock.resize(_ndimension);
 | 
			
		||||
 | 
			
		||||
      int block = 1;
 | 
			
		||||
      int nblock = 1;
 | 
			
		||||
      for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
        nblock *= _rdimensions[d];
 | 
			
		||||
 | 
			
		||||
      for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
      {
 | 
			
		||||
        nblock /= _rdimensions[d];
 | 
			
		||||
        _slice_block[d] = block;
 | 
			
		||||
        _slice_stride[d] = _ostride[d] * _rdimensions[d];
 | 
			
		||||
        _slice_nblock[d] = nblock;
 | 
			
		||||
        block = block * _rdimensions[d];
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
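A worked example of the Init() arithmetic above, for a hypothetical 8^4 lattice split over a 1x1x2x2 processor grid with a 1x1x2x2 SIMD layout: local = global/processors, reduced = local/simd, and the outer/inner strides are running products of the reduced and SIMD extents.

#include <vector>
#include <cassert>

int main() {
  std::vector<int> gdim = {8, 8, 8, 8}, proc = {1, 1, 2, 2}, simd = {1, 1, 2, 2};
  int nd = (int)gdim.size();
  std::vector<int> ldim(nd), rdim(nd), ostride(nd), istride(nd);
  int osites = 1, isites = 1;
  for (int d = 0; d < nd; d++) {
    ldim[d] = gdim[d] / proc[d];           // 8 8 4 4  (local)
    rdim[d] = ldim[d] / simd[d];           // 8 8 2 2  (reduced, "overdecomposed")
    osites *= rdim[d];
    isites *= simd[d];
    ostride[d] = (d == 0) ? 1 : ostride[d - 1] * rdim[d - 1];
    istride[d] = (d == 0) ? 1 : istride[d - 1] * simd[d - 1];
  }
  assert(osites == 256 && isites == 4);    // 256 outer sites, 4 SIMD lanes per site
  assert(ostride[3] == 128 && istride[3] == 2);
  return 0;
}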
 
 | 
			
		||||
@@ -112,40 +112,73 @@ public:
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    GridRedBlackCartesian(const GridBase *base) : GridRedBlackCartesian(base->_fdimensions,base->_simd_layout,base->_processors)  {};
 | 
			
		||||
    ////////////////////////////////////////////////////////////
 | 
			
		||||
    // Create Redblack from original grid; require full grid pointer ?
 | 
			
		||||
    ////////////////////////////////////////////////////////////
 | 
			
		||||
    GridRedBlackCartesian(const GridBase *base) : GridBase(base->_processors,*base)
 | 
			
		||||
    {
 | 
			
		||||
      int dims = base->_ndimension;
 | 
			
		||||
      std::vector<int> checker_dim_mask(dims,1);
 | 
			
		||||
      int checker_dim = 0;
 | 
			
		||||
      Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    GridRedBlackCartesian(const std::vector<int> &dimensions,
 | 
			
		||||
    ////////////////////////////////////////////////////////////
 | 
			
		||||
    // Create redblack from original grid, with non-trivial checker dim mask
 | 
			
		||||
    ////////////////////////////////////////////////////////////
 | 
			
		||||
    GridRedBlackCartesian(const GridBase *base,
 | 
			
		||||
			  const std::vector<int> &checker_dim_mask,
 | 
			
		||||
			  int checker_dim
 | 
			
		||||
			  ) :  GridBase(base->_processors,*base) 
 | 
			
		||||
    {
 | 
			
		||||
      Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim)  ;
 | 
			
		||||
    }
 | 
			
		||||
#if 0
 | 
			
		||||
    ////////////////////////////////////////////////////////////
 | 
			
		||||
    // Create redblack grid ;; deprecate these. Should not
 | 
			
		||||
    // need direct creation of redblack without a full grid to base on
 | 
			
		||||
    ////////////////////////////////////////////////////////////
 | 
			
		||||
    GridRedBlackCartesian(const GridBase *base,
 | 
			
		||||
			  const std::vector<int> &dimensions,
 | 
			
		||||
			  const std::vector<int> &simd_layout,
 | 
			
		||||
			  const std::vector<int> &processor_grid,
 | 
			
		||||
			  const std::vector<int> &checker_dim_mask,
 | 
			
		||||
			  int checker_dim
 | 
			
		||||
			  ) :  GridBase(processor_grid) 
 | 
			
		||||
			  ) :  GridBase(processor_grid,*base) 
 | 
			
		||||
    {
 | 
			
		||||
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
 | 
			
		||||
    }
 | 
			
		||||
    GridRedBlackCartesian(const std::vector<int> &dimensions,
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////
 | 
			
		||||
    // Create redblack grid
 | 
			
		||||
    ////////////////////////////////////////////////////////////
 | 
			
		||||
    GridRedBlackCartesian(const GridBase *base,
 | 
			
		||||
			  const std::vector<int> &dimensions,
 | 
			
		||||
			  const std::vector<int> &simd_layout,
 | 
			
		||||
			  const std::vector<int> &processor_grid) : GridBase(processor_grid) 
 | 
			
		||||
			  const std::vector<int> &processor_grid) : GridBase(processor_grid,*base) 
 | 
			
		||||
    {
 | 
			
		||||
      std::vector<int> checker_dim_mask(dimensions.size(),1);
 | 
			
		||||
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
 | 
			
		||||
      int checker_dim = 0;
 | 
			
		||||
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,checker_dim);
 | 
			
		||||
    }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    void Init(const std::vector<int> &dimensions,
 | 
			
		||||
	      const std::vector<int> &simd_layout,
 | 
			
		||||
	      const std::vector<int> &processor_grid,
 | 
			
		||||
	      const std::vector<int> &checker_dim_mask,
 | 
			
		||||
	      int checker_dim)
 | 
			
		||||
              const std::vector<int> &simd_layout,
 | 
			
		||||
              const std::vector<int> &processor_grid,
 | 
			
		||||
              const std::vector<int> &checker_dim_mask,
 | 
			
		||||
              int checker_dim)
 | 
			
		||||
    {
 | 
			
		||||
    ///////////////////////
 | 
			
		||||
    // Grid information
 | 
			
		||||
    ///////////////////////
 | 
			
		||||
      ///////////////////////
 | 
			
		||||
      // Grid information
 | 
			
		||||
      ///////////////////////
 | 
			
		||||
      _checker_dim = checker_dim;
 | 
			
		||||
      assert(checker_dim_mask[checker_dim]==1);
 | 
			
		||||
      assert(checker_dim_mask[checker_dim] == 1);
 | 
			
		||||
      _ndimension = dimensions.size();
 | 
			
		||||
      assert(checker_dim_mask.size()==_ndimension);
 | 
			
		||||
      assert(processor_grid.size()==_ndimension);
 | 
			
		||||
      assert(simd_layout.size()==_ndimension);
 | 
			
		||||
      
 | 
			
		||||
      assert(checker_dim_mask.size() == _ndimension);
 | 
			
		||||
      assert(processor_grid.size() == _ndimension);
 | 
			
		||||
      assert(simd_layout.size() == _ndimension);
 | 
			
		||||
 | 
			
		||||
      _fdimensions.resize(_ndimension);
 | 
			
		||||
      _gdimensions.resize(_ndimension);
 | 
			
		||||
      _ldimensions.resize(_ndimension);
 | 
			
		||||
@@ -153,114 +186,133 @@ public:
 | 
			
		||||
      _simd_layout.resize(_ndimension);
 | 
			
		||||
      _lstart.resize(_ndimension);
 | 
			
		||||
      _lend.resize(_ndimension);
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
      _ostride.resize(_ndimension);
 | 
			
		||||
      _istride.resize(_ndimension);
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
      _fsites = _gsites = _osites = _isites = 1;
 | 
			
		||||
	
 | 
			
		||||
      _checker_dim_mask=checker_dim_mask;
 | 
			
		||||
 | 
			
		||||
      for(int d=0;d<_ndimension;d++){
 | 
			
		||||
	_fdimensions[d] = dimensions[d];
 | 
			
		||||
	_gdimensions[d] = _fdimensions[d];
 | 
			
		||||
	_fsites = _fsites * _fdimensions[d];
 | 
			
		||||
	_gsites = _gsites * _gdimensions[d];
 | 
			
		||||
        
 | 
			
		||||
	if (d==_checker_dim) {
 | 
			
		||||
	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
 | 
			
		||||
	}
 | 
			
		||||
	_ldimensions[d] = _gdimensions[d]/_processors[d];
 | 
			
		||||
	_lstart[d]     = _processor_coor[d]*_ldimensions[d];
 | 
			
		||||
	_lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
 | 
			
		||||
      _checker_dim_mask = checker_dim_mask;
 | 
			
		||||
 | 
			
		||||
	// Use a reduced simd grid
 | 
			
		||||
	_simd_layout[d] = simd_layout[d];
 | 
			
		||||
	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
 | 
			
		||||
	assert(_rdimensions[d]>0);
 | 
			
		||||
      for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
      {
 | 
			
		||||
        _fdimensions[d] = dimensions[d];
 | 
			
		||||
        _gdimensions[d] = _fdimensions[d];
 | 
			
		||||
        _fsites = _fsites * _fdimensions[d];
 | 
			
		||||
        _gsites = _gsites * _gdimensions[d];
 | 
			
		||||
 | 
			
		||||
	// all elements of a simd vector must have same checkerboard.
 | 
			
		||||
	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
 | 
			
		||||
	if ( _simd_layout[d]>1 ) {
 | 
			
		||||
	  if ( checker_dim_mask[d] ) { 
 | 
			
		||||
	    assert( (_rdimensions[d]&0x1) == 0 );
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
        if (d == _checker_dim)
 | 
			
		||||
        {
 | 
			
		||||
          assert((_gdimensions[d] & 0x1) == 0);
 | 
			
		||||
          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
 | 
			
		||||
        }
 | 
			
		||||
        _ldimensions[d] = _gdimensions[d] / _processors[d];
 | 
			
		||||
        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
 | 
			
		||||
        _lstart[d] = _processor_coor[d] * _ldimensions[d];
 | 
			
		||||
        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
 | 
			
		||||
 | 
			
		||||
	_osites *= _rdimensions[d];
 | 
			
		||||
	_isites *= _simd_layout[d];
 | 
			
		||||
        
 | 
			
		||||
	// Addressing support
 | 
			
		||||
	if ( d==0 ) {
 | 
			
		||||
	  _ostride[d] = 1;
 | 
			
		||||
	  _istride[d] = 1;
 | 
			
		||||
	} else {
 | 
			
		||||
	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
 | 
			
		||||
	  _istride[d] = _istride[d-1]*_simd_layout[d-1];
 | 
			
		||||
	}
 | 
			
		||||
        // Use a reduced simd grid
 | 
			
		||||
        _simd_layout[d] = simd_layout[d];
 | 
			
		||||
        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // exact division is enforced by the assert below
 | 
			
		||||
        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
 | 
			
		||||
        assert(_rdimensions[d] > 0);
 | 
			
		||||
 | 
			
		||||
        // all elements of a simd vector must have same checkerboard.
 | 
			
		||||
        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
 | 
			
		||||
        if (_simd_layout[d] > 1)
 | 
			
		||||
        {
 | 
			
		||||
          if (checker_dim_mask[d])
 | 
			
		||||
          {
 | 
			
		||||
            assert((_rdimensions[d] & 0x1) == 0);
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        _osites *= _rdimensions[d];
 | 
			
		||||
        _isites *= _simd_layout[d];
 | 
			
		||||
 | 
			
		||||
        // Addressing support
 | 
			
		||||
        if (d == 0)
 | 
			
		||||
        {
 | 
			
		||||
          _ostride[d] = 1;
 | 
			
		||||
          _istride[d] = 1;
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
 | 
			
		||||
          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
            
 | 
			
		||||
 | 
			
		||||
      ////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
      // subplane information
 | 
			
		||||
      ////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
      _slice_block.resize(_ndimension);
 | 
			
		||||
      _slice_stride.resize(_ndimension);
 | 
			
		||||
      _slice_nblock.resize(_ndimension);
 | 
			
		||||
        
 | 
			
		||||
      int block =1;
 | 
			
		||||
      int nblock=1;
 | 
			
		||||
      for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
 | 
			
		||||
      
 | 
			
		||||
      for(int d=0;d<_ndimension;d++){
 | 
			
		||||
	nblock/=_rdimensions[d];
 | 
			
		||||
	_slice_block[d] =block;
 | 
			
		||||
	_slice_stride[d]=_ostride[d]*_rdimensions[d];
 | 
			
		||||
	_slice_nblock[d]=nblock;
 | 
			
		||||
	block = block*_rdimensions[d];
 | 
			
		||||
 | 
			
		||||
      int block = 1;
 | 
			
		||||
      int nblock = 1;
 | 
			
		||||
      for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
        nblock *= _rdimensions[d];
 | 
			
		||||
 | 
			
		||||
      for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
      {
 | 
			
		||||
        nblock /= _rdimensions[d];
 | 
			
		||||
        _slice_block[d] = block;
 | 
			
		||||
        _slice_stride[d] = _ostride[d] * _rdimensions[d];
 | 
			
		||||
        _slice_nblock[d] = nblock;
 | 
			
		||||
        block = block * _rdimensions[d];
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      ////////////////////////////////////////////////
 | 
			
		||||
      // Create a checkerboard lookup table
 | 
			
		||||
      ////////////////////////////////////////////////
 | 
			
		||||
      int rvol = 1;
 | 
			
		||||
      for(int d=0;d<_ndimension;d++){
 | 
			
		||||
	rvol=rvol * _rdimensions[d];
 | 
			
		||||
      for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
      {
 | 
			
		||||
        rvol = rvol * _rdimensions[d];
 | 
			
		||||
      }
 | 
			
		||||
      _checker_board.resize(rvol);
 | 
			
		||||
      for(int osite=0;osite<_osites;osite++){
 | 
			
		||||
	_checker_board[osite] = CheckerBoardFromOindex (osite);
 | 
			
		||||
      for (int osite = 0; osite < _osites; osite++)
 | 
			
		||||
      {
 | 
			
		||||
        _checker_board[osite] = CheckerBoardFromOindex(osite);
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
    };
 | 
			
		||||
protected:
 | 
			
		||||
 | 
			
		||||
  protected:
 | 
			
		||||
    virtual int oIndex(std::vector<int> &coor)
 | 
			
		||||
    {
 | 
			
		||||
      int idx=0;
 | 
			
		||||
      for(int d=0;d<_ndimension;d++) {
 | 
			
		||||
	if( d==_checker_dim ) {
 | 
			
		||||
	  idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
 | 
			
		||||
	} else {
 | 
			
		||||
	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
 | 
			
		||||
	}
 | 
			
		||||
      int idx = 0;
 | 
			
		||||
      for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
      {
 | 
			
		||||
        if (d == _checker_dim)
 | 
			
		||||
        {
 | 
			
		||||
          idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          idx += _ostride[d] * (coor[d] % _rdimensions[d]);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      return idx;
 | 
			
		||||
    };
 | 
			
		||||
        
 | 
			
		||||
 | 
			
		||||
    virtual int iIndex(std::vector<int> &lcoor)
 | 
			
		||||
    {
 | 
			
		||||
        int idx=0;
 | 
			
		||||
        for(int d=0;d<_ndimension;d++) {
 | 
			
		||||
	  if( d==_checker_dim ) {
 | 
			
		||||
	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
 | 
			
		||||
	  } else { 
 | 
			
		||||
	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
 | 
			
		||||
	  }
 | 
			
		||||
	}
 | 
			
		||||
        return idx;
 | 
			
		||||
      int idx = 0;
 | 
			
		||||
      for (int d = 0; d < _ndimension; d++)
 | 
			
		||||
      {
 | 
			
		||||
        if (d == _checker_dim)
 | 
			
		||||
        {
 | 
			
		||||
          idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
      return idx;
 | 
			
		||||
    }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
#endif
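A minimal sketch of how the new GridRedBlackCartesian(const GridBase *) constructor is typically used: build the full grid, then derive the red-black grid from it, with the checker dimension's global extent halved internally. The GridDefault* helpers and vComplex are the usual Grid conveniences, assumed here rather than shown in this diff:

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  std::vector<int> latt = GridDefaultLatt();
  std::vector<int> simd = GridDefaultSimd(Nd, vComplex::Nsimd());
  std::vector<int> mpi  = GridDefaultMpi();

  GridCartesian         FineGrid(latt, simd, mpi);
  GridRedBlackCartesian RBGrid(&FineGrid);   // checker dim 0: _gdimensions[0] halved internally

  std::cout << GridLogMessage << "full oSites " << FineGrid.oSites()
            << "  rb oSites " << RBGrid.oSites() << std::endl;

  Grid_finalize();
  return 0;
}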
 
 | 
			
		||||
@@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/GridCore.h>
 | 
			
		||||
#include <fcntl.h>
 | 
			
		||||
#include <unistd.h>
 | 
			
		||||
#include <limits.h>
 | 
			
		||||
#include <sys/mman.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
@@ -33,8 +37,11 @@ namespace Grid {
 | 
			
		||||
// Info that is setup once and indept of cartesian layout
 | 
			
		||||
///////////////////////////////////////////////////////////////
 | 
			
		||||
void *              CartesianCommunicator::ShmCommBuf;
 | 
			
		||||
uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
 | 
			
		||||
CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 | 
			
		||||
uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; 
 | 
			
		||||
CartesianCommunicator::CommunicatorPolicy_t  
 | 
			
		||||
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 | 
			
		||||
int CartesianCommunicator::nCommThreads = -1;
 | 
			
		||||
int CartesianCommunicator::Hugepages = 0;
 | 
			
		||||
 | 
			
		||||
/////////////////////////////////
 | 
			
		||||
// Alloc, free shmem region
 | 
			
		||||
@@ -60,7 +67,7 @@ void CartesianCommunicator::ShmBufferFreeAll(void) {
 | 
			
		||||
/////////////////////////////////
 | 
			
		||||
// Grid information queries
 | 
			
		||||
/////////////////////////////////
 | 
			
		||||
int                      CartesianCommunicator::Dimensions(void)         { return _ndimension; };
 | 
			
		||||
int                      CartesianCommunicator::Dimensions(void)        { return _ndimension; };
 | 
			
		||||
int                      CartesianCommunicator::IsBoss(void)            { return _processor==0; };
 | 
			
		||||
int                      CartesianCommunicator::BossRank(void)          { return 0; };
 | 
			
		||||
int                      CartesianCommunicator::ThisRank(void)          { return _processor; };
 | 
			
		||||
@@ -89,25 +96,142 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 | 
			
		||||
  GlobalSumVector((double *)c,2*N);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
 | 
			
		||||
 | 
			
		||||
#if defined( GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT)
 | 
			
		||||
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent) 
 | 
			
		||||
{
 | 
			
		||||
  _ndimension = processors.size();
 | 
			
		||||
  assert(_ndimension == parent._ndimension);
 | 
			
		||||
  
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // split the communicator
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  int Nparent;
 | 
			
		||||
  MPI_Comm_size(parent.communicator,&Nparent);
 | 
			
		||||
 | 
			
		||||
  int childsize=1;
 | 
			
		||||
  for(int d=0;d<processors.size();d++) {
 | 
			
		||||
    childsize *= processors[d];
 | 
			
		||||
  }
 | 
			
		||||
  int Nchild = Nparent/childsize;
 | 
			
		||||
  assert (childsize * Nchild == Nparent);
 | 
			
		||||
 | 
			
		||||
  int prank;  MPI_Comm_rank(parent.communicator,&prank);
 | 
			
		||||
  int crank = prank % childsize;
 | 
			
		||||
  int ccomm = prank / childsize;
 | 
			
		||||
 | 
			
		||||
  MPI_Comm comm_split;
 | 
			
		||||
  if ( Nchild > 1 ) { 
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
 | 
			
		||||
    std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"]    ";
 | 
			
		||||
    for(int d=0;d<parent._processors.size();d++)  std::cout << parent._processors[d] << " ";
 | 
			
		||||
    std::cout<<std::endl;
 | 
			
		||||
 | 
			
		||||
    std::cout << GridLogMessage<<" child grid["<< _ndimension <<"]    ";
 | 
			
		||||
    for(int d=0;d<processors.size();d++)  std::cout << processors[d] << " ";
 | 
			
		||||
    std::cout<<std::endl;
 | 
			
		||||
 | 
			
		||||
    int ierr= MPI_Comm_split(parent.communicator, ccomm,crank,&comm_split);
 | 
			
		||||
    assert(ierr==0);
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Declare victory
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
 | 
			
		||||
	      <<Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
 | 
			
		||||
  } else {
 | 
			
		||||
    comm_split=parent.communicator;
 | 
			
		||||
    //    std::cout << "Passed parental communicator to a new communicator" <<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Set up from the new split communicator
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  InitFromMPICommunicator(processors,comm_split);
 | 
			
		||||
}
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Take an MPI_Comm and self assemble
 | 
			
		||||
//////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
 | 
			
		||||
{
 | 
			
		||||
  //  if ( communicator_base != communicator_world ) {
 | 
			
		||||
  //    std::cout << "Cartesian communicator created with a non-world communicator"<<std::endl;
 | 
			
		||||
  //  }
 | 
			
		||||
  _ndimension = processors.size();
 | 
			
		||||
  _processor_coor.resize(_ndimension);
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////
 | 
			
		||||
  // Count the requested nodes
 | 
			
		||||
  /////////////////////////////////
 | 
			
		||||
  _Nprocessors=1;
 | 
			
		||||
  _processors = processors;
 | 
			
		||||
  for(int i=0;i<_ndimension;i++){
 | 
			
		||||
    _Nprocessors*=_processors[i];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  std::vector<int> periodic(_ndimension,1);
 | 
			
		||||
  MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],1,&communicator);
 | 
			
		||||
  MPI_Comm_rank(communicator,&_processor);
 | 
			
		||||
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
 | 
			
		||||
 | 
			
		||||
  int Size;
 | 
			
		||||
  MPI_Comm_size(communicator,&Size);
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_COMMS_MPIT
 | 
			
		||||
  communicator_halo.resize (2*_ndimension);
 | 
			
		||||
  for(int i=0;i<_ndimension*2;i++){
 | 
			
		||||
    MPI_Comm_dup(communicator,&communicator_halo[i]);
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  
 | 
			
		||||
  assert(Size==_Nprocessors);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) 
 | 
			
		||||
{
 | 
			
		||||
  InitFromMPICommunicator(processors,communicator_world);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
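The constructor above carves the parent communicator into Nparent/childsize children using color = parent_rank / childsize and key = parent_rank % childsize. The same arithmetic in a standalone MPI program (childsize is hard-coded here; in Grid it is the product of the requested child processor grid, and Nparent is assumed divisible by it, as asserted above):

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int prank, Nparent;
  MPI_Comm_rank(MPI_COMM_WORLD, &prank);
  MPI_Comm_size(MPI_COMM_WORLD, &Nparent);

  const int childsize = 2;             // product of the child processor grid
  int color = prank / childsize;       // which child communicator this rank joins
  int key   = prank % childsize;       // rank ordering inside that child

  MPI_Comm child;
  MPI_Comm_split(MPI_COMM_WORLD, color, key, &child);

  int crank, csize;
  MPI_Comm_rank(child, &crank);
  MPI_Comm_size(child, &csize);
  printf("world rank %d -> child %d, rank %d of %d\n", prank, color, crank, csize);

  MPI_Comm_free(&child);
  MPI_Finalize();
  return 0;
}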
 | 
			
		||||
#if !defined( GRID_COMMS_MPI3) 
 | 
			
		||||
 | 
			
		||||
int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
 | 
			
		||||
int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
 | 
			
		||||
 | 
			
		||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						       void *xmit,
 | 
			
		||||
						       int xmit_to_rank,
 | 
			
		||||
						       void *recv,
 | 
			
		||||
						       int recv_from_rank,
 | 
			
		||||
						       int bytes)
 | 
			
		||||
#endif
 | 
			
		||||
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
 | 
			
		||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 | 
			
		||||
						     int xmit_to_rank,
 | 
			
		||||
						     void *recv,
 | 
			
		||||
						     int recv_from_rank,
 | 
			
		||||
						     int bytes, int dir)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<CommsRequest_t> list;
 | 
			
		||||
  // Discard the "dir"
 | 
			
		||||
  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
 | 
			
		||||
  SendToRecvFromComplete(list);
 | 
			
		||||
  return 2.0*bytes;
 | 
			
		||||
}
 | 
			
		||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
							 void *xmit,
 | 
			
		||||
							 int xmit_to_rank,
 | 
			
		||||
							 void *recv,
 | 
			
		||||
							 int recv_from_rank,
 | 
			
		||||
							 int bytes, int dir)
 | 
			
		||||
{
 | 
			
		||||
  // Discard the "dir"
 | 
			
		||||
  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
 | 
			
		||||
  return 2.0*bytes;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
 | 
			
		||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 | 
			
		||||
{
 | 
			
		||||
  SendToRecvFromComplete(waitall);
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if !defined( GRID_COMMS_MPI3) 
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::StencilBarrier(void){};
 | 
			
		||||
 | 
			
		||||
commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
 | 
			
		||||
@@ -121,8 +245,30 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
 | 
			
		||||
  return NULL;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::ShmInitGeneric(void){
 | 
			
		||||
#if 1
 | 
			
		||||
  int mmap_flag =0;
 | 
			
		||||
#ifdef MAP_ANONYMOUS
 | 
			
		||||
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef MAP_ANON
 | 
			
		||||
  mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef MAP_HUGETLB
 | 
			
		||||
  if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
 | 
			
		||||
#endif
 | 
			
		||||
  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
 | 
			
		||||
  if (ShmCommBuf == (void *)MAP_FAILED) {
 | 
			
		||||
    perror("mmap failed ");
 | 
			
		||||
    exit(EXIT_FAILURE);  
 | 
			
		||||
  }
 | 
			
		||||
#ifdef MADV_HUGEPAGE
 | 
			
		||||
  if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
 | 
			
		||||
#endif
 | 
			
		||||
#else 
 | 
			
		||||
  ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
 | 
			
		||||
  ShmCommBuf=(void *)&ShmBufStorageVector[0];
 | 
			
		||||
#endif
 | 
			
		||||
  bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
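ShmInitGeneric() above maps an anonymous shared region, adds MAP_HUGETLB only when explicit huge pages are requested, and otherwise falls back to an MADV_HUGEPAGE hint for transparent huge pages. A Linux-only sketch of that fallback strategy with a smaller buffer:

#include <sys/mman.h>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>

int main() {
  const size_t bytes = 64UL * 1024 * 1024;        // smaller than the 1 GB MAX_MPI_SHM_BYTES above
  bool explicit_hugepages = false;                // cf. CartesianCommunicator::Hugepages

  int flags = MAP_SHARED | MAP_ANONYMOUS;
#ifdef MAP_HUGETLB
  if (explicit_hugepages) flags |= MAP_HUGETLB;   // requires a preallocated hugetlbfs pool
#endif
  void *buf = mmap(NULL, bytes, PROT_READ | PROT_WRITE, flags, -1, 0);
  if (buf == MAP_FAILED) { perror("mmap"); return EXIT_FAILURE; }
#ifdef MADV_HUGEPAGE
  if (!explicit_hugepages) madvise(buf, bytes, MADV_HUGEPAGE);  // ask for transparent huge pages
#endif
  memset(buf, 0, bytes);                          // touch the region, like the bzero() above
  munmap(buf, bytes);
  return 0;
}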
 
 | 
			
		||||
@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifdef GRID_COMMS_MPI3
 | 
			
		||||
#include <mpi.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_COMMS_MPI3L
 | 
			
		||||
#ifdef GRID_COMMS_MPIT
 | 
			
		||||
#include <mpi.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_COMMS_SHMEM
 | 
			
		||||
@@ -50,12 +50,24 @@ namespace Grid {
 | 
			
		||||
class CartesianCommunicator {
 | 
			
		||||
  public:    
 | 
			
		||||
 | 
			
		||||
  // 65536 ranks per node adequate for now
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////
 | 
			
		||||
  // Isend/Irecv/Wait, or Sendrecv blocking
 | 
			
		||||
  ////////////////////////////////////////////
 | 
			
		||||
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
 | 
			
		||||
  static CommunicatorPolicy_t CommunicatorPolicy;
 | 
			
		||||
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////
 | 
			
		||||
  // Up to 65536 ranks per node adequate for now
 | 
			
		||||
  // 128MB shared memory for comms enough for 48^4 local vol comms
 | 
			
		||||
  // Give external control (command line override?) of this
 | 
			
		||||
 | 
			
		||||
  static const int      MAXLOG2RANKSPERNODE = 16;            
 | 
			
		||||
  static uint64_t MAX_MPI_SHM_BYTES;
 | 
			
		||||
  ///////////////////////////////////////////
 | 
			
		||||
  static const int MAXLOG2RANKSPERNODE = 16;            
 | 
			
		||||
  static uint64_t  MAX_MPI_SHM_BYTES;
 | 
			
		||||
  static int       nCommThreads;
 | 
			
		||||
  // use explicit huge pages
 | 
			
		||||
  static int       Hugepages;
 | 
			
		||||
 | 
			
		||||
  // Communicator should know nothing of the physics grid, only processor grid.
 | 
			
		||||
  int              _Nprocessors;     // How many in all
 | 
			
		||||
@@ -64,14 +76,19 @@ class CartesianCommunicator {
 | 
			
		||||
  std::vector<int> _processor_coor;  // linear processor coordinate
 | 
			
		||||
  unsigned long _ndimension;
 | 
			
		||||
 | 
			
		||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
 | 
			
		||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
 | 
			
		||||
  static MPI_Comm communicator_world;
 | 
			
		||||
         MPI_Comm communicator;
 | 
			
		||||
 | 
			
		||||
  MPI_Comm              communicator;
 | 
			
		||||
  std::vector<MPI_Comm> communicator_halo;
 | 
			
		||||
 | 
			
		||||
  typedef MPI_Request CommsRequest_t;
 | 
			
		||||
 | 
			
		||||
#else 
 | 
			
		||||
  typedef int CommsRequest_t;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Helper functionality for SHM Windows common to all other impls
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -117,11 +134,7 @@ class CartesianCommunicator {
 | 
			
		||||
  /////////////////////////////////
 | 
			
		||||
  static void * ShmCommBuf;
 | 
			
		||||
 | 
			
		||||
  // Isend/Irecv/Wait, or Sendrecv blocking
 | 
			
		||||
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
 | 
			
		||||
  static CommunicatorPolicy_t CommunicatorPolicy;
 | 
			
		||||
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
 | 
			
		||||
 | 
			
		||||
  
 | 
			
		||||
  size_t heap_top;
 | 
			
		||||
  size_t heap_bytes;
 | 
			
		||||
 | 
			
		||||
@@ -135,11 +148,23 @@ class CartesianCommunicator {
 | 
			
		||||
  // Must call in Grid startup
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  static void Init(int *argc, char ***argv);
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  // Constructor of any given grid
 | 
			
		||||
  // Constructors to sub-divide a parent communicator
 | 
			
		||||
  // and default to comm world
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent);
 | 
			
		||||
  CartesianCommunicator(const std::vector<int> &pdimensions_in);
 | 
			
		||||
 | 
			
		||||
 private:
 | 
			
		||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPIT) 
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  // Private initialise from an MPI communicator
 | 
			
		||||
  // Can use after an MPI_Comm_split, but hidden from user so private
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  void InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base);
 | 
			
		||||
#endif
 | 
			
		||||
 public:
 | 
			
		||||
  
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Wraps MPI_Cart routines, or implements equivalent on other impls
 | 
			
		||||
@@ -211,14 +236,21 @@ class CartesianCommunicator {
 | 
			
		||||
  
 | 
			
		||||
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
 | 
			
		||||
 | 
			
		||||
  double StencilSendToRecvFrom(void *xmit,
 | 
			
		||||
			       int xmit_to_rank,
 | 
			
		||||
			       void *recv,
 | 
			
		||||
			       int recv_from_rank,
 | 
			
		||||
			       int bytes,int dir);
 | 
			
		||||
 | 
			
		||||
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
				  void *xmit,
 | 
			
		||||
				  int xmit_to_rank,
 | 
			
		||||
				  void *recv,
 | 
			
		||||
				  int recv_from_rank,
 | 
			
		||||
				  int bytes);
 | 
			
		||||
				    void *xmit,
 | 
			
		||||
				    int xmit_to_rank,
 | 
			
		||||
				    void *recv,
 | 
			
		||||
				    int recv_from_rank,
 | 
			
		||||
				    int bytes,int dir);
 | 
			
		||||
  
 | 
			
		||||
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
 | 
			
		||||
  
 | 
			
		||||
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
 | 
			
		||||
  void StencilBarrier(void);
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
 
@@ -52,29 +52,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  ShmInitGeneric();
}

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  _ndimension = processors.size();
  std::vector<int> periodic(_ndimension,1);

  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);

  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);

  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }

  int Size;
  MPI_Comm_size(communicator,&Size);

  assert(Size==_Nprocessors);
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
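The hunk ends inside GlobalSum(); the underlying pattern is an in-place MPI_Allreduce over the Cartesian communicator, shown here as a standalone program for a single uint32_t (MPI_UINT32_T needs an MPI 2.2 implementation):

#include <mpi.h>
#include <cstdint>
#include <cstdio>
#include <cassert>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  uint32_t u = 1;                               // every rank contributes 1
  int ierr = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_SUM, MPI_COMM_WORLD);
  assert(ierr == 0);
  if (rank == 0) printf("sum over %d ranks = %u\n", size, u);

  MPI_Finalize();
  return 0;
}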
 
 | 
			
		||||
@@ -37,11 +37,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <sys/ipc.h>
 | 
			
		||||
#include <sys/shm.h>
 | 
			
		||||
#include <sys/mman.h>
 | 
			
		||||
//#include <zlib.h>
 | 
			
		||||
#ifndef SHM_HUGETLB
 | 
			
		||||
#define SHM_HUGETLB 04000
 | 
			
		||||
#include <zlib.h>
 | 
			
		||||
#ifdef HAVE_NUMAIF_H
 | 
			
		||||
#include <numaif.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -197,7 +198,46 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
  ShmCommBuf = 0;
 | 
			
		||||
  ShmCommBufs.resize(ShmSize);
 | 
			
		||||
 | 
			
		||||
#if 1
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Hugetlbf and others map filesystems as mappable huge pages
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#ifdef GRID_MPI3_SHMMMAP
 | 
			
		||||
  char shm_name [NAME_MAX];
 | 
			
		||||
  for(int r=0;r<ShmSize;r++){
 | 
			
		||||
    
 | 
			
		||||
    size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
 | 
			
		||||
    sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r);
 | 
			
		||||
    //sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
 | 
			
		||||
    //    printf("Opening file %s \n",shm_name);
 | 
			
		||||
    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
 | 
			
		||||
    if ( fd == -1) { 
 | 
			
		||||
      printf("open %s failed\n",shm_name);
 | 
			
		||||
      perror("open hugetlbfs");
 | 
			
		||||
      exit(0);
 | 
			
		||||
    }
 | 
			
		||||
    int mmap_flag = MAP_SHARED ;
 | 
			
		||||
#ifdef MAP_POPULATE    
 | 
			
		||||
    mmap_flag|=MAP_POPULATE;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef MAP_HUGETLB
 | 
			
		||||
    if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
 | 
			
		||||
#endif
 | 
			
		||||
    void *ptr = (void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
 | 
			
		||||
    if ( ptr == (void *)MAP_FAILED ) {    
 | 
			
		||||
      printf("mmap %s failed\n",shm_name);
 | 
			
		||||
      perror("failed mmap");      assert(0);    
 | 
			
		||||
    }
 | 
			
		||||
    assert(((uint64_t)ptr&0x3F)==0);
 | 
			
		||||
    ShmCommBufs[r] =ptr;
 | 
			
		||||
    
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
 | 
			
		||||
  // tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for 
 | 
			
		||||
  // the posix shm virtual file system
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#ifdef GRID_MPI3_SHMOPEN
 | 
			
		||||
  char shm_name [NAME_MAX];
 | 
			
		||||
  if ( ShmRank == 0 ) {
 | 
			
		||||
    for(int r=0;r<ShmSize;r++){
 | 
			
		||||
@@ -210,11 +250,39 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
 | 
			
		||||
      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
 | 
			
		||||
      ftruncate(fd, size);
 | 
			
		||||
      
 | 
			
		||||
      int mmap_flag = MAP_SHARED;
 | 
			
		||||
#ifdef MAP_POPULATE 
 | 
			
		||||
      mmap_flag |= MAP_POPULATE;
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef MAP_HUGETLB
 | 
			
		||||
      if (Hugepages) mmap_flag |= MAP_HUGETLB;
 | 
			
		||||
#endif
 | 
			
		||||
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
 | 
			
		||||
 | 
			
		||||
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 | 
			
		||||
      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
 | 
			
		||||
      if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
 | 
			
		||||
      assert(((uint64_t)ptr&0x3F)==0);
 | 
			
		||||
      ShmCommBufs[r] =ptr;
 | 
			
		||||
 | 
			
		||||
// Experiments: try to force numa domain on the shm segment if we have numaif.h
 | 
			
		||||
#if 0
 | 
			
		||||
//#ifdef HAVE_NUMAIF_H
 | 
			
		||||
	int status;
 | 
			
		||||
	int flags=MPOL_MF_MOVE;
 | 
			
		||||
#ifdef KNL
 | 
			
		||||
	int nodes=1; // numa domain == MCDRAM
 | 
			
		||||
	// Find out if in SNC2,SNC4 mode ?
 | 
			
		||||
#else
 | 
			
		||||
	int nodes=r; // numa domain == MPI ID
 | 
			
		||||
#endif
 | 
			
		||||
	unsigned long count=1;
 | 
			
		||||
	for(uint64_t page=0;page<size;page+=4096){
 | 
			
		||||
	  void *pages = (void *) ( page + (uint64_t)ptr );
 | 
			
		||||
	  uint64_t *cow_it = (uint64_t *)pages;	*cow_it = 1;
 | 
			
		||||
	  ierr= move_pages(0,count, &pages,&nodes,&status,flags);
 | 
			
		||||
	  if (ierr && (page==0)) perror("numa relocate command failed");
 | 
			
		||||
	}
 | 
			
		||||
#endif
 | 
			
		||||
	ShmCommBufs[r] =ptr;
 | 
			
		||||
      
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
@@ -236,21 +304,32 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
      ShmCommBufs[r] =ptr;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
#else
 | 
			
		||||
#endif
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // SHMGET SHMAT and SHM_HUGETLB flag
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#ifdef GRID_MPI3_SHMGET
 | 
			
		||||
  std::vector<int> shmids(ShmSize);
 | 
			
		||||
 | 
			
		||||
  if ( ShmRank == 0 ) {
 | 
			
		||||
    for(int r=0;r<ShmSize;r++){
 | 
			
		||||
      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
 | 
			
		||||
      key_t key   = 0x4545 + r;
 | 
			
		||||
      if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
 | 
			
		||||
      key_t key   = IPC_PRIVATE;
 | 
			
		||||
      int flags = IPC_CREAT | SHM_R | SHM_W;
 | 
			
		||||
#ifdef SHM_HUGETLB
 | 
			
		||||
      if (Hugepages) flags|=SHM_HUGETLB;
 | 
			
		||||
#endif
 | 
			
		||||
      if ((shmids[r]= shmget(key,size, flags)) ==-1) {
 | 
			
		||||
	int errsv = errno;
 | 
			
		||||
	printf("Errno %d\n",errsv);
 | 
			
		||||
	printf("key   %d\n",key);
 | 
			
		||||
	printf("size  %lld\n",size);
 | 
			
		||||
	printf("flags %d\n",flags);
 | 
			
		||||
	perror("shmget");
 | 
			
		||||
	exit(1);
 | 
			
		||||
      } else { 
 | 
			
		||||
	printf("shmid: 0x%x\n", shmids[r]);
 | 
			
		||||
      }
 | 
			
		||||
      printf("shmid: 0x%x\n", shmids[r]);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  MPI_Barrier(ShmComm);
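For reference, a minimal standalone sketch (not Grid code) of the SysV route this GRID_MPI3_SHMGET branch takes: shmget with IPC_PRIVATE and optionally SHM_HUGETLB, then shmat; the segment size is a placeholder.

#include <cstdio>
#include <cstdlib>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
  const size_t size = 64UL*1024UL*1024UL;           // stand-in for MAX_MPI_SHM_BYTES
  int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
  // flags |= SHM_HUGETLB;                          // enable when huge pages are configured
#endif
  int shmid = shmget(IPC_PRIVATE, size, flags);
  if ( shmid == -1 ) { perror("shmget"); exit(EXIT_FAILURE); }

  void *ptr = shmat(shmid, NULL, 0);
  if ( ptr == (void *)-1 ) { perror("shmat"); exit(EXIT_FAILURE); }

  shmdt(ptr);
  shmctl(shmid, IPC_RMID, NULL);                    // mark for removal once detached
  return 0;
}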
 | 
			
		||||
@@ -371,12 +450,27 @@ void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &c
 | 
			
		||||
  assert(lr!=-1);
 | 
			
		||||
  Lexicographic::CoorFromIndex(coor,lr,_processors);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//////////////////////////////////
 | 
			
		||||
// Try to subdivide communicator
 | 
			
		||||
//////////////////////////////////
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent) 
 | 
			
		||||
  : CartesianCommunicator(processors) 
 | 
			
		||||
{
 | 
			
		||||
  std::cout << "Attempts to split MPI3 communicators will fail until implemented" <<std::endl;
 | 
			
		||||
}
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
			
		||||
{ 
 | 
			
		||||
  int ierr;
 | 
			
		||||
  communicator=communicator_world;
 | 
			
		||||
 | 
			
		||||
  _ndimension = processors.size();
 | 
			
		||||
 | 
			
		||||
  communicator_halo.resize (2*_ndimension);
 | 
			
		||||
  for(int i=0;i<_ndimension*2;i++){
 | 
			
		||||
    MPI_Comm_dup(communicator,&communicator_halo[i]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Assert power of two shm_size.
 | 
			
		||||
  ////////////////////////////////////////////////////////////////
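The constructor above duplicates the base communicator once per halo direction so that traffic in different stencil directions is matched independently by MPI. A bare-MPI sketch of that duplication (standalone, names chosen for illustration, not Grid's class):

#include <mpi.h>
#include <vector>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);

  int ndimension = 4;
  std::vector<MPI_Comm> communicator_halo(2*ndimension);
  for(int i=0;i<2*ndimension;i++){
    MPI_Comm_dup(MPI_COMM_WORLD, &communicator_halo[i]);  // one communicator per direction
  }

  // ... sends and receives for direction dir would use communicator_halo[dir] ...

  for(int i=0;i<2*ndimension;i++) MPI_Comm_free(&communicator_halo[i]);
  MPI_Finalize();
  return 0;
}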
 | 
			
		||||
@@ -599,13 +693,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						       void *xmit,
 | 
			
		||||
						       int dest,
 | 
			
		||||
						       void *recv,
 | 
			
		||||
						       int from,
 | 
			
		||||
						       int bytes)
 | 
			
		||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 | 
			
		||||
						     int dest,
 | 
			
		||||
						     void *recv,
 | 
			
		||||
						     int from,
 | 
			
		||||
						     int bytes,int dir)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<CommsRequest_t> list;
 | 
			
		||||
  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
 | 
			
		||||
  StencilSendToRecvFromComplete(list,dir);
 | 
			
		||||
  return offbytes;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
							 void *xmit,
 | 
			
		||||
							 int dest,
 | 
			
		||||
							 void *recv,
 | 
			
		||||
							 int from,
 | 
			
		||||
							 int bytes,int dir)
 | 
			
		||||
{
 | 
			
		||||
  assert(dir < communicator_halo.size());
 | 
			
		||||
 | 
			
		||||
  MPI_Request xrq;
 | 
			
		||||
  MPI_Request rrq;
 | 
			
		||||
 | 
			
		||||
@@ -624,26 +732,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 | 
			
		||||
  gfrom = MPI_UNDEFINED;
 | 
			
		||||
#endif
 | 
			
		||||
  if ( gfrom ==MPI_UNDEFINED) {
 | 
			
		||||
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
 | 
			
		||||
    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
 | 
			
		||||
    assert(ierr==0);
 | 
			
		||||
    list.push_back(rrq);
 | 
			
		||||
    off_node_bytes+=bytes;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( gdest == MPI_UNDEFINED ) {
 | 
			
		||||
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
 | 
			
		||||
    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
 | 
			
		||||
    assert(ierr==0);
 | 
			
		||||
    list.push_back(xrq);
 | 
			
		||||
    off_node_bytes+=bytes;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
 | 
			
		||||
    this->StencilSendToRecvFromComplete(list);
 | 
			
		||||
    this->StencilSendToRecvFromComplete(list,dir);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  return off_node_bytes;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
 | 
			
		||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 | 
			
		||||
{
 | 
			
		||||
  SendToRecvFromComplete(waitall);
 | 
			
		||||
}
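The Begin/Complete pair above amounts to posting a non-blocking receive and send, collecting the requests, and waiting on them later. A self-contained ring-exchange sketch of that pattern in plain MPI (not Grid code):

#include <mpi.h>
#include <vector>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int dest = (rank + 1) % size;                     // right neighbour
  int from = (rank - 1 + size) % size;              // left neighbour
  std::vector<char> xmit(1024, (char)rank), recv(1024, 0);

  std::vector<MPI_Request> list;
  MPI_Request xrq, rrq;
  MPI_Irecv(&recv[0], (int)recv.size(), MPI_CHAR, from, from, MPI_COMM_WORLD, &rrq);
  MPI_Isend(&xmit[0], (int)xmit.size(), MPI_CHAR, dest, rank, MPI_COMM_WORLD, &xrq);
  list.push_back(rrq);
  list.push_back(xrq);

  // ... local compute could overlap here ...

  MPI_Waitall((int)list.size(), &list[0], MPI_STATUSES_IGNORE);
  MPI_Finalize();
  return 0;
}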
 | 
			
		||||
 
 259  lib/communicator/Communicator_mpit.cc  (new file)
@@ -0,0 +1,259 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/communicator/Communicator_mpi.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/GridCore.h>
 | 
			
		||||
#include <Grid/GridQCDcore.h>
 | 
			
		||||
#include <Grid/qcd/action/ActionCore.h>
 | 
			
		||||
#include <mpi.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Info that is set up once and independent of Cartesian layout
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
MPI_Comm CartesianCommunicator::communicator_world;
 | 
			
		||||
 | 
			
		||||
// Should error check all MPI calls.
 | 
			
		||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
  int flag;
 | 
			
		||||
  int provided;
 | 
			
		||||
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
 | 
			
		||||
  if ( !flag ) {
 | 
			
		||||
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
 | 
			
		||||
    if ( provided != MPI_THREAD_MULTIPLE ) {
 | 
			
		||||
      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
 | 
			
		||||
  ShmInitGeneric();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalXOR(uint32_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalXOR(uint64_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
 | 
			
		||||
{
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(double &d)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
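The GlobalSum/GlobalXOR wrappers above all rely on the in-place form of MPI_Allreduce. A small standalone example of the idiom (not Grid code):

#include <mpi.h>
#include <cstdio>
#include <cstdint>
#include <cassert>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  uint32_t u = rank;                                // each rank contributes its rank id
  double   d = 1.0;                                 // and a count of one
  int ierr;
  ierr = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_SUM, MPI_COMM_WORLD);
  assert(ierr == 0);
  ierr = MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE,   MPI_SUM, MPI_COMM_WORLD);
  assert(ierr == 0);

  if ( rank == 0 ) printf("sum of ranks %u over %g ranks\n", u, d);
  MPI_Finalize();
  return 0;
}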
 | 
			
		||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 | 
			
		||||
{
 | 
			
		||||
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 | 
			
		||||
{
 | 
			
		||||
  int rank;
 | 
			
		||||
  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
  return rank;
 | 
			
		||||
}
 | 
			
		||||
void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 | 
			
		||||
{
 | 
			
		||||
  coor.resize(_ndimension);
 | 
			
		||||
  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
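ShiftedRanks, RankFromProcessorCoor and ProcessorCoorFromRank above are thin wrappers over MPI's Cartesian topology. A standalone sketch of the underlying calls, assuming a 2-d layout purely for illustration:

#include <mpi.h>
#include <vector>
#include <cstdio>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);

  int ndim = 2;
  std::vector<int> dims(ndim, 0), periods(ndim, 1), coor(ndim, 0);
  int nprocs; MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
  MPI_Dims_create(nprocs, ndim, &dims[0]);          // factor the ranks over a 2-d grid

  MPI_Comm cart;
  MPI_Cart_create(MPI_COMM_WORLD, ndim, &dims[0], &periods[0], 0, &cart);

  int rank;  MPI_Comm_rank(cart, &rank);
  MPI_Cart_coords(cart, rank, ndim, &coor[0]);      // rank -> processor coordinate
  int source, dest;
  MPI_Cart_shift(cart, 0, 1, &source, &dest);       // neighbours in dimension 0

  printf("rank %d at (%d,%d): recv from %d, send to %d\n",
         rank, coor[0], coor[1], source, dest);

  MPI_Comm_free(&cart);
  MPI_Finalize();
  return 0;
}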
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
 | 
			
		||||
					   int dest,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int from,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<CommsRequest_t> reqs(0);
 | 
			
		||||
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
 | 
			
		||||
  SendToRecvFromComplete(reqs);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int sender,
 | 
			
		||||
					   int receiver,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  MPI_Status stat;
 | 
			
		||||
  assert(sender != receiver);
 | 
			
		||||
  int tag = sender;
 | 
			
		||||
  if ( _processor == sender ) {
 | 
			
		||||
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
 | 
			
		||||
  }
 | 
			
		||||
  if ( _processor == receiver ) { 
 | 
			
		||||
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						void *xmit,
 | 
			
		||||
						int dest,
 | 
			
		||||
						void *recv,
 | 
			
		||||
						int from,
 | 
			
		||||
						int bytes)
 | 
			
		||||
{
 | 
			
		||||
  int myrank = _processor;
 | 
			
		||||
  int ierr;
 | 
			
		||||
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
 | 
			
		||||
    MPI_Request xrq;
 | 
			
		||||
    MPI_Request rrq;
 | 
			
		||||
 | 
			
		||||
    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
 | 
			
		||||
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
 | 
			
		||||
    
 | 
			
		||||
    assert(ierr==0);
 | 
			
		||||
    list.push_back(xrq);
 | 
			
		||||
    list.push_back(rrq);
 | 
			
		||||
  } else { 
 | 
			
		||||
    // Give the CPU to MPI immediately; can use threads to overlap optionally
 | 
			
		||||
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
 | 
			
		||||
		      recv,bytes,MPI_CHAR,from, from,
 | 
			
		||||
		      communicator,MPI_STATUS_IGNORE);
 | 
			
		||||
    assert(ierr==0);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
 | 
			
		||||
    int nreq=list.size();
 | 
			
		||||
    std::vector<MPI_Status> status(nreq);
 | 
			
		||||
    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
 | 
			
		||||
    assert(ierr==0);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Barrier(void)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Barrier(communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 | 
			
		||||
{
 | 
			
		||||
  int ierr=MPI_Bcast(data,
 | 
			
		||||
		     bytes,
 | 
			
		||||
		     MPI_BYTE,
 | 
			
		||||
		     root,
 | 
			
		||||
		     communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
  ///////////////////////////////////////////////////////
 | 
			
		||||
  // Should only be used prior to Grid Init finished.
 | 
			
		||||
  // Check for this?
 | 
			
		||||
  ///////////////////////////////////////////////////////
 | 
			
		||||
int CartesianCommunicator::RankWorld(void){ 
 | 
			
		||||
  int r; 
 | 
			
		||||
  MPI_Comm_rank(communicator_world,&r);
 | 
			
		||||
  return r;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 | 
			
		||||
{
 | 
			
		||||
  int ierr= MPI_Bcast(data,
 | 
			
		||||
		      bytes,
 | 
			
		||||
		      MPI_BYTE,
 | 
			
		||||
		      root,
 | 
			
		||||
		      communicator_world);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
							 void *xmit,
 | 
			
		||||
							 int xmit_to_rank,
 | 
			
		||||
							 void *recv,
 | 
			
		||||
							 int recv_from_rank,
 | 
			
		||||
							 int bytes,int dir)
 | 
			
		||||
{
 | 
			
		||||
  int myrank = _processor;
 | 
			
		||||
  int ierr;
 | 
			
		||||
  assert(dir < communicator_halo.size());
 | 
			
		||||
  
 | 
			
		||||
  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
 | 
			
		||||
  // Give the CPU to MPI immediately; can use threads to overlap optionally
 | 
			
		||||
  MPI_Request req[2];
 | 
			
		||||
  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
 | 
			
		||||
  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
 | 
			
		||||
 | 
			
		||||
  list.push_back(req[0]);
 | 
			
		||||
  list.push_back(req[1]);
 | 
			
		||||
  return 2.0*bytes;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 | 
			
		||||
{ 
 | 
			
		||||
  int nreq=waitall.size();
 | 
			
		||||
  MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
 | 
			
		||||
};
 | 
			
		||||
double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
 | 
			
		||||
						    int xmit_to_rank,
 | 
			
		||||
						    void *recv,
 | 
			
		||||
						    int recv_from_rank,
 | 
			
		||||
						    int bytes,int dir)
 | 
			
		||||
{
 | 
			
		||||
  int myrank = _processor;
 | 
			
		||||
  int ierr;
 | 
			
		||||
  assert(dir < communicator_halo.size());
 | 
			
		||||
  
 | 
			
		||||
  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
 | 
			
		||||
  // Give the CPU to MPI immediately; can use threads to overlap optionally
 | 
			
		||||
  MPI_Request req[2];
 | 
			
		||||
  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
 | 
			
		||||
  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
 | 
			
		||||
  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
 | 
			
		||||
  return 2.0*bytes;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -38,6 +38,9 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 | 
			
		||||
  ShmInitGeneric();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent) 
 | 
			
		||||
  : CartesianCommunicator(processors) {}
 | 
			
		||||
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
			
		||||
{
 | 
			
		||||
  _processors = processors;
 | 
			
		||||
 
 | 
			
		||||
@@ -75,6 +75,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
  ShmInitGeneric();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent) 
 | 
			
		||||
  : CartesianCommunicator(processors) 
 | 
			
		||||
{
 | 
			
		||||
  std::cout << "Attempts to split SHMEM communicators will fail " <<std::endl;
 | 
			
		||||
}
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
			
		||||
{
 | 
			
		||||
  _ndimension = processors.size();
 | 
			
		||||
 
 | 
			
		||||
@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#include <Grid/cshift/Cshift_mpi.h>
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
#ifdef GRID_COMMS_MPI3L
 | 
			
		||||
#ifdef GRID_COMMS_MPIT
 | 
			
		||||
#include <Grid/cshift/Cshift_mpi.h>
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
 
 16252  lib/json/json.hpp  (file diff suppressed because it is too large)
@@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 | 
			
		||||
{
 | 
			
		||||
  int NN    = BlockSolverGrid->_ndimension;
 | 
			
		||||
@@ -387,6 +388,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
 | 
			
		||||
  }
 | 
			
		||||
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 | 
			
		||||
}
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
template<class vobj>
 | 
			
		||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 | 
			
		||||
@@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
 | 
			
		||||
  int Nblock = X._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
 | 
			
		||||
  GridBase *FullGrid  = X._grid;
 | 
			
		||||
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
 | 
			
		||||
  Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
 | 
			
		||||
  assert( FullGrid->_simd_layout[Orthog]==1);
 | 
			
		||||
  int nh =  FullGrid->_ndimension;
 | 
			
		||||
  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  //  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  int nl = nh-1;
 | 
			
		||||
 | 
			
		||||
  //FIXME package in a convenient iterator
 | 
			
		||||
  //Should loop over a plane orthogonal to direction "Orthog"
 | 
			
		||||
@@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
 | 
			
		||||
  int Nblock = X._grid->GlobalDimensions()[Orthog];
 | 
			
		||||
 | 
			
		||||
  GridBase *FullGrid  = X._grid;
 | 
			
		||||
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
 | 
			
		||||
  Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  //  Lattice<vobj> Xslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
 | 
			
		||||
  assert( FullGrid->_simd_layout[Orthog]==1);
 | 
			
		||||
  int nh =  FullGrid->_ndimension;
 | 
			
		||||
  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  //  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  int nl=1;
 | 
			
		||||
 | 
			
		||||
  //FIXME package in a convenient iterator
 | 
			
		||||
  //Should loop over a plane orthogonal to direction "Orthog"
 | 
			
		||||
@@ -498,18 +501,19 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 | 
			
		||||
  typedef typename vobj::vector_type vector_type;
 | 
			
		||||
  
 | 
			
		||||
  GridBase *FullGrid  = lhs._grid;
 | 
			
		||||
  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 | 
			
		||||
  
 | 
			
		||||
  int Nblock = FullGrid->GlobalDimensions()[Orthog];
 | 
			
		||||
  
 | 
			
		||||
  Lattice<vobj> Lslice(SliceGrid);
 | 
			
		||||
  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Lslice(SliceGrid);
 | 
			
		||||
  //  Lattice<vobj> Rslice(SliceGrid);
 | 
			
		||||
  
 | 
			
		||||
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 | 
			
		||||
 | 
			
		||||
  assert( FullGrid->_simd_layout[Orthog]==1);
 | 
			
		||||
  int nh =  FullGrid->_ndimension;
 | 
			
		||||
  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  //  int nl = SliceGrid->_ndimension;
 | 
			
		||||
  int nl = nh-1;
 | 
			
		||||
 | 
			
		||||
  //FIXME package in a convenient iterator
 | 
			
		||||
  //Should loop over a plane orthogonal to direction "Orthog"
 | 
			
		||||
@@ -540,7 +544,7 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 | 
			
		||||
      for(int i=0;i<Nblock;i++){
 | 
			
		||||
      for(int j=0;j<Nblock;j++){
 | 
			
		||||
	auto tmp = innerProduct(Left[i],Right[j]);
 | 
			
		||||
	vector_typeD rtmp = TensorRemove(tmp);
 | 
			
		||||
	auto rtmp = TensorRemove(tmp);
 | 
			
		||||
	mat_thread(i,j) += Reduce(rtmp);
 | 
			
		||||
      }}
 | 
			
		||||
    }}
 | 
			
		||||
@@ -549,6 +553,14 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
 | 
			
		||||
      mat += mat_thread;
 | 
			
		||||
    }  
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<Nblock;i++){
 | 
			
		||||
  for(int j=0;j<Nblock;j++){
 | 
			
		||||
    ComplexD sum = mat(i,j);
 | 
			
		||||
    FullGrid->GlobalSum(sum);
 | 
			
		||||
    mat(i,j)=sum;
 | 
			
		||||
  }}
 | 
			
		||||
 | 
			
		||||
  return;
 | 
			
		||||
}
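The loop added above reduces the Nblock x Nblock inner-product matrix across ranks one element at a time. Assuming Eigen's contiguous storage and the MPI-3 complex datatype, the same reduction can be sketched as one call (an illustration, not the Grid code):

#include <mpi.h>
#include <Eigen/Dense>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);

  int Nblock = 4;
  Eigen::MatrixXcd mat = Eigen::MatrixXcd::Identity(Nblock, Nblock);

  // MatrixXcd stores std::complex<double> contiguously, so one in-place
  // allreduce over Nblock*Nblock entries matches the element-wise loop above.
  MPI_Allreduce(MPI_IN_PLACE, mat.data(), Nblock*Nblock,
                MPI_CXX_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD);

  MPI_Finalize();
  return 0;
}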
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 | 
			
		||||
////////////////////////////////////////////////////////////
 | 
			
		||||
void Grid_quiesce_nodes(void) {
 | 
			
		||||
  int me = 0;
 | 
			
		||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
 | 
			
		||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
 | 
			
		||||
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_COMMS_SHMEM
 | 
			
		||||
 
 | 
			
		||||
@@ -29,7 +29,7 @@
 | 
			
		||||
#ifndef GRID_BINARY_IO_H
 | 
			
		||||
#define GRID_BINARY_IO_H
 | 
			
		||||
 | 
			
		||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) 
 | 
			
		||||
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
 | 
			
		||||
#define USE_MPI_IO
 | 
			
		||||
#else
 | 
			
		||||
#undef  USE_MPI_IO
 | 
			
		||||
@@ -98,35 +98,39 @@ class BinaryIO {
 | 
			
		||||
 | 
			
		||||
    NerscChecksum(grid,scalardata,nersc_csum);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
 | 
			
		||||
 | 
			
		||||
  template <class fobj>
 | 
			
		||||
  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
 | 
			
		||||
  {
 | 
			
		||||
    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
 | 
			
		||||
    const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    uint64_t lsites              =grid->lSites();
 | 
			
		||||
    if (fbuf.size()==1) {
 | 
			
		||||
      lsites=1;
 | 
			
		||||
    uint64_t lsites = grid->lSites();
 | 
			
		||||
    if (fbuf.size() == 1)
 | 
			
		||||
    {
 | 
			
		||||
      lsites = 1;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
#pragma omp parallel
 | 
			
		||||
    { 
 | 
			
		||||
      uint32_t nersc_csum_thr=0;
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
      uint32_t nersc_csum_thr = 0;
 | 
			
		||||
 | 
			
		||||
#pragma omp for
 | 
			
		||||
      for(uint64_t local_site=0;local_site<lsites;local_site++){
 | 
			
		||||
	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
 | 
			
		||||
	for(uint64_t j=0;j<size32;j++){
 | 
			
		||||
	  nersc_csum_thr=nersc_csum_thr+site_buf[j];
 | 
			
		||||
	}
 | 
			
		||||
      #pragma omp for
 | 
			
		||||
      for (uint64_t local_site = 0; local_site < lsites; local_site++)
 | 
			
		||||
      {
 | 
			
		||||
        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
 | 
			
		||||
        for (uint64_t j = 0; j < size32; j++)
 | 
			
		||||
        {
 | 
			
		||||
          nersc_csum_thr = nersc_csum_thr + site_buf[j];
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
#pragma omp critical
 | 
			
		||||
      #pragma omp critical
 | 
			
		||||
      {
 | 
			
		||||
	nersc_csum  += nersc_csum_thr;
 | 
			
		||||
        nersc_csum += nersc_csum_thr;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
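The reformatted checksum above keeps a per-thread partial sum and merges it in a critical section. For comparison, a standalone sketch of the same 32-bit additive checksum written with an OpenMP reduction clause (illustrative only, not the Grid implementation):

#include <cstdint>
#include <cstdio>
#include <vector>

int main(void)
{
  std::vector<uint32_t> words(1 << 20, 0xDEADBEEFu); // stand-in for the site buffer
  uint32_t csum = 0;

#pragma omp parallel for reduction(+ : csum)
  for (int64_t i = 0; i < (int64_t)words.size(); i++)
  {
    csum += words[i];                                // per-thread partials combined by OpenMP
  }

  printf("checksum 0x%08x\n", csum);
  return 0;
}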
 | 
			
		||||
 | 
			
		||||
  template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
 | 
			
		||||
  {
 | 
			
		||||
    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
 | 
			
		||||
@@ -266,7 +270,7 @@ class BinaryIO {
 | 
			
		||||
    grid->Barrier();
 | 
			
		||||
    GridStopWatch timer; 
 | 
			
		||||
    GridStopWatch bstimer;
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    nersc_csum=0;
 | 
			
		||||
    scidac_csuma=0;
 | 
			
		||||
    scidac_csumb=0;
 | 
			
		||||
@@ -362,18 +366,22 @@ class BinaryIO {
 | 
			
		||||
#else 
 | 
			
		||||
	assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
      } else { 
 | 
			
		||||
	std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
 | 
			
		||||
		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
 | 
			
		||||
	std::ifstream fin;
 | 
			
		||||
	fin.open(file,std::ios::binary|std::ios::in);
 | 
			
		||||
	if ( control & BINARYIO_MASTER_APPEND )  {
 | 
			
		||||
	  fin.seekg(-sizeof(fobj),fin.end);
 | 
			
		||||
	} else { 
 | 
			
		||||
	  fin.seekg(offset+myrank*lsites*sizeof(fobj));
 | 
			
		||||
	}
 | 
			
		||||
	fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
 | 
			
		||||
	fin.close();
 | 
			
		||||
      } else {
 | 
			
		||||
        std::cout << GridLogMessage << "C++ read I/O " << file << " : "
 | 
			
		||||
                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
 | 
			
		||||
        std::ifstream fin;
 | 
			
		||||
        fin.open(file, std::ios::binary | std::ios::in);
 | 
			
		||||
        if (control & BINARYIO_MASTER_APPEND)
 | 
			
		||||
        {
 | 
			
		||||
          fin.seekg(-sizeof(fobj), fin.end);
 | 
			
		||||
        }
 | 
			
		||||
        else
 | 
			
		||||
        {
 | 
			
		||||
          fin.seekg(offset + myrank * lsites * sizeof(fobj));
 | 
			
		||||
        }
 | 
			
		||||
        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
 | 
			
		||||
        assert(fin.fail() == 0);
 | 
			
		||||
        fin.close();
 | 
			
		||||
      }
 | 
			
		||||
      timer.Stop();
 | 
			
		||||
 | 
			
		||||
@@ -405,30 +413,78 @@ class BinaryIO {
 | 
			
		||||
      timer.Start();
 | 
			
		||||
      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 | 
			
		||||
#ifdef USE_MPI_IO
 | 
			
		||||
	std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
 | 
			
		||||
	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
 | 
			
		||||
	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
 | 
			
		||||
	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
 | 
			
		||||
	MPI_File_close(&fh);
 | 
			
		||||
	MPI_Type_free(&fileArray);
 | 
			
		||||
	MPI_Type_free(&localArray);
 | 
			
		||||
        std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
 | 
			
		||||
        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
 | 
			
		||||
        std::cout << GridLogMessage << "Checking for errors" << std::endl;
 | 
			
		||||
        if (ierr != MPI_SUCCESS)
 | 
			
		||||
        {
 | 
			
		||||
          char error_string[BUFSIZ];
 | 
			
		||||
          int length_of_error_string, error_class;
 | 
			
		||||
 | 
			
		||||
          MPI_Error_class(ierr, &error_class);
 | 
			
		||||
          MPI_Error_string(error_class, error_string, &length_of_error_string);
 | 
			
		||||
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
 | 
			
		||||
          MPI_Error_string(ierr, error_string, &length_of_error_string);
 | 
			
		||||
          fprintf(stderr, "%3d: %s\n", myrank, error_string);
 | 
			
		||||
          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
 | 
			
		||||
        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
 | 
			
		||||
        assert(ierr == 0);
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
 | 
			
		||||
        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
 | 
			
		||||
        assert(ierr == 0);
 | 
			
		||||
 | 
			
		||||
        MPI_File_close(&fh);
 | 
			
		||||
        MPI_Type_free(&fileArray);
 | 
			
		||||
        MPI_Type_free(&localArray);
 | 
			
		||||
#else 
 | 
			
		||||
	assert(0);
 | 
			
		||||
#endif
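For context on the MPI-IO branch above, a hedged standalone sketch of the File_open / set_view / write_all / close sequence, with a trivial contiguous byte view standing in for Grid's lexicographic subarray types:

#include <mpi.h>
#include <vector>
#include <cassert>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  std::vector<char> iodata(4096, (char)myrank);     // each rank writes its own block
  MPI_Offset disp = (MPI_Offset)myrank * (MPI_Offset)iodata.size();

  MPI_File fh;
  MPI_Status status;
  int ierr;
  ierr = MPI_File_open(MPI_COMM_WORLD, (char *)"iodata.bin",
                       MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
  assert(ierr == MPI_SUCCESS);
  ierr = MPI_File_set_view(fh, disp, MPI_BYTE, MPI_BYTE, (char *)"native", MPI_INFO_NULL);
  assert(ierr == MPI_SUCCESS);
  ierr = MPI_File_write_all(fh, &iodata[0], (int)iodata.size(), MPI_BYTE, &status);
  assert(ierr == MPI_SUCCESS);
  MPI_File_close(&fh);

  MPI_Finalize();
  return 0;
}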
 | 
			
		||||
      } else { 
 | 
			
		||||
	std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 | 
			
		||||
	std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
 | 
			
		||||
		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
 | 
			
		||||
	if ( control & BINARYIO_MASTER_APPEND )  {
 | 
			
		||||
        
 | 
			
		||||
	std::ofstream fout; 
 | 
			
		||||
  fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
 | 
			
		||||
  try {
 | 
			
		||||
    fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
 | 
			
		||||
  } catch (const std::fstream::failure& exc) {
 | 
			
		||||
    std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
 | 
			
		||||
    std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
 | 
			
		||||
    std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
 | 
			
		||||
    #ifdef USE_MPI_IO
 | 
			
		||||
    MPI_Abort(MPI_COMM_WORLD,1);
 | 
			
		||||
    #else
 | 
			
		||||
    exit(1);
 | 
			
		||||
    #endif
 | 
			
		||||
  }
 | 
			
		||||
	std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
 | 
			
		||||
		        << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
 | 
			
		||||
	
 | 
			
		||||
  if ( control & BINARYIO_MASTER_APPEND )  {
 | 
			
		||||
	  fout.seekp(0,fout.end);
 | 
			
		||||
	} else {
 | 
			
		||||
	  fout.seekp(offset+myrank*lsites*sizeof(fobj));
 | 
			
		||||
	}
 | 
			
		||||
	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
 | 
			
		||||
  
 | 
			
		||||
  try {
 | 
			
		||||
  	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
 | 
			
		||||
  }
 | 
			
		||||
  catch (const std::fstream::failure& exc) {
 | 
			
		||||
    std::cout << "Exception in writing file " << file << std::endl;
 | 
			
		||||
    std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
 | 
			
		||||
    #ifdef USE_MPI_IO
 | 
			
		||||
    MPI_Abort(MPI_COMM_WORLD,1);
 | 
			
		||||
    #else
 | 
			
		||||
    exit(1);
 | 
			
		||||
    #endif
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
	fout.close();
 | 
			
		||||
      }
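The C++ fallback above now enables stream exceptions and traps open and write failures. A minimal standalone version of that idiom:

#include <fstream>
#include <iostream>
#include <vector>

int main(void)
{
  std::vector<char> iodata(4096, 'x');
  std::ofstream fout;
  fout.exceptions(std::ofstream::failbit | std::ofstream::badbit);
  try {
    fout.open("iodata.bin", std::ios::binary | std::ios::out);
    fout.write(&iodata[0], iodata.size());
    fout.close();
  } catch (const std::ofstream::failure &exc) {
    std::cerr << "I/O error on iodata.bin: " << exc.what() << std::endl;
    return 1;
  }
  return 0;
}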
 | 
			
		||||
      timer.Stop();
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  timer.Stop();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
    std::cout<<GridLogMessage<<"IOobject: ";
 | 
			
		||||
    if ( control & BINARYIO_READ) std::cout << " read  ";
 | 
			
		||||
@@ -442,11 +498,14 @@ class BinaryIO {
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // Safety check
 | 
			
		||||
    //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
    grid->Barrier();
 | 
			
		||||
    grid->GlobalSum(nersc_csum);
 | 
			
		||||
    grid->GlobalXOR(scidac_csuma);
 | 
			
		||||
    grid->GlobalXOR(scidac_csumb);
 | 
			
		||||
    grid->Barrier();
 | 
			
		||||
    // if the data size is 1 we do not want to sum over the MPI ranks
 | 
			
		||||
    if (iodata.size() != 1){
 | 
			
		||||
      grid->Barrier();
 | 
			
		||||
      grid->GlobalSum(nersc_csum);
 | 
			
		||||
      grid->GlobalXOR(scidac_csuma);
 | 
			
		||||
      grid->GlobalXOR(scidac_csumb);
 | 
			
		||||
      grid->Barrier();
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
@@ -546,9 +605,9 @@ class BinaryIO {
 | 
			
		||||
    int gsites = grid->gSites();
 | 
			
		||||
    int lsites = grid->lSites();
 | 
			
		||||
 | 
			
		||||
    uint32_t nersc_csum_tmp;
 | 
			
		||||
    uint32_t scidac_csuma_tmp;
 | 
			
		||||
    uint32_t scidac_csumb_tmp;
 | 
			
		||||
    uint32_t nersc_csum_tmp   = 0;
 | 
			
		||||
    uint32_t scidac_csuma_tmp = 0;
 | 
			
		||||
    uint32_t scidac_csumb_tmp = 0;
 | 
			
		||||
 | 
			
		||||
    GridStopWatch timer;
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -84,10 +84,6 @@ namespace QCD {
 | 
			
		||||
   stream << "GRID_";
 | 
			
		||||
   stream << ScidacWordMnemonic<stype>();
 | 
			
		||||
 | 
			
		||||
   //   std::cout << " Lorentz N/S/V/M : " << _LorentzN<<" "<<_LorentzScalar<<"/"<<_LorentzVector<<"/"<<_LorentzMatrix<<std::endl;
 | 
			
		||||
   //   std::cout << " Spin    N/S/V/M : " << _SpinN   <<" "<<_SpinScalar   <<"/"<<_SpinVector   <<"/"<<_SpinMatrix<<std::endl;
 | 
			
		||||
   //   std::cout << " Colour  N/S/V/M : " << _ColourN <<" "<<_ColourScalar <<"/"<<_ColourVector <<"/"<<_ColourMatrix<<std::endl;
 | 
			
		||||
 | 
			
		||||
   if ( _LorentzVector )   stream << "_LorentzVector"<<_LorentzN;
 | 
			
		||||
   if ( _LorentzMatrix )   stream << "_LorentzMatrix"<<_LorentzN;
 | 
			
		||||
 | 
			
		||||
@@ -182,7 +178,7 @@ class GridLimeReader : public BinaryIO {
 | 
			
		||||
   /////////////////////////////////////////////
 | 
			
		||||
   // Open the file
 | 
			
		||||
   /////////////////////////////////////////////
 | 
			
		||||
   void open(std::string &_filename) 
 | 
			
		||||
   void open(const std::string &_filename) 
 | 
			
		||||
   {
 | 
			
		||||
     filename= _filename;
 | 
			
		||||
     File = fopen(filename.c_str(), "r");
 | 
			
		||||
@@ -210,19 +206,33 @@ class GridLimeReader : public BinaryIO {
 | 
			
		||||
 | 
			
		||||
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 | 
			
		||||
 | 
			
		||||
      std::cout << GridLogMessage << limeReaderType(LimeR) <<std::endl;
 | 
			
		||||
	
 | 
			
		||||
      if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
 | 
			
		||||
      uint64_t file_bytes =limeReaderBytes(LimeR);
 | 
			
		||||
 | 
			
		||||
      //      std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
 | 
			
		||||
      //      std::cout << GridLogMessage<< " readLimeObject seeking "<<  record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
 | 
			
		||||
 | 
			
		||||
      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
 | 
			
		||||
 | 
			
		||||
	//	std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
 | 
			
		||||
 | 
			
		||||
	uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
 | 
			
		||||
 | 
			
		||||
	//	std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
 | 
			
		||||
	//	std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
 | 
			
		||||
	//	std::cout << "R Payload expected " <<PayloadSize<<std::endl;
 | 
			
		||||
	//	std::cout << "R file size " <<file_bytes <<std::endl;
 | 
			
		||||
 | 
			
		||||
	assert(PayloadSize == file_bytes);// Must match or user error
 | 
			
		||||
 | 
			
		||||
	off_t offset= ftell(File);
 | 
			
		||||
	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 | 
			
		||||
	BinarySimpleMunger<sobj,sobj> munge;
 | 
			
		||||
	BinaryIO::readLatticeObject< sobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 | 
			
		||||
	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 | 
			
		||||
 | 
			
		||||
	/////////////////////////////////////////////
 | 
			
		||||
	// Insist checksum is next record
 | 
			
		||||
	/////////////////////////////////////////////
 | 
			
		||||
	readLimeObject(scidacChecksum_,std::string("scidacChecksum"),record_name);
 | 
			
		||||
	readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
 | 
			
		||||
 | 
			
		||||
	/////////////////////////////////////////////
 | 
			
		||||
	// Verify checksums
 | 
			
		||||
@@ -242,11 +252,19 @@ class GridLimeReader : public BinaryIO {
 | 
			
		||||
    // should this be a do-while? could we miss the first record?
 | 
			
		||||
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 | 
			
		||||
 | 
			
		||||
      //      std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
 | 
			
		||||
 | 
			
		||||
      uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
 | 
			
		||||
 | 
			
		||||
      if ( strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
 | 
			
		||||
      if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) )  ) {
 | 
			
		||||
 | 
			
		||||
	//	std::cout << GridLogMessage<< " readLimeObject matches ! " << record_name <<std::endl;
 | 
			
		||||
 | 
			
		||||
	std::vector<char> xmlc(nbytes+1,'\0');
 | 
			
		||||
	limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);    
 | 
			
		||||
 | 
			
		||||
	//	std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
 | 
			
		||||
 | 
			
		||||
	XmlReader RD(&xmlc[0],"");
 | 
			
		||||
	read(RD,object_name,object);
 | 
			
		||||
	return;
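Both readers above follow the same scan: step through records with limeReaderNextRecord and accept the first whose type matches (strncmp returns 0 on a match, hence the corrected !strncmp). A hedged standalone sketch of that scan, assuming the c-lime calls this file already relies on (limeCreateReader, limeReaderNextRecord, limeReaderType, limeReaderBytes, limeReaderReadData, limeDestroyReader):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <lime.h>

bool readNamedRecord(const std::string &filename, const std::string &record_name,
                     std::vector<char> &payload)
{
  FILE *fp = fopen(filename.c_str(), "r");
  if ( fp == NULL ) return false;
  LimeReader *LimeR = limeCreateReader(fp);

  bool found = false;
  while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
    // strncmp returns 0 on a match, hence the negation (the bug fixed above)
    if ( !strncmp(limeReaderType(LimeR), record_name.c_str(), strlen(record_name.c_str())) ) {
      uint64_t nbytes = limeReaderBytes(LimeR);
      payload.resize(nbytes + 1, '\0');
      limeReaderReadData((void *)&payload[0], &nbytes, LimeR);
      found = true;
      break;
    }
  }
  limeDestroyReader(LimeR);
  fclose(fp);
  return found;
}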
 | 
			
		||||
@@ -261,13 +279,14 @@ class GridLimeWriter : public BinaryIO {
 | 
			
		||||
 public:
 | 
			
		||||
   ///////////////////////////////////////////////////
 | 
			
		||||
   // FIXME: format for RNG? Now just binary out instead
 | 
			
		||||
   // FIXME: collective calls or not ?
 | 
			
		||||
   //      : must know if I am the I/O boss
 | 
			
		||||
   ///////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
   FILE       *File;
 | 
			
		||||
   LimeWriter *LimeW;
 | 
			
		||||
   std::string filename;
 | 
			
		||||
 | 
			
		||||
   void open(std::string &_filename) { 
 | 
			
		||||
   void open(const std::string &_filename) { 
 | 
			
		||||
     filename= _filename;
 | 
			
		||||
     File = fopen(filename.c_str(), "w");
 | 
			
		||||
     LimeW = limeCreateWriter(File); assert(LimeW != NULL );
 | 
			
		||||
@@ -302,14 +321,18 @@ class GridLimeWriter : public BinaryIO {
 | 
			
		||||
      write(WR,object_name,object);
 | 
			
		||||
      xmlstring = WR.XmlString();
 | 
			
		||||
    }
 | 
			
		||||
    //    std::cout << "WriteLimeObject" << record_name <<std::endl;
 | 
			
		||||
    uint64_t nbytes = xmlstring.size();
 | 
			
		||||
    //    std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
 | 
			
		||||
    int err;
 | 
			
		||||
    LimeRecordHeader *h = limeCreateHeader(MB, ME,(char *)record_name.c_str(), nbytes); assert(h!= NULL);
 | 
			
		||||
    LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes); 
 | 
			
		||||
    assert(h!= NULL);
 | 
			
		||||
 | 
			
		||||
    err=limeWriteRecordHeader(h, LimeW);                    assert(err>=0);
 | 
			
		||||
    err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
 | 
			
		||||
    err=limeWriterCloseRecord(LimeW);                       assert(err>=0);
 | 
			
		||||
    limeDestroyHeader(h);
 | 
			
		||||
    //    std::cout << " File offset is now"<<ftell(File) << std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  ////////////////////////////////////////////
 | 
			
		||||
  // Write a generic lattice field and csum
 | 
			
		||||
@@ -326,6 +349,11 @@ class GridLimeWriter : public BinaryIO {
 | 
			
		||||
    uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
 | 
			
		||||
    createLimeRecordHeader(record_name, 0, 0, PayloadSize);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    //    std::cout << "W sizeof(sobj)"      <<sizeof(sobj)<<std::endl;
 | 
			
		||||
    //    std::cout << "W Gsites "           <<field._grid->_gsites<<std::endl;
 | 
			
		||||
    //    std::cout << "W Payload expected " <<PayloadSize<<std::endl;
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////
 | 
			
		||||
    // NB: FILE and iostream are jointly writing disjoint sequences in the
 | 
			
		||||
    // the same file through different file handles (integer units).
 | 
			
		||||
@@ -340,6 +368,7 @@ class GridLimeWriter : public BinaryIO {
 | 
			
		||||
    //  v) Continue writing scidac record.
 | 
			
		||||
    ////////////////////////////////////////////////////////////////////
 | 
			
		||||
    off_t offset = ftell(File);
 | 
			
		||||
    //    std::cout << " Writing to offset "<<offset << std::endl;
 | 
			
		||||
    std::string format = getFormatString<vobj>();
 | 
			
		||||
    BinarySimpleMunger<sobj,sobj> munge;
 | 
			
		||||
    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
 | 
			
		||||
@@ -354,7 +383,7 @@ class GridLimeWriter : public BinaryIO {
 | 
			
		||||
    checksum.suma= streama.str();
 | 
			
		||||
    checksum.sumb= streamb.str();
 | 
			
		||||
    std::cout << GridLogMessage<<" writing scidac checksums "<<std::hex<<scidac_csuma<<"/"<<scidac_csumb<<std::dec<<std::endl;
 | 
			
		||||
    writeLimeObject(0,1,checksum,std::string("scidacChecksum"    ),std::string(SCIDAC_CHECKSUM));
 | 
			
		||||
    writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
@@ -371,11 +400,9 @@ class ScidacWriter : public GridLimeWriter {
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  // Write generic lattice field in scidac format
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
   template <class vobj, class userRecord>
 | 
			
		||||
  template <class vobj, class userRecord>
 | 
			
		||||
  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord) 
 | 
			
		||||
  {
 | 
			
		||||
    typedef typename vobj::scalar_object sobj;
 | 
			
		||||
    uint64_t nbytes;
 | 
			
		||||
    GridBase * grid = field._grid;
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////
 | 
			
		||||
@@ -397,6 +424,66 @@ class ScidacWriter : public GridLimeWriter {
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ScidacReader : public GridLimeReader {
 | 
			
		||||
 public:
 | 
			
		||||
 | 
			
		||||
   template<class SerialisableUserFile>
 | 
			
		||||
   void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
 | 
			
		||||
   {
 | 
			
		||||
     scidacFile    _scidacFile(grid);
 | 
			
		||||
     readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
 | 
			
		||||
     readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
 | 
			
		||||
   }
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  // Write generic lattice field in scidac format
 | 
			
		||||
  ////////////////////////////////////////////////
 | 
			
		||||
  template <class vobj, class userRecord>
 | 
			
		||||
  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) 
 | 
			
		||||
  {
 | 
			
		||||
    typedef typename vobj::scalar_object sobj;
 | 
			
		||||
    GridBase * grid = field._grid;
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////
 | 
			
		||||
    // fill the Grid header
 | 
			
		||||
    ////////////////////////////////////////
 | 
			
		||||
    FieldMetaData header;
 | 
			
		||||
    scidacRecord  _scidacRecord;
 | 
			
		||||
    scidacFile    _scidacFile;
 | 
			
		||||
 | 
			
		||||
    //////////////////////////////////////////////
 | 
			
		||||
    // Fill the Lime file record by record
 | 
			
		||||
    //////////////////////////////////////////////
 | 
			
		||||
    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
 | 
			
		||||
    readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
 | 
			
		||||
    readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
 | 
			
		||||
    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
 | 
			
		||||
  }
 | 
			
		||||
  void skipPastBinaryRecord(void) {
 | 
			
		||||
    std::string rec_name(ILDG_BINARY_DATA);
 | 
			
		||||
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 | 
			
		||||
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
 | 
			
		||||
	skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
 | 
			
		||||
	return;
 | 
			
		||||
      }
 | 
			
		||||
    }    
 | 
			
		||||
  }
 | 
			
		||||
  void skipPastObjectRecord(std::string rec_name) {
 | 
			
		||||
    while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { 
 | 
			
		||||
      if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) )  ) {
 | 
			
		||||
	return;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  void skipScidacFieldRecord() {
 | 
			
		||||
    skipPastObjectRecord(std::string(GRID_FORMAT));
 | 
			
		||||
    skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
 | 
			
		||||
    skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
 | 
			
		||||
    skipPastBinaryRecord();
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class IldgWriter : public ScidacWriter {
 | 
			
		||||
 public:
 | 
			
		||||
 | 
			
		||||
@@ -425,8 +512,6 @@ class IldgWriter : public ScidacWriter {
 | 
			
		||||
    typedef iLorentzColourMatrix<vsimd> vobj;
 | 
			
		||||
    typedef typename vobj::scalar_object sobj;
 | 
			
		||||
 | 
			
		||||
    uint64_t nbytes;
 | 
			
		||||
 | 
			
		||||
    ////////////////////////////////////////
 | 
			
		||||
    // fill the Grid header
 | 
			
		||||
    ////////////////////////////////////////
 | 
			
		||||
 
 | 
			
		||||
@@ -64,6 +64,11 @@ namespace Grid {
 | 
			
		||||
// file compatibility, so it should be correct to assume the undocumented but de facto file structure.
 | 
			
		||||
/////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
struct emptyUserRecord : Serializable { 
 | 
			
		||||
  GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
 | 
			
		||||
  emptyUserRecord() { dummy=0; };
 | 
			
		||||
};
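emptyUserRecord above is the smallest possible Serializable record. A user-defined record with real payload follows the same macro pattern (the field names here are purely illustrative, not part of Grid):

struct myConfigRecord : Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(myConfigRecord,
                                  int,         trajectory,
                                  double,      plaquette,
                                  std::string, generator);
  myConfigRecord() { trajectory = 0; plaquette = 0.0; }
};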
 | 
			
		||||
 | 
			
		||||
////////////////////////
 | 
			
		||||
// Scidac private file xml
 | 
			
		||||
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
 | 
			
		||||
 
 | 
			
		||||
@@ -85,6 +85,9 @@ namespace Grid {
 | 
			
		||||
	nd=4;
 | 
			
		||||
	dimension.resize(4);
 | 
			
		||||
	boundary.resize(4);
 | 
			
		||||
	scidac_checksuma=0;
 | 
			
		||||
	scidac_checksumb=0;
 | 
			
		||||
	checksum=0;
 | 
			
		||||
      }
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
@@ -104,6 +107,7 @@ namespace Grid {
 | 
			
		||||
      header.nd = nd;
 | 
			
		||||
      header.dimension.resize(nd);
 | 
			
		||||
      header.boundary.resize(nd);
 | 
			
		||||
      header.data_start = 0;
 | 
			
		||||
      for(int d=0;d<nd;d++) {
 | 
			
		||||
	header.dimension[d] = grid->_fdimensions[d];
 | 
			
		||||
      }
 | 
			
		||||
 
 | 
			
		||||
@@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS},
 | 
			
		||||
  { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   },
 | 
			
		||||
    // 4
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
#ifdef KNL
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  },
 | 
			
		||||
    { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    },
 | 
			
		||||
 
 | 
			
		||||
@@ -47,4 +47,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
////////////////////////////////////////
 | 
			
		||||
#include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Laplacian on fermion fields
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#include <Grid/qcd/utils/CovariantLaplacian.h>
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -53,7 +53,7 @@ directory
 | 
			
		||||
// Utility functions
 | 
			
		||||
////////////////////////////////////////////
 | 
			
		||||
#include <Grid/qcd/utils/Metric.h>
 | 
			
		||||
#include <Grid/qcd/utils/CovariantLaplacian.h>
 | 
			
		||||
#include <Grid/qcd/utils/CovariantAdjointLaplacian.h>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 100  lib/qcd/action/fermion/AbstractEOFAFermion.h  (new file)
@@ -0,0 +1,100 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/AbstractEOFAFermion.h
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#ifndef  GRID_QCD_ABSTRACT_EOFA_FERMION_H
 | 
			
		||||
#define  GRID_QCD_ABSTRACT_EOFA_FERMION_H
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  // DJM: Abstract base class for EOFA fermion types.
 | 
			
		||||
  // Defines layout of additional EOFA-specific parameters and operators.
 | 
			
		||||
  // Use to construct EOFA pseudofermion actions that are agnostic to
 | 
			
		||||
  // Shamir / Mobius / etc., and ensure that no one can construct EOFA
 | 
			
		||||
  // pseudofermion actions with a non-EOFA fermion type.
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
 | 
			
		||||
    public:
 | 
			
		||||
      INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
 | 
			
		||||
    public:
 | 
			
		||||
      // Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
 | 
			
		||||
      RealD mq1;
 | 
			
		||||
      RealD mq2;
 | 
			
		||||
      RealD mq3;
 | 
			
		||||
      RealD shift;
 | 
			
		||||
      int pm;
 | 
			
		||||
 | 
			
		||||
      RealD alpha; // Mobius scale
 | 
			
		||||
      RealD k;     // EOFA normalization constant
 | 
			
		||||
 | 
			
		||||
      virtual void Instantiatable(void) = 0;
 | 
			
		||||
 | 
			
		||||
      // EOFA-specific operations
 | 
			
		||||
      // Force user to implement in derived classes
 | 
			
		||||
      virtual void  Omega    (const FermionField& in, FermionField& out, int sign, int dag) = 0;
 | 
			
		||||
      virtual void  Dtilde   (const FermionField& in, FermionField& out) = 0;
 | 
			
		||||
      virtual void  DtildeInv(const FermionField& in, FermionField& out) = 0;
 | 
			
		||||
 | 
			
		||||
      // Implement derivatives in base class:
 | 
			
		||||
      // for EOFA both DWF and Mobius just need d(Dw)/dU
 | 
			
		||||
      virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
 | 
			
		||||
        this->DhopDeriv(mat, U, V, dag);
 | 
			
		||||
      };
 | 
			
		||||
      virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
 | 
			
		||||
        this->DhopDerivOE(mat, U, V, dag);
 | 
			
		||||
      };
 | 
			
		||||
      virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
 | 
			
		||||
        this->DhopDerivEO(mat, U, V, dag);
 | 
			
		||||
      };
 | 
			
		||||
 | 
			
		||||
      // Recompute 5D coefficients for different value of shift constant
 | 
			
		||||
      // (needed for heatbath loop over poles)
 | 
			
		||||
      virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
 | 
			
		||||
        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
 | 
			
		||||
        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
 | 
			
		||||
        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
 | 
			
		||||
        : CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
 | 
			
		||||
          _mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
 | 
			
		||||
      {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        this->alpha = _b + _c;
 | 
			
		||||
        this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
 | 
			
		||||
                    ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
 | 
			
		||||
                    ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
 | 
			
		||||
      };
 | 
			
		||||
  };
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
#endif
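For reference, the normalisation constant set in the constructor above is, written out (a direct transcription of the code, with alpha = b + c the Mobius scale):

\[
k \;=\; \frac{\alpha\,(m_{q3}-m_{q2})\,(\alpha+1)^{2L_s}}
             {\bigl[(\alpha+1)^{L_s}+m_{q2}\,(\alpha-1)^{L_s}\bigr]\,
              \bigl[(\alpha+1)^{L_s}+m_{q3}\,(\alpha-1)^{L_s}\bigr]} .
\]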
 | 
			
		||||
@@ -77,7 +77,6 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
 | 
			
		||||
{
 | 
			
		||||
  this->Report();
 | 
			
		||||
@@ -119,7 +118,6 @@ template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
 | 
			
		||||
  MooeeInvTime=0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
template<class Impl>  
 | 
			
		||||
void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 | 
			
		||||
{
 | 
			
		||||
@@ -414,7 +412,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
  for(int i=0; i < Ls; i++){
 | 
			
		||||
    as[i] = 1.0;
 | 
			
		||||
    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
 | 
			
		||||
    //    assert(fabs(omega[i])>0.0);
 | 
			
		||||
    assert(omega[i]!=Coeff_t(0.0));
 | 
			
		||||
    bs[i] = 0.5*(bpc/omega[i] + bmc);
 | 
			
		||||
    cs[i] = 0.5*(bpc/omega[i] - bmc);
 | 
			
		||||
  }
 | 
			
		||||
@@ -429,7 +427,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
  
 | 
			
		||||
  for(int i=0;i<Ls;i++){
 | 
			
		||||
    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
 | 
			
		||||
    //    assert(fabs(bee[i])>0.0);
 | 
			
		||||
    assert(bee[i]!=Coeff_t(0.0));
 | 
			
		||||
    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
 | 
			
		||||
    beo[i]=as[i]*bs[i];
 | 
			
		||||
    ceo[i]=-as[i]*cs[i];
 | 
			
		||||
@@ -455,11 +453,17 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
    dee[i] = bee[i];
 | 
			
		||||
    
 | 
			
		||||
    if ( i < Ls-1 ) {
 | 
			
		||||
 | 
			
		||||
      assert(bee[i]!=Coeff_t(0.0));
 | 
			
		||||
      assert(bee[0]!=Coeff_t(0.0));
 | 
			
		||||
      
 | 
			
		||||
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
 | 
			
		||||
      
 | 
			
		||||
      leem[i]=mass*cee[Ls-1]/bee[0];
 | 
			
		||||
      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
 | 
			
		||||
      for(int j=0;j<i;j++) {
 | 
			
		||||
	assert(bee[j+1]!=Coeff_t(0.0));
 | 
			
		||||
	leem[i]*= aee[j]/bee[j+1];
 | 
			
		||||
      }
 | 
			
		||||
      
 | 
			
		||||
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
 | 
			
		||||
      
 | 
			
		||||
@@ -478,7 +482,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
 | 
			
		||||
  { 
 | 
			
		||||
    Coeff_t delta_d=mass*cee[Ls-1];
 | 
			
		||||
    for(int j=0;j<Ls-1;j++) {
 | 
			
		||||
      //      assert(fabs(bee[j])>0.0);
 | 
			
		||||
      assert(bee[j] != Coeff_t(0.0));
 | 
			
		||||
      delta_d *= cee[j]/bee[j];
 | 
			
		||||
    }
 | 
			
		||||
    dee[Ls-1] += delta_d;
 | 
			
		||||
 
 | 
			
		||||
@@ -1,6 +1,6 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/qcd/action/fermion/CayleyFermion5D.h
 | 
			
		||||
 | 
			
		||||
@@ -35,24 +35,24 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
  namespace QCD {
 | 
			
		||||
 | 
			
		||||
     template<typename T> struct switcheroo   {  
 | 
			
		||||
       static inline int iscomplex()  { return 0; } 
 | 
			
		||||
     template<typename T> struct switcheroo   {
 | 
			
		||||
       static inline int iscomplex()  { return 0; }
 | 
			
		||||
 | 
			
		||||
       template<class vec>
 | 
			
		||||
       static inline vec mult(vec a, vec b) {
 | 
			
		||||
	 return real_mult(a,b);
 | 
			
		||||
       }
 | 
			
		||||
     };
 | 
			
		||||
     template<> struct switcheroo<ComplexD> {  
 | 
			
		||||
       static inline int iscomplex()  { return 1; } 
 | 
			
		||||
     template<> struct switcheroo<ComplexD> {
 | 
			
		||||
       static inline int iscomplex()  { return 1; }
 | 
			
		||||
 | 
			
		||||
       template<class vec>
 | 
			
		||||
       static inline vec mult(vec a, vec b) {
 | 
			
		||||
	 return a*b;
 | 
			
		||||
       }
 | 
			
		||||
     };
 | 
			
		||||
     template<> struct switcheroo<ComplexF> {  
 | 
			
		||||
       static inline int iscomplex()  { return 1; } 
 | 
			
		||||
     template<> struct switcheroo<ComplexF> {
 | 
			
		||||
       static inline int iscomplex()  { return 1; }
 | 
			
		||||
       template<class vec>
 | 
			
		||||
       static inline vec mult(vec a, vec b) {
 | 
			
		||||
	 return a*b;
 | 
			
		||||
@@ -90,14 +90,14 @@ namespace Grid {
 | 
			
		||||
      // Instantiate different versions depending on Impl
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      void M5D(const FermionField &psi,
 | 
			
		||||
	       const FermionField &phi, 
 | 
			
		||||
	       const FermionField &phi,
 | 
			
		||||
	       FermionField &chi,
 | 
			
		||||
	       std::vector<Coeff_t> &lower,
 | 
			
		||||
	       std::vector<Coeff_t> &diag,
 | 
			
		||||
	       std::vector<Coeff_t> &upper);
 | 
			
		||||
 | 
			
		||||
      void M5Ddag(const FermionField &psi,
 | 
			
		||||
		  const FermionField &phi, 
 | 
			
		||||
		  const FermionField &phi,
 | 
			
		||||
		  FermionField &chi,
 | 
			
		||||
		  std::vector<Coeff_t> &lower,
 | 
			
		||||
		  std::vector<Coeff_t> &diag,
 | 
			
		||||
@@ -125,7 +125,7 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
      // Efficient support for multigrid coarsening
 | 
			
		||||
      virtual void  Mdir (const FermionField &in, FermionField &out,int dir,int disp);
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
      void   Meooe5D       (const FermionField &in, FermionField &out);
 | 
			
		||||
      void   MeooeDag5D    (const FermionField &in, FermionField &out);
 | 
			
		||||
 | 
			
		||||
@@ -133,23 +133,23 @@ namespace Grid {
 | 
			
		||||
      RealD mass;
 | 
			
		||||
 | 
			
		||||
      // Cayley form Moebius (tanh and zolotarev)
 | 
			
		||||
      std::vector<Coeff_t> omega; 
 | 
			
		||||
      std::vector<Coeff_t> omega;
 | 
			
		||||
      std::vector<Coeff_t> bs;    // S dependent coeffs
 | 
			
		||||
      std::vector<Coeff_t> cs;    
 | 
			
		||||
      std::vector<Coeff_t> as;    
 | 
			
		||||
      std::vector<Coeff_t> cs;
 | 
			
		||||
      std::vector<Coeff_t> as;
 | 
			
		||||
      // For preconditioning Cayley form
 | 
			
		||||
      std::vector<Coeff_t> bee;    
 | 
			
		||||
      std::vector<Coeff_t> cee;    
 | 
			
		||||
      std::vector<Coeff_t> aee;    
 | 
			
		||||
      std::vector<Coeff_t> beo;    
 | 
			
		||||
      std::vector<Coeff_t> ceo;    
 | 
			
		||||
      std::vector<Coeff_t> aeo;    
 | 
			
		||||
      std::vector<Coeff_t> bee;
 | 
			
		||||
      std::vector<Coeff_t> cee;
 | 
			
		||||
      std::vector<Coeff_t> aee;
 | 
			
		||||
      std::vector<Coeff_t> beo;
 | 
			
		||||
      std::vector<Coeff_t> ceo;
 | 
			
		||||
      std::vector<Coeff_t> aeo;
 | 
			
		||||
      // LDU factorisation of the eeoo matrix
 | 
			
		||||
      std::vector<Coeff_t> lee;    
 | 
			
		||||
      std::vector<Coeff_t> leem;    
 | 
			
		||||
      std::vector<Coeff_t> uee;    
 | 
			
		||||
      std::vector<Coeff_t> ueem;    
 | 
			
		||||
      std::vector<Coeff_t> dee;    
 | 
			
		||||
      std::vector<Coeff_t> lee;
 | 
			
		||||
      std::vector<Coeff_t> leem;
 | 
			
		||||
      std::vector<Coeff_t> uee;
 | 
			
		||||
      std::vector<Coeff_t> ueem;
 | 
			
		||||
      std::vector<Coeff_t> dee;
 | 
			
		||||
 | 
			
		||||
      // Matrices of 5d ee inverse params
 | 
			
		||||
      Vector<iSinglet<Simd> >  MatpInv;
 | 
			
		||||
@@ -165,7 +165,7 @@ namespace Grid {
 | 
			
		||||
		      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
		      RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
 | 
			
		||||
 | 
			
		||||
      
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
     void CayleyReport(void);
 | 
			
		||||
     void CayleyZeroCounters(void);
 | 
			
		||||
@@ -179,9 +179,9 @@ namespace Grid {
 | 
			
		||||
     double MooeeInvTime;
 | 
			
		||||
 | 
			
		||||
    protected:
 | 
			
		||||
      void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
 | 
			
		||||
      virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
 | 
			
		||||
      virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
 
 | 
			
lib/qcd/action/fermion/DomainWallEOFAFermion.cc (new file, 438 lines)
@@ -0,0 +1,438 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/Grid_Eigen_Dense.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
 | 
			
		||||
      GaugeField            &_Umu,
 | 
			
		||||
      GridCartesian         &FiveDimGrid,
 | 
			
		||||
      GridRedBlackCartesian &FiveDimRedBlackGrid,
 | 
			
		||||
      GridCartesian         &FourDimGrid,
 | 
			
		||||
      GridRedBlackCartesian &FourDimRedBlackGrid,
 | 
			
		||||
      RealD _mq1, RealD _mq2, RealD _mq3,
 | 
			
		||||
      RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
 | 
			
		||||
    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
 | 
			
		||||
        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
 | 
			
		||||
        _shift, _pm, _M5, 1.0, 0.0, p)
 | 
			
		||||
    {
 | 
			
		||||
        RealD eps = 1.0;
 | 
			
		||||
        Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
 | 
			
		||||
        assert(zdata->n == this->Ls);
 | 
			
		||||
 | 
			
		||||
        std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
 | 
			
		||||
        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
 | 
			
		||||
 | 
			
		||||
        Approx::zolotarev_free(zdata);
 | 
			
		||||
    }
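Since the constructor above passes b = 1.0, c = 0.0 to AbstractEOFAFermion, the Mobius scale there is alpha = b + c = 1 and the normalisation constant k defined in AbstractEOFAFermion.h collapses to

\[
k \;=\; \frac{(m_{q3}-m_{q2})\,2^{2L_s}}{2^{L_s}\cdot 2^{L_s}} \;=\; m_{q3}-m_{q2},
\]

a worked special case of the general expression, not an additional definition made by this file.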
 | 
			
		||||
 | 
			
		||||
    /*
 | 
			
		||||
     Additional EOFA operators only called outside the inverter.
 | 
			
		||||
     Since speed is not essential, simple axpby-style
 | 
			
		||||
     implementations should be fine.
 | 
			
		||||
    */
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        Din = zero;
 | 
			
		||||
        if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
 | 
			
		||||
        else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
 | 
			
		||||
        else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
 | 
			
		||||
        else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // This is just the identity for DWF
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
 | 
			
		||||
 | 
			
		||||
    // This is just the identity for DWF
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
 | 
			
		||||
 | 
			
		||||
    /*****************************************************************************************************/
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        FermionField Din(psi._grid);
 | 
			
		||||
 | 
			
		||||
        this->Meooe5D(psi, Din);
 | 
			
		||||
        this->DW(Din, chi, DaggerNo);
 | 
			
		||||
        axpby(chi, 1.0, 1.0, chi, psi);
 | 
			
		||||
        this->M5D(psi, chi);
 | 
			
		||||
        return(norm2(chi));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        FermionField Din(psi._grid);
 | 
			
		||||
 | 
			
		||||
        this->DW(psi, Din, DaggerYes);
 | 
			
		||||
        this->MeooeDag5D(Din, chi);
 | 
			
		||||
        this->M5Ddag(psi, chi);
 | 
			
		||||
        axpby(chi, 1.0, 1.0, chi, psi);
 | 
			
		||||
        return(norm2(chi));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    // Performance critical fermion operators called inside the inverter
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int   Ls    = this->Ls;
 | 
			
		||||
        int   pm    = this->pm;
 | 
			
		||||
        RealD shift = this->shift;
 | 
			
		||||
        RealD mq1   = this->mq1;
 | 
			
		||||
        RealD mq2   = this->mq2;
 | 
			
		||||
        RealD mq3   = this->mq3;
 | 
			
		||||
 | 
			
		||||
        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
 | 
			
		||||
        Coeff_t shiftp(0.0), shiftm(0.0);
 | 
			
		||||
        if(shift != 0.0){
 | 
			
		||||
          if(pm == 1){ shiftp = shift*(mq3-mq2); }
 | 
			
		||||
          else{ shiftm = -shift*(mq3-mq2); }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> diag(Ls,1.0);
 | 
			
		||||
        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
 | 
			
		||||
        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftp;
 | 
			
		||||
 | 
			
		||||
        #if(0)
 | 
			
		||||
            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
 | 
			
		||||
            for(int i=0; i<diag.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            for(int i=0; i<upper.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            for(int i=0; i<lower.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        this->M5D(psi, chi, chi, lower, diag, upper);
 | 
			
		||||
    }
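In matrix form, the three-argument kernel invoked on the last line above (implemented in DomainWallEOFAFermioncache.cc later in this diff) computes, slice by slice in the fifth dimension,

\[
\chi_s \;=\; \mathrm{diag}_s\,\phi_s \;+\; \mathrm{upper}_s\,P_-\,\psi_{(s+1)\bmod L_s} \;+\; \mathrm{lower}_s\,P_+\,\psi_{(s-1)\bmod L_s},
\]

so with diag = 1 and upper = lower = -1 in the bulk, the EOFA shift enters only through the two boundary entries upper[Ls-1] = mq1 + shiftm and lower[0] = mq1 + shiftp set above.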
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int   Ls    = this->Ls;
 | 
			
		||||
        int   pm    = this->pm;
 | 
			
		||||
        RealD shift = this->shift;
 | 
			
		||||
        RealD mq1   = this->mq1;
 | 
			
		||||
        RealD mq2   = this->mq2;
 | 
			
		||||
        RealD mq3   = this->mq3;
 | 
			
		||||
 | 
			
		||||
        // coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
 | 
			
		||||
        Coeff_t shiftp(0.0), shiftm(0.0);
 | 
			
		||||
        if(shift != 0.0){
 | 
			
		||||
          if(pm == 1){ shiftp = shift*(mq3-mq2); }
 | 
			
		||||
          else{ shiftm = -shift*(mq3-mq2); }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> diag(Ls,1.0);
 | 
			
		||||
        std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
 | 
			
		||||
        std::vector<Coeff_t> lower(Ls,-1.0); lower[0]    = mq1 + shiftm;
 | 
			
		||||
 | 
			
		||||
        #if(0)
 | 
			
		||||
            std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
 | 
			
		||||
            for(int i=0; i<diag.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            for(int i=0; i<upper.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            for(int i=0; i<lower.size(); ++i){
 | 
			
		||||
                std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        this->M5Ddag(psi, chi, chi, lower, diag, upper);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // half checkerboard operations
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> diag = this->bee;
 | 
			
		||||
        std::vector<Coeff_t> upper(Ls);
 | 
			
		||||
        std::vector<Coeff_t> lower(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls; s++){
 | 
			
		||||
          upper[s] = -this->cee[s];
 | 
			
		||||
          lower[s] = -this->cee[s];
 | 
			
		||||
        }
 | 
			
		||||
        upper[Ls-1] = this->dm;
 | 
			
		||||
        lower[0]    = this->dp;
 | 
			
		||||
 | 
			
		||||
        this->M5D(psi, psi, chi, lower, diag, upper);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> diag = this->bee;
 | 
			
		||||
        std::vector<Coeff_t> upper(Ls);
 | 
			
		||||
        std::vector<Coeff_t> lower(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls; s++){
 | 
			
		||||
          upper[s] = -this->cee[s];
 | 
			
		||||
          lower[s] = -this->cee[s];
 | 
			
		||||
        }
 | 
			
		||||
        upper[Ls-1] = this->dp;
 | 
			
		||||
        lower[0]    = this->dm;
 | 
			
		||||
 | 
			
		||||
        this->M5Ddag(psi, psi, chi, lower, diag, upper);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /****************************************************************************************/
 | 
			
		||||
 | 
			
		||||
    //Zolo
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
 | 
			
		||||
    {
 | 
			
		||||
        int   Ls    = this->Ls;
 | 
			
		||||
        int   pm    = this->pm;
 | 
			
		||||
        RealD mq1   = this->mq1;
 | 
			
		||||
        RealD mq2   = this->mq2;
 | 
			
		||||
        RealD mq3   = this->mq3;
 | 
			
		||||
        RealD shift = this->shift;
 | 
			
		||||
 | 
			
		||||
        ////////////////////////////////////////////////////////
 | 
			
		||||
        // Constants for the preconditioned matrix Cayley form
 | 
			
		||||
        ////////////////////////////////////////////////////////
 | 
			
		||||
        this->bs.resize(Ls);
 | 
			
		||||
        this->cs.resize(Ls);
 | 
			
		||||
        this->aee.resize(Ls);
 | 
			
		||||
        this->aeo.resize(Ls);
 | 
			
		||||
        this->bee.resize(Ls);
 | 
			
		||||
        this->beo.resize(Ls);
 | 
			
		||||
        this->cee.resize(Ls);
 | 
			
		||||
        this->ceo.resize(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int i=0; i<Ls; ++i){
 | 
			
		||||
          this->bee[i] = 4.0 - this->M5 + 1.0;
 | 
			
		||||
          this->cee[i] = 1.0;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int i=0; i<Ls; ++i){
 | 
			
		||||
          this->aee[i] = this->cee[i];
 | 
			
		||||
          this->bs[i] = this->beo[i] = 1.0;
 | 
			
		||||
          this->cs[i] = this->ceo[i] = 0.0;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        //////////////////////////////////////////
 | 
			
		||||
        // EOFA shift terms
 | 
			
		||||
        //////////////////////////////////////////
 | 
			
		||||
        if(pm == 1){
 | 
			
		||||
          this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
 | 
			
		||||
          this->dm = mq1*this->cee[Ls-1];
 | 
			
		||||
        } else if(this->pm == -1) {
 | 
			
		||||
          this->dp = mq1*this->cee[0];
 | 
			
		||||
          this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
 | 
			
		||||
        } else {
 | 
			
		||||
          this->dp = mq1*this->cee[0];
 | 
			
		||||
          this->dm = mq1*this->cee[Ls-1];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        //////////////////////////////////////////
 | 
			
		||||
        // LDU decomposition of eeoo
 | 
			
		||||
        //////////////////////////////////////////
 | 
			
		||||
        this->dee.resize(Ls+1);
 | 
			
		||||
        this->lee.resize(Ls);
 | 
			
		||||
        this->leem.resize(Ls);
 | 
			
		||||
        this->uee.resize(Ls);
 | 
			
		||||
        this->ueem.resize(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int i=0; i<Ls; ++i){
 | 
			
		||||
 | 
			
		||||
          if(i < Ls-1){
 | 
			
		||||
 | 
			
		||||
            this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
 | 
			
		||||
 | 
			
		||||
            this->leem[i] = this->dm/this->bee[i];
 | 
			
		||||
            for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
 | 
			
		||||
 | 
			
		||||
            this->dee[i] = this->bee[i];
 | 
			
		||||
 | 
			
		||||
            this->uee[i] = -this->aee[i]/this->bee[i];   // up-diag entry on the ith row
 | 
			
		||||
 | 
			
		||||
            this->ueem[i] = this->dp / this->bee[0];
 | 
			
		||||
            for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
 | 
			
		||||
 | 
			
		||||
          } else {
 | 
			
		||||
 | 
			
		||||
            this->lee[i]  = 0.0;
 | 
			
		||||
            this->leem[i] = 0.0;
 | 
			
		||||
            this->uee[i]  = 0.0;
 | 
			
		||||
            this->ueem[i] = 0.0;
 | 
			
		||||
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        {
 | 
			
		||||
          Coeff_t delta_d = 1.0 / this->bee[0];
 | 
			
		||||
          for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
 | 
			
		||||
          this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
 | 
			
		||||
          this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        int inv = 1;
 | 
			
		||||
        this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
 | 
			
		||||
        this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
 | 
			
		||||
    }
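Written out, the final block of SetCoefficientsInternal above sets (a transcription of the loop, nothing beyond the code; the product is empty for Ls = 2)

\[
\delta_d \;=\; \frac{1}{b_{ee}[0]}\prod_{j=1}^{L_s-2}\frac{c_{ee}[j]}{b_{ee}[j]},\qquad
d_{ee}[L_s-1] \;=\; b_{ee}[L_s-1] + c_{ee}[0]\,d_m\,\delta_d,\qquad
d_{ee}[L_s] \;=\; b_{ee}[L_s-1] + c_{ee}[L_s-1]\,d_p\,\delta_d ,
\]

i.e. the two extra entries of dee absorb the contributions of the EOFA corner terms dm and dp.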
 | 
			
		||||
 | 
			
		||||
    // Recompute Cayley-form coefficients for different shift
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
 | 
			
		||||
    {
 | 
			
		||||
        this->shift = new_shift;
 | 
			
		||||
        Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
 | 
			
		||||
        this->SetCoefficientsTanh(zdata, 1.0, 0.0);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
 | 
			
		||||
        Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        GridBase* grid = this->FermionRedBlackGrid();
 | 
			
		||||
        int LLs = grid->_rdimensions[0];
 | 
			
		||||
 | 
			
		||||
        if(LLs == Ls){ return; } // Not vectorised in 5th direction
 | 
			
		||||
 | 
			
		||||
        Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
        Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls; s++){
 | 
			
		||||
            Pplus(s,s)  = this->bee[s];
 | 
			
		||||
            Pminus(s,s) = this->bee[s];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){
 | 
			
		||||
            Pminus(s,s+1) = -this->cee[s];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<Ls-1; s++){
 | 
			
		||||
            Pplus(s+1,s) = -this->cee[s+1];
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        Pplus (0,Ls-1) = this->dp;
 | 
			
		||||
        Pminus(Ls-1,0) = this->dm;
 | 
			
		||||
 | 
			
		||||
        Eigen::MatrixXcd PplusMat ;
 | 
			
		||||
        Eigen::MatrixXcd PminusMat;
 | 
			
		||||
 | 
			
		||||
        #if(0)
 | 
			
		||||
            std::cout << GridLogMessage << "Pplus:" << std::endl;
 | 
			
		||||
            for(int s=0; s<Ls; ++s){
 | 
			
		||||
                for(int ss=0; ss<Ls; ++ss){
 | 
			
		||||
                    std::cout << Pplus(s,ss) << "\t";
 | 
			
		||||
                }
 | 
			
		||||
                std::cout << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
            std::cout << GridLogMessage << "Pminus:" << std::endl;
 | 
			
		||||
            for(int s=0; s<Ls; ++s){
 | 
			
		||||
                for(int ss=0; ss<Ls; ++ss){
 | 
			
		||||
                    std::cout << Pminus(s,ss) << "\t";
 | 
			
		||||
                }
 | 
			
		||||
                std::cout << std::endl;
 | 
			
		||||
            }
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        if(inv) {
 | 
			
		||||
            PplusMat  = Pplus.inverse();
 | 
			
		||||
            PminusMat = Pminus.inverse();
 | 
			
		||||
        } else {
 | 
			
		||||
            PplusMat  = Pplus;
 | 
			
		||||
            PminusMat = Pminus;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(dag){
 | 
			
		||||
            PplusMat.adjointInPlace();
 | 
			
		||||
            PminusMat.adjointInPlace();
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        typedef typename SiteHalfSpinor::scalar_type scalar_type;
 | 
			
		||||
        const int Nsimd = Simd::Nsimd();
 | 
			
		||||
        Matp.resize(Ls*LLs);
 | 
			
		||||
        Matm.resize(Ls*LLs);
 | 
			
		||||
 | 
			
		||||
        for(int s2=0; s2<Ls; s2++){
 | 
			
		||||
        for(int s1=0; s1<LLs; s1++){
 | 
			
		||||
            int istride = LLs;
 | 
			
		||||
            int ostride = 1;
 | 
			
		||||
            Simd Vp;
 | 
			
		||||
            Simd Vm;
 | 
			
		||||
            scalar_type *sp = (scalar_type*) &Vp;
 | 
			
		||||
            scalar_type *sm = (scalar_type*) &Vm;
 | 
			
		||||
            for(int l=0; l<Nsimd; l++){
 | 
			
		||||
                if(switcheroo<Coeff_t>::iscomplex()) {
 | 
			
		||||
                    sp[l] = PplusMat (l*istride+s1*ostride,s2);
 | 
			
		||||
                    sm[l] = PminusMat(l*istride+s1*ostride,s2);
 | 
			
		||||
                } else {
 | 
			
		||||
                    // if real
 | 
			
		||||
                    scalar_type tmp;
 | 
			
		||||
                    tmp = PplusMat (l*istride+s1*ostride,s2);
 | 
			
		||||
                    sp[l] = scalar_type(tmp.real(),tmp.real());
 | 
			
		||||
                    tmp = PminusMat(l*istride+s1*ostride,s2);
 | 
			
		||||
                    sm[l] = scalar_type(tmp.real(),tmp.real());
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            Matp[LLs*s2+s1] = Vp;
 | 
			
		||||
            Matm[LLs*s2+s1] = Vm;
 | 
			
		||||
        }}
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    FermOpTemplateInstantiate(DomainWallEOFAFermion);
 | 
			
		||||
    GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
 | 
			
		||||
 | 
			
		||||
}}
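As a standalone illustration of the matrix that MooeeInternalCompute builds and inverts above, here is a minimal plain-Eigen sketch, independent of Grid; Ls, M5 and mq1 are placeholder values, not taken from this diff:

// Build the Shamir-EOFA P_+ matrix as in MooeeInternalCompute: bee on the
// diagonal, -cee on the sub-diagonal, and the corner term dp at (0,Ls-1),
// then check that Eigen's dense inverse really inverts it.
#include <Eigen/Dense>
#include <iostream>

int main(void)
{
  const int    Ls  = 8;                // placeholder
  const double M5  = 1.8;              // placeholder
  const double mq1 = 0.01;             // placeholder
  const double bee = 4.0 - M5 + 1.0;   // bee[s] = 4 - M5 + 1 (Shamir: b=1, c=0)
  const double cee = 1.0;              // cee[s] = 1
  const double dp  = mq1*cee;          // dp = mq1*cee[0] when shift = 0

  Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls);
  for(int s=0; s<Ls;   s++){ Pplus(s,s)   =  bee; }
  for(int s=0; s<Ls-1; s++){ Pplus(s+1,s) = -cee; }
  Pplus(0,Ls-1) = dp;

  Eigen::MatrixXd PplusInv = Pplus.inverse();
  std::cout << "|| Pplus*PplusInv - 1 || = "
            << (Pplus*PplusInv - Eigen::MatrixXd::Identity(Ls,Ls)).norm()
            << std::endl;
  return 0;
}

The production routine uses Eigen::MatrixXcd because Coeff_t may be complex (the switcheroo dispatch above); the real-valued MatrixXd here is just the simplest case.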
 | 
			
lib/qcd/action/fermion/DomainWallEOFAFermion.h (new file, 115 lines)
@@ -0,0 +1,115 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.h
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
#ifndef  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
 | 
			
		||||
#define  GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
 | 
			
		||||
  {
 | 
			
		||||
    public:
 | 
			
		||||
      INHERIT_IMPL_TYPES(Impl);
 | 
			
		||||
 | 
			
		||||
    public:
 | 
			
		||||
      // Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
 | 
			
		||||
      // for red-black preconditioned Shamir EOFA
 | 
			
		||||
      Coeff_t dm;
 | 
			
		||||
      Coeff_t dp;
 | 
			
		||||
 | 
			
		||||
      virtual void Instantiatable(void) {};
 | 
			
		||||
 | 
			
		||||
      // EOFA-specific operations
 | 
			
		||||
      virtual void  Omega      (const FermionField& in, FermionField& out, int sign, int dag);
 | 
			
		||||
      virtual void  Dtilde     (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  DtildeInv  (const FermionField& in, FermionField& out);
 | 
			
		||||
 | 
			
		||||
      // override multiply
 | 
			
		||||
      virtual RealD M          (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual RealD Mdag       (const FermionField& in, FermionField& out);
 | 
			
		||||
 | 
			
		||||
      // half checkerboard operations
 | 
			
		||||
      virtual void  Mooee      (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeDag   (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeInv   (const FermionField& in, FermionField& out);
 | 
			
		||||
      virtual void  MooeeInvDag(const FermionField& in, FermionField& out);
 | 
			
		||||
 | 
			
		||||
      virtual void   M5D       (const FermionField& psi, FermionField& chi);
 | 
			
		||||
      virtual void   M5Ddag    (const FermionField& psi, FermionField& chi);
 | 
			
		||||
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      // Instantiate different versions depending on Impl
 | 
			
		||||
      /////////////////////////////////////////////////////
 | 
			
		||||
      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
 | 
			
		||||
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
 | 
			
		||||
 | 
			
		||||
      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
 | 
			
		||||
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
 | 
			
		||||
        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
 | 
			
		||||
 | 
			
		||||
      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
 | 
			
		||||
        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
 | 
			
		||||
 | 
			
		||||
      virtual void RefreshShiftCoefficients(RealD new_shift);
 | 
			
		||||
 | 
			
		||||
      // Constructors
 | 
			
		||||
      DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
 | 
			
		||||
        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
 | 
			
		||||
        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
 | 
			
		||||
        RealD _M5, const ImplParams& p=ImplParams());
 | 
			
		||||
 | 
			
		||||
    protected:
 | 
			
		||||
      void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
 | 
			
		||||
  };
 | 
			
		||||
}}
 | 
			
		||||
 | 
			
		||||
#define INSTANTIATE_DPERP_DWF_EOFA(A)\
 | 
			
		||||
template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
 | 
			
		||||
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
 | 
			
		||||
template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
 | 
			
		||||
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
 | 
			
		||||
template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
 | 
			
		||||
template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
 | 
			
		||||
 | 
			
		||||
#undef  DOMAIN_WALL_EOFA_DPERP_DENSE
 | 
			
		||||
#define DOMAIN_WALL_EOFA_DPERP_CACHE
 | 
			
		||||
#undef  DOMAIN_WALL_EOFA_DPERP_LINALG
 | 
			
		||||
#define DOMAIN_WALL_EOFA_DPERP_VEC
 | 
			
		||||
 | 
			
		||||
#endif
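A minimal construction sketch against the constructor declared above; the grid factories are the library's usual four/five-dimensional helpers and all parameter values are placeholders, not part of this diff:

#include <Grid/Grid.h>
using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char** argv)
{
  Grid_init(&argc, &argv);

  const int Ls = 8;                                          // placeholder
  GridCartesian*         UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                     GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian*         FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  LatticeGaugeField Umu(UGrid);                              // gauge field initialisation elided
  RealD mq1=0.01, mq2=0.01, mq3=1.0, shift=0.0, M5=1.8;      // placeholder parameters
  int   pm = 1;

  // matches the declaration: (Umu, 5d grids, 4d grids, mq1, mq2, mq3, shift, pm, M5)
  DomainWallEOFAFermion<WilsonImplR> Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
                                         mq1, mq2, mq3, shift, pm, M5);

  Grid_finalize();
  return 0;
}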
 | 
			
lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc (new file, 248 lines)
@@ -0,0 +1,248 @@
 | 
			
		||||
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
 | 
			
		||||
 | 
			
		||||
    // Pminus forwards
 | 
			
		||||
    // Pplus  backwards..
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
 | 
			
		||||
        assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
        // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
        this->M5Dcalls++;
 | 
			
		||||
        this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
 | 
			
		||||
            for(int s=0; s<Ls; s++){
 | 
			
		||||
                auto tmp = psi._odata[0];
 | 
			
		||||
                if(s==0) {
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                } else if(s==(Ls-1)) {
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+0]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                } else {
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->M5Dtime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
 | 
			
		||||
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
 | 
			
		||||
    {
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
        assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
        chi.checkerboard=psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
        this->M5Dcalls++;
 | 
			
		||||
        this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
 | 
			
		||||
            auto tmp = psi._odata[0];
 | 
			
		||||
            for(int s=0; s<Ls; s++){
 | 
			
		||||
                if(s==0) {
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                } else if(s==(Ls-1)) {
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+0]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                } else {
 | 
			
		||||
                    spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
                    chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
                    spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
                    chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->M5Dtime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvCalls++;
 | 
			
		||||
        this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
 | 
			
		||||
 | 
			
		||||
            auto tmp1 = psi._odata[0];
 | 
			
		||||
            auto tmp2 = psi._odata[0];
 | 
			
		||||
 | 
			
		||||
            // flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls  = 12*Ls * (9) = 108*Ls flops
 | 
			
		||||
            // Apply (L^{\prime})^{-1}
 | 
			
		||||
            chi[ss] = psi[ss]; // chi[0]=psi[0]
 | 
			
		||||
            for(int s=1; s<Ls; s++){
 | 
			
		||||
                spProj5p(tmp1, chi[ss+s-1]);
 | 
			
		||||
                chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // L_m^{-1}
 | 
			
		||||
            for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
 | 
			
		||||
                spProj5m(tmp1, chi[ss+s]);
 | 
			
		||||
                chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // U_m^{-1} D^{-1}
 | 
			
		||||
            for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
 | 
			
		||||
                spProj5p(tmp1, chi[ss+Ls-1]);
 | 
			
		||||
                chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
            spProj5m(tmp2, chi[ss+Ls-1]);
 | 
			
		||||
            chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
 | 
			
		||||
 | 
			
		||||
            // Apply U^{-1}
 | 
			
		||||
            for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
                spProj5m(tmp1, chi[ss+s+1]);
 | 
			
		||||
                chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvTime += usecond();
 | 
			
		||||
    }
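Reading off the four loop comments in MooeeInv above, the even-even inverse is applied factor by factor, i.e. (a transcription of the control flow rather than an independent derivation)

\[
\chi \;=\; U^{-1}\, U_m^{-1} D^{-1}\, L_m^{-1}\, (L')^{-1}\, \psi ,
\]

with (L')^{-1} and U^{-1} realised as forward and backward substitution in s, and the L_m, U_m, D pieces acting on the s = L_s - 1 boundary row/column and the diagonal dee.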
 | 
			
		||||
 | 
			
		||||
    template<class Impl>
 | 
			
		||||
    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
 | 
			
		||||
    {
 | 
			
		||||
        GridBase* grid = psi._grid;
 | 
			
		||||
        int Ls = this->Ls;
 | 
			
		||||
 | 
			
		||||
        assert(psi.checkerboard == psi.checkerboard);
 | 
			
		||||
        chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
        std::vector<Coeff_t> ueec(Ls);
 | 
			
		||||
        std::vector<Coeff_t> deec(Ls+1);
 | 
			
		||||
        std::vector<Coeff_t> leec(Ls);
 | 
			
		||||
        std::vector<Coeff_t> ueemc(Ls);
 | 
			
		||||
        std::vector<Coeff_t> leemc(Ls);
 | 
			
		||||
 | 
			
		||||
        for(int s=0; s<ueec.size(); s++){
 | 
			
		||||
            ueec[s]  = conjugate(this->uee[s]);
 | 
			
		||||
            deec[s]  = conjugate(this->dee[s]);
 | 
			
		||||
            leec[s]  = conjugate(this->lee[s]);
 | 
			
		||||
            ueemc[s] = conjugate(this->ueem[s]);
 | 
			
		||||
            leemc[s] = conjugate(this->leem[s]);
 | 
			
		||||
        }
 | 
			
		||||
        deec[Ls] = conjugate(this->dee[Ls]);
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvCalls++;
 | 
			
		||||
        this->MooeeInvTime -= usecond();
 | 
			
		||||
 | 
			
		||||
        parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
 | 
			
		||||
 | 
			
		||||
            auto tmp1 = psi._odata[0];
 | 
			
		||||
            auto tmp2 = psi._odata[0];
 | 
			
		||||
 | 
			
		||||
            // Apply (U^{\prime})^{-dagger}
 | 
			
		||||
            chi[ss] = psi[ss];
 | 
			
		||||
            for(int s=1; s<Ls; s++){
 | 
			
		||||
                spProj5m(tmp1, chi[ss+s-1]);
 | 
			
		||||
                chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // U_m^{-\dagger}
 | 
			
		||||
            for(int s=0; s<Ls-1; s++){
 | 
			
		||||
                spProj5p(tmp1, chi[ss+s]);
 | 
			
		||||
                chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // L_m^{-\dagger} D^{-dagger}
 | 
			
		||||
            for(int s=0; s<Ls-1; s++){
 | 
			
		||||
                spProj5m(tmp1, chi[ss+Ls-1]);
 | 
			
		||||
                chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
            spProj5p(tmp2, chi[ss+Ls-1]);
 | 
			
		||||
            chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
 | 
			
		||||
 | 
			
		||||
            // Apply L^{-dagger}
 | 
			
		||||
            for(int s=Ls-2; s>=0; s--){
 | 
			
		||||
                spProj5p(tmp1, chi[ss+s+1]);
 | 
			
		||||
                chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        this->MooeeInvTime += usecond();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
 | 
			
		||||
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
 | 
			
		||||
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
 | 
			
		||||
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
}}
 | 
			
lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc (new file, 159 lines)
@@ -0,0 +1,159 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc

Copyright (C) 2017

Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>

namespace Grid {
namespace QCD {

    /*
    * Dense matrix versions of routines
    */
    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
    {
        this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
    }

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
    {
        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
    }

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
    {
        int Ls = this->Ls;
        int LLs = psi._grid->_rdimensions[0];
        int vol = psi._grid->oSites()/LLs;

        chi.checkerboard = psi.checkerboard;

        assert(Ls==LLs);

        Eigen::MatrixXd Pplus  = Eigen::MatrixXd::Zero(Ls,Ls);
        Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);

        for(int s=0;s<Ls;s++){
            Pplus(s,s)  = this->bee[s];
            Pminus(s,s) = this->bee[s];
        }

        for(int s=0; s<Ls-1; s++){
            Pminus(s,s+1) = -this->cee[s];
        }

        for(int s=0; s<Ls-1; s++){
            Pplus(s+1,s) = -this->cee[s+1];
        }

        Pplus (0,Ls-1) = this->dp;
        Pminus(Ls-1,0) = this->dm;
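        // The loops above fill P+/P- as bidiagonal matrices with one corner entry;
        // schematically, for Ls=4:
        //        [  bee[0]    0        0       dp    ]         [ bee[0]  -cee[0]    0        0     ]
        // P+  =  [ -cee[1]   bee[1]    0       0     ]   P- =  [   0      bee[1]  -cee[1]    0     ]
        //        [    0     -cee[2]   bee[2]   0     ]         [   0        0      bee[2]  -cee[2] ]
        //        [    0       0      -cee[3]  bee[3] ]         [   dm       0        0      bee[3] ]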

        Eigen::MatrixXd PplusMat ;
        Eigen::MatrixXd PminusMat;

        if(inv) {
            PplusMat  = Pplus.inverse();
            PminusMat = Pminus.inverse();
        } else {
            PplusMat  = Pplus;
            PminusMat = Pminus;
        }

        if(dag){
            PplusMat.adjointInPlace();
            PminusMat.adjointInPlace();
        }

        // For the non-vectorised s-direction this is simple

        for(auto site=0; site<vol; site++){

            SiteSpinor     SiteChi;
            SiteHalfSpinor SitePplus;
            SiteHalfSpinor SitePminus;

            for(int s1=0; s1<Ls; s1++){
                SiteChi = zero;
                for(int s2=0; s2<Ls; s2++){
                    int lex2 = s2 + Ls*site;
                    if(PplusMat(s1,s2) != 0.0){
                        spProj5p(SitePplus,psi[lex2]);
                        accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
                    }
                    if(PminusMat(s1,s2) != 0.0){
                        spProj5m(SitePminus, psi[lex2]);
                        accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
                    }
                }
                chi[s1+Ls*site] = SiteChi*0.5;
            }
        }
    }
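    // The dense path applies the full Ls x Ls matrices (or their Eigen inverses) to every
    // 4d site, an O(Ls^2) cost per site; the cache/ssp variants obtain the same result in
    // O(Ls) via the LDU substitutions, so this version is mainly useful as a reference.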

    #ifdef DOMAIN_WALL_EOFA_DPERP_DENSE

        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);

        template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);

        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);

        template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);

    #endif

}}
lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc (new file, 168 lines)
@@ -0,0 +1,168 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc

Copyright (C) 2017

Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>

namespace Grid {
namespace QCD {

    // FIXME -- make a version of these routines with site loop outermost for cache reuse.
    // Pminus forwards
    // Pplus  backwards
    template<class Impl>
    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
    {
        Coeff_t one(1.0);
        int Ls = this->Ls;
        for(int s=0; s<Ls; s++){
            if(s==0) {
              axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
              axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
            } else if (s==(Ls-1)) {
              axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
              axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
            } else {
              axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
              axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
            }
        }
    }
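    // M5D builds chi one s-slice at a time: for each s it sets chi[s] to diag[s]*phi[s],
    // adds upper[s]*P_- psi[s+1] and lower[s]*P_+ psi[s-1], with the s=0 and s=Ls-1
    // branches wrapping the neighbour index around to the opposite wall.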

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
    {
        Coeff_t one(1.0);
        int Ls = this->Ls;
        for(int s=0; s<Ls; s++){
            if(s==0) {
              axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
              axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
            } else if (s==(Ls-1)) {
              axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
              axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
            } else {
              axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
              axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
            }
        }
    }

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
    {
        Coeff_t one(1.0);
        Coeff_t czero(0.0);
        chi.checkerboard = psi.checkerboard;
        int Ls = this->Ls;

        FermionField tmp(psi._grid);

        // Apply (L^{\prime})^{-1}
        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
        for(int s=1; s<Ls; s++){
            axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
        }

        // L_m^{-1}
        for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
            axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
        }

        // U_m^{-1} D^{-1}
        for(int s=0; s<Ls-1; s++){
            axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
        }
        axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1);
        axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);

        // Apply U^{-1}
        for(int s=Ls-2; s>=0; s--){
            axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1);  // chi[Ls]
        }
    }
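    // MooeeInv solves M_ee chi = psi with the pre-computed LDU factors: forward substitution
    // through L' and the L_m row, the diagonal solve combined with the U_m column, then back
    // substitution through U, exactly as labelled by the comments in the body above.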

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
    {
        Coeff_t one(1.0);
        Coeff_t czero(0.0);
        chi.checkerboard = psi.checkerboard;
        int Ls = this->Ls;

        FermionField tmp(psi._grid);

        // Apply (U^{\prime})^{-dagger}
        axpby_ssp(chi, one, psi, czero, psi, 0, 0);      // chi[0]=psi[0]
        for(int s=1; s<Ls; s++){
            axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
        }

        // U_m^{-\dagger}
        for(int s=0; s<Ls-1; s++){
            axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
        }

        // L_m^{-\dagger} D^{-dagger}
        for(int s=0; s<Ls-1; s++){
            axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
        }
        axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1);
        axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);

        // Apply L^{-dagger}
        for(int s=Ls-2; s>=0; s--){
            axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1);  // chi[Ls]
        }
    }

    #ifdef DOMAIN_WALL_EOFA_DPERP_LINALG

        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);

        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
        INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
        INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
        INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);

    #endif

}}
lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc (new file, 605 lines)
@@ -0,0 +1,605 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc

Copyright (C) 2017

Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>

namespace Grid {
namespace QCD {

    /*
    * Dense matrix versions of routines
    */
    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
    {
        this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
    }

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
    {
        this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
    }

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
    {
        GridBase* grid = psi._grid;
        int Ls  = this->Ls;
        int LLs = grid->_rdimensions[0];
        const int nsimd = Simd::Nsimd();

        Vector<iSinglet<Simd> > u(LLs);
        Vector<iSinglet<Simd> > l(LLs);
        Vector<iSinglet<Simd> > d(LLs);

        assert(Ls/LLs == nsimd);
        assert(phi.checkerboard == psi.checkerboard);

        chi.checkerboard = psi.checkerboard;

        // just directly address via type pun
        typedef typename Simd::scalar_type scalar_type;
        scalar_type* u_p = (scalar_type*) &u[0];
        scalar_type* l_p = (scalar_type*) &l[0];
        scalar_type* d_p = (scalar_type*) &d[0];

        for(int o=0;o<LLs;o++){ // outer
        for(int i=0;i<nsimd;i++){ //inner
            int s  = o + i*LLs;
            int ss = o*nsimd + i;
            u_p[ss] = upper[s];
            l_p[ss] = lower[s];
            d_p[ss] = diag[s];
        }}

        this->M5Dcalls++;
        this->M5Dtime -= usecond();

        assert(Nc == 3);

        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs

            #if 0

                alignas(64) SiteHalfSpinor hp;
                alignas(64) SiteHalfSpinor hm;
                alignas(64) SiteSpinor fp;
                alignas(64) SiteSpinor fm;

                for(int v=0; v<LLs; v++){

                    int vp = (v+1)%LLs;
                    int vm = (v+LLs-1)%LLs;

                    spProj5m(hp, psi[ss+vp]);
                    spProj5p(hm, psi[ss+vm]);

                    if (vp <= v){ rotate(hp, hp, 1); }
                    if (vm >= v){ rotate(hm, hm, nsimd-1); }

                    hp = 0.5*hp;
                    hm = 0.5*hm;

                    spRecon5m(fp, hp);
                    spRecon5p(fm, hm);

                    chi[ss+v] = d[v]*phi[ss+v];
                    chi[ss+v] = chi[ss+v] + u[v]*fp;
                    chi[ss+v] = chi[ss+v] + l[v]*fm;

                }

            #else

                for(int v=0; v<LLs; v++){

                    vprefetch(psi[ss+v+LLs]);

                    int vp = (v==LLs-1) ? 0     : v+1;
                    int vm = (v==0)     ? LLs-1 : v-1;

                    Simd hp_00 = psi[ss+vp]()(2)(0);
                    Simd hp_01 = psi[ss+vp]()(2)(1);
                    Simd hp_02 = psi[ss+vp]()(2)(2);
                    Simd hp_10 = psi[ss+vp]()(3)(0);
                    Simd hp_11 = psi[ss+vp]()(3)(1);
                    Simd hp_12 = psi[ss+vp]()(3)(2);

                    Simd hm_00 = psi[ss+vm]()(0)(0);
                    Simd hm_01 = psi[ss+vm]()(0)(1);
                    Simd hm_02 = psi[ss+vm]()(0)(2);
                    Simd hm_10 = psi[ss+vm]()(1)(0);
                    Simd hm_11 = psi[ss+vm]()(1)(1);
                    Simd hm_12 = psi[ss+vm]()(1)(2);

                    if(vp <= v){
                        hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
                        hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
                        hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
                        hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
                        hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
                        hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
                    }

                    if(vm >= v){
                        hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
                        hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
                        hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
                        hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
                        hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
                        hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
                    }

                    // Can force these to real arithmetic and save 2x.
                    Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
                    Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
                    Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
                    Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
                    Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
                    Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
                    Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
                    Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
                    Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
                    Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
                    Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
                    Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);

                    vstream(chi[ss+v]()(0)(0), p_00);
                    vstream(chi[ss+v]()(0)(1), p_01);
                    vstream(chi[ss+v]()(0)(2), p_02);
                    vstream(chi[ss+v]()(1)(0), p_10);
                    vstream(chi[ss+v]()(1)(1), p_11);
                    vstream(chi[ss+v]()(1)(2), p_12);
                    vstream(chi[ss+v]()(2)(0), p_20);
                    vstream(chi[ss+v]()(2)(1), p_21);
                    vstream(chi[ss+v]()(2)(2), p_22);
                    vstream(chi[ss+v]()(3)(0), p_30);
                    vstream(chi[ss+v]()(3)(1), p_31);
                    vstream(chi[ss+v]()(3)(2), p_32);
                }

            #endif
        }

        this->M5Dtime += usecond();
    }
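    // In this Ls-vectorised layout neighbouring s-slices live in adjacent SIMD lanes, so
    // psi[s+1]/psi[s-1] are fetched from the vp/vm block and lane-rotated only when the
    // neighbour wraps around the local LLs block; tRotate<2> appears to shift the vector
    // by one complex number (two reals), matching the scalar #if 0 reference code above.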

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
        FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
    {
        GridBase* grid = psi._grid;
        int Ls  = this->Ls;
        int LLs = grid->_rdimensions[0];
        int nsimd = Simd::Nsimd();

        Vector<iSinglet<Simd> > u(LLs);
        Vector<iSinglet<Simd> > l(LLs);
        Vector<iSinglet<Simd> > d(LLs);

        assert(Ls/LLs == nsimd);
        assert(phi.checkerboard == psi.checkerboard);

        chi.checkerboard = psi.checkerboard;

        // just directly address via type pun
        typedef typename Simd::scalar_type scalar_type;
        scalar_type* u_p = (scalar_type*) &u[0];
        scalar_type* l_p = (scalar_type*) &l[0];
        scalar_type* d_p = (scalar_type*) &d[0];

        for(int o=0; o<LLs; o++){ // outer
        for(int i=0; i<nsimd; i++){ //inner
            int s  = o + i*LLs;
            int ss = o*nsimd + i;
            u_p[ss] = upper[s];
            l_p[ss] = lower[s];
            d_p[ss] = diag[s];
        }}

        this->M5Dcalls++;
        this->M5Dtime -= usecond();

        parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs

        #if 0

            alignas(64) SiteHalfSpinor hp;
            alignas(64) SiteHalfSpinor hm;
            alignas(64) SiteSpinor fp;
            alignas(64) SiteSpinor fm;

            for(int v=0; v<LLs; v++){

                int vp = (v+1)%LLs;
                int vm = (v+LLs-1)%LLs;

                spProj5p(hp, psi[ss+vp]);
                spProj5m(hm, psi[ss+vm]);

                if(vp <= v){ rotate(hp, hp, 1); }
                if(vm >= v){ rotate(hm, hm, nsimd-1); }

                hp = hp*0.5;
                hm = hm*0.5;
                spRecon5p(fp, hp);
                spRecon5m(fm, hm);

                chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
                chi[ss+v] = chi[ss+v]     +l[v]*fm;
            }

        #else

            for(int v=0; v<LLs; v++){

                vprefetch(psi[ss+v+LLs]);

                int vp = (v == LLs-1) ? 0     : v+1;
                int vm = (v == 0    ) ? LLs-1 : v-1;

                Simd hp_00 = psi[ss+vp]()(0)(0);
                Simd hp_01 = psi[ss+vp]()(0)(1);
                Simd hp_02 = psi[ss+vp]()(0)(2);
                Simd hp_10 = psi[ss+vp]()(1)(0);
                Simd hp_11 = psi[ss+vp]()(1)(1);
                Simd hp_12 = psi[ss+vp]()(1)(2);

                Simd hm_00 = psi[ss+vm]()(2)(0);
                Simd hm_01 = psi[ss+vm]()(2)(1);
                Simd hm_02 = psi[ss+vm]()(2)(2);
                Simd hm_10 = psi[ss+vm]()(3)(0);
                Simd hm_11 = psi[ss+vm]()(3)(1);
                Simd hm_12 = psi[ss+vm]()(3)(2);

                if (vp <= v){
                    hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
                    hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
                    hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
                    hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
                    hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
                    hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
                }

                if(vm >= v){
                    hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
                    hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
                    hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
                    hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
                    hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
                    hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
                }

                Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
                Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
                Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
                Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
                Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
                Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
                Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
                Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
                Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
                Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
                Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
                Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);

                vstream(chi[ss+v]()(0)(0), p_00);
                vstream(chi[ss+v]()(0)(1), p_01);
                vstream(chi[ss+v]()(0)(2), p_02);
                vstream(chi[ss+v]()(1)(0), p_10);
                vstream(chi[ss+v]()(1)(1), p_11);
                vstream(chi[ss+v]()(1)(2), p_12);
                vstream(chi[ss+v]()(2)(0), p_20);
                vstream(chi[ss+v]()(2)(1), p_21);
                vstream(chi[ss+v]()(2)(2), p_22);
                vstream(chi[ss+v]()(3)(0), p_30);
                vstream(chi[ss+v]()(3)(1), p_31);
                vstream(chi[ss+v]()(3)(2), p_32);
            }
        #endif

        }

        this->M5Dtime += usecond();
    }

    #ifdef AVX512
        #include<simd/Intel512common.h>
        #include<simd/Intel512avx.h>
        #include<simd/Intel512single.h>
    #endif

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
    {
        #ifndef AVX512
        {
            SiteHalfSpinor BcastP;
            SiteHalfSpinor BcastM;
            SiteHalfSpinor SiteChiP;
            SiteHalfSpinor SiteChiM;

            // Ls*Ls * 2 * 12 * vol flops
            for(int s1=0; s1<LLs; s1++){

                for(int s2=0; s2<LLs; s2++){
                for(int l=0; l < Simd::Nsimd(); l++){ // simd lane

                    int s = s2 + l*LLs;
                    int lex = s2 + LLs*site;

                    if( s2==0 && l==0 ){
                        SiteChiP=zero;
                        SiteChiM=zero;
                    }

                    for(int sp=0; sp<2;  sp++){
                    for(int co=0; co<Nc; co++){
                        vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
                    }}

                    for(int sp=0; sp<2;  sp++){
                    for(int co=0; co<Nc; co++){
                        vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
                    }}

                    for(int sp=0; sp<2;  sp++){
                    for(int co=0; co<Nc; co++){
                        SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
                        SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
                    }}
                }}

                {
                    int lex = s1 + LLs*site;
                    for(int sp=0; sp<2;  sp++){
                    for(int co=0; co<Nc; co++){
                        vstream(chi[lex]()(sp)(co),   SiteChiP()(sp)(co));
                        vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
                    }}
                }
            }

        }
        #else
        {
            // pointers
            //  MASK_REGS;
            #define Chi_00 %%zmm1
            #define Chi_01 %%zmm2
            #define Chi_02 %%zmm3
            #define Chi_10 %%zmm4
            #define Chi_11 %%zmm5
            #define Chi_12 %%zmm6
            #define Chi_20 %%zmm7
            #define Chi_21 %%zmm8
            #define Chi_22 %%zmm9
            #define Chi_30 %%zmm10
            #define Chi_31 %%zmm11
            #define Chi_32 %%zmm12

            #define BCAST0  %%zmm13
            #define BCAST1  %%zmm14
            #define BCAST2  %%zmm15
            #define BCAST3  %%zmm16
            #define BCAST4  %%zmm17
            #define BCAST5  %%zmm18
            #define BCAST6  %%zmm19
            #define BCAST7  %%zmm20
            #define BCAST8  %%zmm21
            #define BCAST9  %%zmm22
            #define BCAST10 %%zmm23
            #define BCAST11 %%zmm24

            int incr = LLs*LLs*sizeof(iSinglet<Simd>);
            for(int s1=0; s1<LLs; s1++){

                for(int s2=0; s2<LLs; s2++){

                    int lex = s2 + LLs*site;
                    uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
                    uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
                    uint64_t a2 = (uint64_t) &psi[lex];

                    for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
                        if((s2+l)==0) {
                            asm(
                                    VPREFETCH1(0,%2)              VPREFETCH1(0,%1)
                                    VPREFETCH1(12,%2)  	          VPREFETCH1(13,%2)
                                    VPREFETCH1(14,%2)  	          VPREFETCH1(15,%2)
                                    VBCASTCDUP(0,%2,BCAST0)
                                    VBCASTCDUP(1,%2,BCAST1)
                                    VBCASTCDUP(2,%2,BCAST2)
                                    VBCASTCDUP(3,%2,BCAST3)
                                    VBCASTCDUP(4,%2,BCAST4)       VMULMEM(0,%0,BCAST0,Chi_00)
                                    VBCASTCDUP(5,%2,BCAST5)       VMULMEM(0,%0,BCAST1,Chi_01)
                                    VBCASTCDUP(6,%2,BCAST6)       VMULMEM(0,%0,BCAST2,Chi_02)
                                    VBCASTCDUP(7,%2,BCAST7)       VMULMEM(0,%0,BCAST3,Chi_10)
                                    VBCASTCDUP(8,%2,BCAST8)       VMULMEM(0,%0,BCAST4,Chi_11)
                                    VBCASTCDUP(9,%2,BCAST9)       VMULMEM(0,%0,BCAST5,Chi_12)
                                    VBCASTCDUP(10,%2,BCAST10)     VMULMEM(0,%1,BCAST6,Chi_20)
                                    VBCASTCDUP(11,%2,BCAST11)     VMULMEM(0,%1,BCAST7,Chi_21)
                                    VMULMEM(0,%1,BCAST8,Chi_22)
                                    VMULMEM(0,%1,BCAST9,Chi_30)
                                    VMULMEM(0,%1,BCAST10,Chi_31)
                                    VMULMEM(0,%1,BCAST11,Chi_32)
                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
                        } else {
                            asm(
                                    VBCASTCDUP(0,%2,BCAST0)   VMADDMEM(0,%0,BCAST0,Chi_00)
                                    VBCASTCDUP(1,%2,BCAST1)   VMADDMEM(0,%0,BCAST1,Chi_01)
                                    VBCASTCDUP(2,%2,BCAST2)   VMADDMEM(0,%0,BCAST2,Chi_02)
                                    VBCASTCDUP(3,%2,BCAST3)   VMADDMEM(0,%0,BCAST3,Chi_10)
                                    VBCASTCDUP(4,%2,BCAST4)   VMADDMEM(0,%0,BCAST4,Chi_11)
                                    VBCASTCDUP(5,%2,BCAST5)   VMADDMEM(0,%0,BCAST5,Chi_12)
                                    VBCASTCDUP(6,%2,BCAST6)   VMADDMEM(0,%1,BCAST6,Chi_20)
                                    VBCASTCDUP(7,%2,BCAST7)   VMADDMEM(0,%1,BCAST7,Chi_21)
                                    VBCASTCDUP(8,%2,BCAST8)   VMADDMEM(0,%1,BCAST8,Chi_22)
                                    VBCASTCDUP(9,%2,BCAST9)   VMADDMEM(0,%1,BCAST9,Chi_30)
                                    VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
                                    VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
                                    : : "r" (a0), "r" (a1), "r" (a2)                            );
                        }
                        a0 = a0 + incr;
                        a1 = a1 + incr;
                        a2 = a2 + sizeof(Simd::scalar_type);
                    }
                }

                {
                  int lexa = s1+LLs*site;
                  asm (
                     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01)  VSTORE(2 ,%0,Chi_02)
                     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11)  VSTORE(5 ,%0,Chi_12)
                     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21)  VSTORE(8 ,%0,Chi_22)
                     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31)  VSTORE(11,%0,Chi_32)
                     : : "r" ((uint64_t)&chi[lexa]) : "memory" );

                }
            }
        }

        #undef Chi_00
        #undef Chi_01
        #undef Chi_02
        #undef Chi_10
        #undef Chi_11
        #undef Chi_12
        #undef Chi_20
        #undef Chi_21
        #undef Chi_22
        #undef Chi_30
        #undef Chi_31
        #undef Chi_32

        #undef BCAST0
        #undef BCAST1
        #undef BCAST2
        #undef BCAST3
        #undef BCAST4
        #undef BCAST5
        #undef BCAST6
        #undef BCAST7
        #undef BCAST8
        #undef BCAST9
        #undef BCAST10
        #undef BCAST11
        #endif
    };
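    // MooeeInternalAsm applies the per-site Ls x Ls matrices Matp/Matm (the two chiral
    // halves) to one 4d site: the generic #ifndef AVX512 branch broadcasts each scalar of
    // psi across SIMD lanes and accumulates with real_madd, while the AVX512 branch does
    // the same multiply-accumulate with inline assembly over the zmm registers named above.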

    // Z-mobius version
    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
        int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
    {
        std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
        exit(-1);
    };

    template<class Impl>
    void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
    {
        int Ls  = this->Ls;
        int LLs = psi._grid->_rdimensions[0];
        int vol = psi._grid->oSites()/LLs;

        chi.checkerboard = psi.checkerboard;

        Vector<iSinglet<Simd> > Matp;
        Vector<iSinglet<Simd> > Matm;
        Vector<iSinglet<Simd> > *_Matp;
        Vector<iSinglet<Simd> > *_Matm;

        //  MooeeInternalCompute(dag,inv,Matp,Matm);
        if(inv && dag){
            _Matp = &this->MatpInvDag;
            _Matm = &this->MatmInvDag;
        }

        if(inv && (!dag)){
            _Matp = &this->MatpInv;
            _Matm = &this->MatmInv;
        }

        if(!inv){
            MooeeInternalCompute(dag, inv, Matp, Matm);
            _Matp = &Matp;
            _Matm = &Matm;
        }

        assert(_Matp->size() == Ls*LLs);

        this->MooeeInvCalls++;
        this->MooeeInvTime -= usecond();

        if(switcheroo<Coeff_t>::iscomplex()){
            parallel_for(auto site=0; site<vol; site++){
                MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
            }
        } else {
            parallel_for(auto site=0; site<vol; site++){
                MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
            }
        }

        this->MooeeInvTime += usecond();
    }
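    // For inv==1 the routine reuses the inverse matrices cached in MatpInv/MatmInv (or their
    // daggered counterparts); only the inv==0 path recomputes them via MooeeInternalCompute.
    // The switcheroo<Coeff_t>::iscomplex() test routes complex (zMobius) coefficients to
    // MooeeInternalZAsm, which currently aborts for EOFA.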

    #ifdef DOMAIN_WALL_EOFA_DPERP_VEC

        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);

        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
        INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
        INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);

        template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);

        template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
        template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);

    #endif

}}
@@ -1,6 +1,6 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 
    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/qcd/action/fermion/Fermion_base_aggregate.h

@@ -38,6 +38,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
// - ContinuedFractionFermion5D.cc
// - WilsonFermion.cc
// - WilsonKernels.cc
// - DomainWallEOFAFermion.cc
// - MobiusEOFAFermion.cc
//
// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
// for EVERY .cc file. This define centralises the list and restores global push of impl cases
@@ -55,8 +57,9 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>     // Cayley types
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
#include <Grid/qcd/action/fermion/MobiusFermion.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
@@ -113,6 +116,14 @@ typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;

typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;

typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
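// The new EOFA typedefs follow the existing DomainWallFermion naming directly above:
// R/F/D select the default, single and double precision Wilson impls, while RL/FH/DF are
// presumably the reduced/mixed-precision comms variants, mirroring the non-EOFA blocks.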

typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
@@ -121,6 +132,14 @@ typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;

typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;

typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;

typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
@@ -129,7 +148,7 @@ typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;

// Ls vectorised 
// Ls vectorised
typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
@@ -138,6 +157,14 @@ typedef DomainWallFermion<DomainWallVec5dImplRL> DomainWallFermionVec5dRL;
typedef DomainWallFermion<DomainWallVec5dImplFH> DomainWallFermionVec5dFH;
typedef DomainWallFermion<DomainWallVec5dImplDF> DomainWallFermionVec5dDF;

typedef DomainWallEOFAFermion<DomainWallVec5dImplR> DomainWallEOFAFermionVec5dR;
typedef DomainWallEOFAFermion<DomainWallVec5dImplF> DomainWallEOFAFermionVec5dF;
typedef DomainWallEOFAFermion<DomainWallVec5dImplD> DomainWallEOFAFermionVec5dD;

typedef DomainWallEOFAFermion<DomainWallVec5dImplRL> DomainWallEOFAFermionVec5dRL;
typedef DomainWallEOFAFermion<DomainWallVec5dImplFH> DomainWallEOFAFermionVec5dFH;
typedef DomainWallEOFAFermion<DomainWallVec5dImplDF> DomainWallEOFAFermionVec5dDF;

typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
@@ -146,6 +173,14 @@ typedef MobiusFermion<DomainWallVec5dImplRL> MobiusFermionVec5dRL;
typedef MobiusFermion<DomainWallVec5dImplFH> MobiusFermionVec5dFH;
typedef MobiusFermion<DomainWallVec5dImplDF> MobiusFermionVec5dDF;

typedef MobiusEOFAFermion<DomainWallVec5dImplR> MobiusEOFAFermionVec5dR;
typedef MobiusEOFAFermion<DomainWallVec5dImplF> MobiusEOFAFermionVec5dF;
typedef MobiusEOFAFermion<DomainWallVec5dImplD> MobiusEOFAFermionVec5dD;

typedef MobiusEOFAFermion<DomainWallVec5dImplRL> MobiusEOFAFermionVec5dRL;
typedef MobiusEOFAFermion<DomainWallVec5dImplFH> MobiusEOFAFermionVec5dFH;
typedef MobiusEOFAFermion<DomainWallVec5dImplDF> MobiusEOFAFermionVec5dDF;

typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
@@ -206,6 +241,14 @@ typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;

typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;

typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;

typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
@@ -222,6 +265,14 @@ typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
 | 
			
		||||
typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
 | 
			
		||||
typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
 | 
			
		||||
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
 | 
			
		||||
typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
 | 
			
		||||
 | 
			
		||||
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
 | 
			
		||||
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
 | 
			
		||||
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
 | 
			
		||||
@@ -237,4 +288,11 @@ typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermion
 | 
			
		||||
 | 
			
		||||
  }}
 | 
			
		||||
 | 
			
		||||
////////////////////
 | 
			
		||||
// Scalar QED actions
 | 
			
		||||
// TODO: this needs to move to another header after rename to Fermion.h
 | 
			
		||||
////////////////////
 | 
			
		||||
#include <Grid/qcd/action/scalar/Scalar.h>
 | 
			
		||||
#include <Grid/qcd/action/gauge/Photon.h>
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
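The EOFA typedefs added above follow the existing R/F/D (plus RL/FH/DF) naming pattern. As a rough usage sketch only, a MobiusEOFAFermionR would be constructed with the signature introduced later in this compare; the grid helpers and every parameter value below are illustrative assumptions, not part of this changeset:

    int Ls = 16;                                                        // illustrative
    GridCartesian         *UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                        GridDefaultSimd(Nd,vComplex::Nsimd()), GridDefaultMpi());
    GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
    LatticeGaugeField Umu(UGrid);                                       // loaded or generated elsewhere
    RealD mq1=0.01, mq2=0.5, mq3=1.0, shift=0.0, M5=1.8, b=2.5, c=1.5;  // illustrative values
    int pm = 1;
    MobiusEOFAFermionR Lop(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
                           mq1, mq2, mq3, shift, pm, M5, b, c);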
@@ -538,6 +538,12 @@ class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresent

 }


 template <class ref>
 inline void loadLinkElement(Simd &reg, ref &memory) {
   reg = memory;
 }

 inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
 {
   conformable(Uds._grid,GaugeGrid);

@@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
{
 Compressor compressor;
 int LLs = in._grid->_rdimensions[0];



  DhopTotalTime -= usecond();
  DhopCommTime -= usecond();
  st.HaloExchange(in,compressor);
  DhopCommTime += usecond();

  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  if (dag == DaggerYes) {
    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
@@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
    }
  }
  DhopComputeTime += usecond();
  DhopTotalTime   += usecond();
}


template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{
  DhopCalls+=1;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check

@@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
  DhopCalls+=1;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check

@@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
  DhopCalls+=2;
  conformable(in._grid,FermionGrid()); // verifies full grid
  conformable(in._grid,out._grid);

@@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
}

template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Report(void)
{
  std::vector<int> latt = GridDefaultLatt();
  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  RealD NP = _FourDimGrid->_Nprocessors;
  RealD NN = _FourDimGrid->NodeCount();

  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;

  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : "
	    << DhopCalls   << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : "
	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : "
	    << DhopCommTime    / DhopCalls << " us" << std::endl;
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : "
	    << DhopComputeTime / DhopCalls << " us" << std::endl;

  // Average the compute time
  _FourDimGrid->GlobalSum(DhopComputeTime);
  DhopComputeTime/=NP;

  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;

  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;

  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
{
  DhopCalls       = 0;
  DhopTotalTime    = 0;
  DhopCommTime    = 0;
  DhopComputeTime = 0;
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
}

/////////////////////////////////////////////////////////////////////////
// Implement the general interface. Here we use SAME mass on all slices

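Report() divides the accumulated timers by DhopCalls, so the counters are intended to be zeroed before a timing window. A minimal usage sketch, assuming an already-constructed ImprovedStaggeredFermion5D instance Ds and source/result fields (the object and field names are placeholders, not part of this diff):

    // Hypothetical benchmark window around the instrumented Dhop
    Ds.ZeroCounters();
    for(int i=0; i<ncall; i++) Ds.Dhop(src, res, DaggerNo);
    Ds.Report();   // prints per-call comm/compute times and the derived mflops figures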
@@ -55,6 +55,16 @@ namespace QCD {
      FermionField _tmp;
      FermionField &tmp(void) { return _tmp; }

      ////////////////////////////////////////
      // Performance monitoring
      ////////////////////////////////////////
      void Report(void);
      void ZeroCounters(void);
      double DhopTotalTime;
      double DhopCalls;
      double DhopCommTime;
      double DhopComputeTime;

      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
      ///////////////////////////////////////////////////////////////

							
								
								
									
502  lib/qcd/action/fermion/MobiusEOFAFermion.cc  Normal file
@@ -0,0 +1,502 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc

Copyright (C) 2017

Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */

#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>

namespace Grid {
namespace QCD {

  template<class Impl>
    MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
      GaugeField            &_Umu,
      GridCartesian         &FiveDimGrid,
      GridRedBlackCartesian &FiveDimRedBlackGrid,
      GridCartesian         &FourDimGrid,
      GridRedBlackCartesian &FourDimRedBlackGrid,
      RealD _mq1, RealD _mq2, RealD _mq3,
      RealD _shift, int _pm, RealD _M5,
      RealD _b, RealD _c, const ImplParams &p) :
    AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
        FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
        _shift, _pm, _M5, _b, _c, p)
    {
      int Ls = this->Ls;

      RealD eps = 1.0;
      Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
      assert(zdata->n == this->Ls);

      std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
        ",c=" << _c << ") with Ls=" << Ls << std::endl;
      this->SetCoefficientsTanh(zdata, _b, _c);
      std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
        ",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
        ",pm=" << _pm << ")" << std::endl;

      Approx::zolotarev_free(zdata);

      if(_shift != 0.0){
        SetCoefficientsPrecondShiftOps();
      } else {
        Mooee_shift.resize(Ls, 0.0);
        MooeeInv_shift_lc.resize(Ls, 0.0);
        MooeeInv_shift_norm.resize(Ls, 0.0);
        MooeeInvDag_shift_lc.resize(Ls, 0.0);
        MooeeInvDag_shift_norm.resize(Ls, 0.0);
      }
    }

    /*
     Additional EOFA operators only called outside the inverter.
     Since speed is not essential, simple axpby-style
     implementations should be fine.
    */
    template<class Impl>
    void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
    {
      int Ls = this->Ls;
      RealD alpha = this->alpha;

      Din = zero;
      if((sign == 1) && (dag == 0)) { // \Omega_{+}
        for(int s=0; s<Ls; ++s){
          axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
        }
      } else if((sign == -1) && (dag == 0)) { // \Omega_{-}
        for(int s=0; s<Ls; ++s){
          axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
        }
      } else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
        for(int sp=0; sp<Ls; ++sp){
          axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
        }
      } else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
        for(int sp=0; sp<Ls; ++sp){
          axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
        }
      }
    }

    // This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
    // It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
    template<class Impl>
    void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
    {
      int Ls    = this->Ls;
      RealD b   = 0.5 * ( 1.0 + this->alpha );
      RealD c   = 0.5 * ( 1.0 - this->alpha );
      RealD mq1 = this->mq1;

      for(int s=0; s<Ls; ++s){
        if(s == 0) {
          axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
          axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
        } else if(s == (Ls-1)) {
          axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
          axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
        } else {
          axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
          axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
        }
      }
    }

    template<class Impl>
    void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
    {
      int Ls = this->Ls;
      RealD m = this->mq1;
      RealD c = 0.5 * this->alpha;
      RealD d = 0.5;

      RealD DtInv_p(0.0), DtInv_m(0.0);
      RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
      FermionField tmp(this->FermionGrid());

      for(int s=0; s<Ls; ++s){
      for(int sp=0; sp<Ls; ++sp){

        DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
        DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
        DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
        DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);

        if(sp == 0){
          axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
          axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
        } else {
          axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
          axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
        }

      }}
    }

    /*****************************************************************************************************/

    template<class Impl>
    RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
    {
      int Ls = this->Ls;

      FermionField Din(psi._grid);

      this->Meooe5D(psi, Din);
      this->DW(Din, chi, DaggerNo);
      axpby(chi, 1.0, 1.0, chi, psi);
      this->M5D(psi, chi);
      return(norm2(chi));
    }

    template<class Impl>
    RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
    {
      int Ls = this->Ls;

      FermionField Din(psi._grid);

      this->DW(psi, Din, DaggerYes);
      this->MeooeDag5D(Din, chi);
      this->M5Ddag(psi, chi);
      axpby(chi, 1.0, 1.0, chi, psi);
      return(norm2(chi));
    }

    // Performance critical fermion operators called inside the inverter

    template<class Impl>
    void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
    {
      int Ls = this->Ls;

      std::vector<Coeff_t> diag(Ls,1.0);
      std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
      std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;

      // no shift term
      if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }

      // fused M + shift operation
      else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
    }

    template<class Impl>
    void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
    {
      int Ls = this->Ls;

      std::vector<Coeff_t> diag(Ls,1.0);
      std::vector<Coeff_t> upper(Ls,-1.0);  upper[Ls-1] = this->mq1;
      std::vector<Coeff_t> lower(Ls,-1.0);  lower[0]    = this->mq1;

      // no shift term
      if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }

      // fused M + shift operation
      else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
    }

    // half checkerboard operations
    template<class Impl>
    void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
    {
      int Ls = this->Ls;

      // coefficients of Mooee
      std::vector<Coeff_t> diag = this->bee;
      std::vector<Coeff_t> upper(Ls);
      std::vector<Coeff_t> lower(Ls);
      for(int s=0; s<Ls; s++){
        upper[s] = -this->cee[s];
        lower[s] = -this->cee[s];
      }
      upper[Ls-1] *= -this->mq1;
      lower[0]    *= -this->mq1;

      // no shift term
      if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }

      // fused M + shift operation
      else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
    }

    template<class Impl>
    void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
    {
      int Ls = this->Ls;

      // coefficients of MooeeDag
      std::vector<Coeff_t> diag = this->bee;
      std::vector<Coeff_t> upper(Ls);
      std::vector<Coeff_t> lower(Ls);
      for(int s=0; s<Ls; s++){
        if(s==0) {
          upper[s] = -this->cee[s+1];
          lower[s] = this->mq1*this->cee[Ls-1];
        } else if(s==(Ls-1)) {
          upper[s] = this->mq1*this->cee[0];
          lower[s] = -this->cee[s-1];
        } else {
          upper[s] = -this->cee[s+1];
          lower[s] = -this->cee[s-1];
        }
      }

      // no shift term
      if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }

      // fused M + shift operation
      else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
    }

    /****************************************************************************************/

    // Computes coefficients for applying Cayley preconditioned shift operators
    //  (Mooee + \Delta) --> Mooee_shift
    //  (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
    //  (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
    // For the latter two cases, the operation takes the form
    //  [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
    //      ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
    template<class Impl>
    void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
    {
      int   Ls    = this->Ls;
      int   pm    = this->pm;
      RealD alpha = this->alpha;
      RealD k     = this->k;
      RealD mq1   = this->mq1;
      RealD shift = this->shift;

      // Initialize
      Mooee_shift.resize(Ls);
      MooeeInv_shift_lc.resize(Ls);
      MooeeInv_shift_norm.resize(Ls);
      MooeeInvDag_shift_lc.resize(Ls);
      MooeeInvDag_shift_norm.resize(Ls);

      // Construct Mooee_shift
      int idx(0);
      Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
                  ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
      for(int s=0; s<Ls; ++s){
        idx = (pm == 1) ? (s) : (Ls-1-s);
        Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
      }

      // Tridiagonal solve for MooeeInvDag_shift_lc
      {
        Coeff_t m(0.0);
        std::vector<Coeff_t> d = Mooee_shift;
        std::vector<Coeff_t> u(Ls,0.0);
        std::vector<Coeff_t> y(Ls,0.0);
        std::vector<Coeff_t> q(Ls,0.0);
        if(pm == 1){ u[0] = 1.0; }
        else{ u[Ls-1] = 1.0; }

        // Tridiagonal matrix algorithm + Sherman-Morrison formula
        //
        // We solve
        //  ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
        // where Mooee' is the tridiagonal part of Mooee_{+}, and
        // u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
        // so that the outer-product u \otimes v gives the (0,Ls-1)
        // entry of Mooee_{+}.
        //
        // We do this as two solves: Mooee'*y = d and Mooee'*q = u,
        // and then construct the solution to the original system
        //  MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
        if(pm == 1){
          for(int s=1; s<Ls; ++s){
            m = -this->cee[s] / this->bee[s-1];
            d[s] -= m*d[s-1];
            u[s] -= m*u[s-1];
          }
        }
        y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
        q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
        for(int s=Ls-2; s>=0; --s){
          if(pm == 1){
            y[s] = d[s] / this->bee[s];
            q[s] = u[s] / this->bee[s];
          } else {
            y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
            q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
          }
        }

        // Construct MooeeInvDag_shift_lc
        for(int s=0; s<Ls; ++s){
          if(pm == 1){
            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
              (1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
          } else {
            MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
              (1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
          }
        }

        // Compute remaining coefficients
        N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
        for(int s=0; s<Ls; ++s){

          // MooeeInv_shift_lc
          if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
          else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }

          // MooeeInv_shift_norm
          MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;

          // MooeeInvDag_shift_norm
          if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
          else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
            ( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
        }
      }
    }

    // Recompute coefficients for a different value of shift constant
    template<class Impl>
    void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
    {
      this->shift = new_shift;
      if(new_shift != 0.0){
        SetCoefficientsPrecondShiftOps();
      } else {
        int Ls = this->Ls;
        Mooee_shift.resize(Ls,0.0);
        MooeeInv_shift_lc.resize(Ls,0.0);
        MooeeInv_shift_norm.resize(Ls,0.0);
        MooeeInvDag_shift_lc.resize(Ls,0.0);
        MooeeInvDag_shift_norm.resize(Ls,0.0);
      }
    }

    template<class Impl>
    void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
      Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
    {
      int Ls = this->Ls;

      GridBase* grid = this->FermionRedBlackGrid();
      int LLs = grid->_rdimensions[0];

      if(LLs == Ls){ return; } // Not vectorised in 5th direction

      Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
      Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);

      for(int s=0; s<Ls; s++){
        Pplus(s,s)  = this->bee[s];
        Pminus(s,s) = this->bee[s];
      }

      for(int s=0; s<Ls-1; s++){
        Pminus(s,s+1) = -this->cee[s];
        Pplus(s+1,s) = -this->cee[s+1];
      }

      Pplus (0,Ls-1) = this->mq1*this->cee[0];
      Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];

      if(this->shift != 0.0){
        RealD c = 0.5 * this->alpha;
        RealD d = 0.5;
        RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
        if(this->pm == 1) {
          for(int s=0; s<Ls; ++s){
            Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
          }
        } else {
          for(int s=0; s<Ls; ++s){
            Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
          }
        }
      }

      Eigen::MatrixXcd PplusMat ;
      Eigen::MatrixXcd PminusMat;

      if(inv) {
        PplusMat  = Pplus.inverse();
        PminusMat = Pminus.inverse();
      } else {
        PplusMat  = Pplus;
        PminusMat = Pminus;
      }

      if(dag){
        PplusMat.adjointInPlace();
        PminusMat.adjointInPlace();
      }

      typedef typename SiteHalfSpinor::scalar_type scalar_type;
      const int Nsimd = Simd::Nsimd();
      Matp.resize(Ls*LLs);
      Matm.resize(Ls*LLs);

      for(int s2=0; s2<Ls; s2++){
      for(int s1=0; s1<LLs; s1++){
        int istride = LLs;
        int ostride = 1;
        Simd Vp;
        Simd Vm;
        scalar_type *sp = (scalar_type*) &Vp;
        scalar_type *sm = (scalar_type*) &Vm;
        for(int l=0; l<Nsimd; l++){
          if(switcheroo<Coeff_t>::iscomplex()) {
            sp[l] = PplusMat (l*istride+s1*ostride,s2);
            sm[l] = PminusMat(l*istride+s1*ostride,s2);
          } else {
            // if real
            scalar_type tmp;
            tmp = PplusMat (l*istride+s1*ostride,s2);
            sp[l] = scalar_type(tmp.real(),tmp.real());
            tmp = PminusMat(l*istride+s1*ostride,s2);
            sm[l] = scalar_type(tmp.real(),tmp.real());
          }
        }
        Matp[LLs*s2+s1] = Vp;
        Matm[LLs*s2+s1] = Vm;
      }}
  }

  FermOpTemplateInstantiate(MobiusEOFAFermion);
  GparityFermOpTemplateInstantiate(MobiusEOFAFermion);

}}
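SetCoefficientsPrecondShiftOps() above combines a tridiagonal (Thomas) elimination with the Sherman-Morrison correction for the single corner entry of Mooee_{+}. As a purely illustrative, self-contained sketch of that numerical pattern, in plain doubles and independent of any Grid types:

    #include <vector>
    #include <cstddef>

    // Thomas algorithm: solve T x = r for tridiagonal T with sub-diagonal a,
    // diagonal b and super-diagonal c (inputs copied, caller's data untouched).
    static std::vector<double> thomas(std::vector<double> a, std::vector<double> b,
                                      std::vector<double> c, std::vector<double> r)
    {
      const std::size_t n = b.size();
      for(std::size_t i=1; i<n; ++i){          // forward elimination
        double m = a[i] / b[i-1];
        b[i] -= m * c[i-1];
        r[i] -= m * r[i-1];
      }
      std::vector<double> x(n);
      x[n-1] = r[n-1] / b[n-1];
      for(std::size_t i=n-1; i-- > 0; ){       // back substitution
        x[i] = (r[i] - c[i]*x[i+1]) / b[i];
      }
      return x;
    }

    // Sherman-Morrison: solve (T + u v^T) x = r via two tridiagonal solves,
    //   T y = r,  T q = u,  then  x = y - (v.y)/(1 + v.q) * q
    static std::vector<double> rank1_corrected_solve(const std::vector<double>& a,
        const std::vector<double>& b, const std::vector<double>& c,
        const std::vector<double>& u, const std::vector<double>& v,
        const std::vector<double>& r)
    {
      std::vector<double> y = thomas(a,b,c,r);
      std::vector<double> q = thomas(a,b,c,u);
      double vy = 0.0, vq = 0.0;
      for(std::size_t i=0; i<v.size(); ++i){ vy += v[i]*y[i]; vq += v[i]*q[i]; }
      const double lambda = vy / (1.0 + vq);
      for(std::size_t i=0; i<y.size(); ++i) y[i] -= lambda * q[i];
      return y;
    }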
							
								
								
									
133  lib/qcd/action/fermion/MobiusEOFAFermion.h  Normal file
@@ -0,0 +1,133 @@
/*************************************************************************************

Grid physics library, www.github.com/paboyle/Grid

Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.h

Copyright (C) 2017

Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#ifndef  GRID_QCD_MOBIUS_EOFA_FERMION_H
#define  GRID_QCD_MOBIUS_EOFA_FERMION_H

#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>

namespace Grid {
namespace QCD {

  template<class Impl>
  class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
  {
    public:
      INHERIT_IMPL_TYPES(Impl);

    public:
      // Shift operator coefficients for red-black preconditioned Mobius EOFA
      std::vector<Coeff_t> Mooee_shift;
      std::vector<Coeff_t> MooeeInv_shift_lc;
      std::vector<Coeff_t> MooeeInv_shift_norm;
      std::vector<Coeff_t> MooeeInvDag_shift_lc;
      std::vector<Coeff_t> MooeeInvDag_shift_norm;

      virtual void Instantiatable(void) {};

      // EOFA-specific operations
      virtual void  Omega            (const FermionField& in, FermionField& out, int sign, int dag);
      virtual void  Dtilde           (const FermionField& in, FermionField& out);
      virtual void  DtildeInv        (const FermionField& in, FermionField& out);

      // override multiply
      virtual RealD M                (const FermionField& in, FermionField& out);
      virtual RealD Mdag             (const FermionField& in, FermionField& out);

      // half checkerboard operations
      virtual void  Mooee            (const FermionField& in, FermionField& out);
      virtual void  MooeeDag         (const FermionField& in, FermionField& out);
      virtual void  MooeeInv         (const FermionField& in, FermionField& out);
      virtual void  MooeeInv_shift   (const FermionField& in, FermionField& out);
      virtual void  MooeeInvDag      (const FermionField& in, FermionField& out);
      virtual void  MooeeInvDag_shift(const FermionField& in, FermionField& out);

      virtual void   M5D             (const FermionField& psi, FermionField& chi);
      virtual void   M5Ddag          (const FermionField& psi, FermionField& chi);

      /////////////////////////////////////////////////////
      // Instantiate different versions depending on Impl
      /////////////////////////////////////////////////////
      void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

      void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
        std::vector<Coeff_t>& shift_coeffs);

      void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);

      void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
        std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
        std::vector<Coeff_t>& shift_coeffs);

      void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);

      void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);

      void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);

      void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
        Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);

      virtual void RefreshShiftCoefficients(RealD new_shift);

      // Constructors
      MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
        GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
        RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
        RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());

    protected:
      void SetCoefficientsPrecondShiftOps(void);
  };
}}

#define INSTANTIATE_DPERP_MOBIUS_EOFA(A)\
template void MobiusEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void MobiusEOFAFermion<A>::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
template void MobiusEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void MobiusEOFAFermion<A>::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
template void MobiusEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
template void MobiusEOFAFermion<A>::MooeeInv_shift(const FermionField& psi, FermionField& chi); \
template void MobiusEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi); \
template void MobiusEOFAFermion<A>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi);

#undef  MOBIUS_EOFA_DPERP_DENSE
#define MOBIUS_EOFA_DPERP_CACHE
#undef  MOBIUS_EOFA_DPERP_LINALG
#define MOBIUS_EOFA_DPERP_VEC

#endif
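INSTANTIATE_DPERP_MOBIUS_EOFA collects the per-Impl explicit instantiations of the s-direction kernels declared above. The invocation lines themselves are not shown in this excerpt; presumably each implementation file ends with one invocation per enabled implementation, along the lines of:

    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);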
							
								
								
									
429  lib/qcd/action/fermion/MobiusEOFAFermioncache.cc  Normal file
@@ -0,0 +1,429 @@
/*************************************************************************************
 | 
			
		||||
 | 
			
		||||
Grid physics library, www.github.com/paboyle/Grid
 | 
			
		||||
 | 
			
		||||
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
 | 
			
		||||
 | 
			
		||||
Copyright (C) 2017
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
 | 
			
		||||
Author: paboyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
Author: David Murphy <dmurphy@phys.columbia.edu>
 | 
			
		||||
 | 
			
		||||
This program is free software; you can redistribute it and/or modify
 | 
			
		||||
it under the terms of the GNU General Public License as published by
 | 
			
		||||
the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
(at your option) any later version.
 | 
			
		||||
 | 
			
		||||
This program is distributed in the hope that it will be useful,
 | 
			
		||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
You should have received a copy of the GNU General Public License along
 | 
			
		||||
with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
*************************************************************************************/
 | 
			
		||||
/*  END LEGAL */
 | 
			
		||||
 | 
			
		||||
#include <Grid/qcd/action/fermion/FermionCore.h>
 | 
			
		||||
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
namespace QCD {
 | 
			
		||||
 | 
			
		||||
  // FIXME -- make a version of these routines with site loop outermost for cache reuse.
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi,
 | 
			
		||||
    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 | 
			
		||||
  {
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
 | 
			
		||||
    assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
    this->M5Dcalls++;
 | 
			
		||||
    this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        auto tmp = psi._odata[0];
 | 
			
		||||
        if(s==0){
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else if(s==(Ls-1)) {
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+0]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else {
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->M5Dtime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
 | 
			
		||||
    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
 | 
			
		||||
    std::vector<Coeff_t> &shift_coeffs)
 | 
			
		||||
  {
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
 | 
			
		||||
    assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
    this->M5Dcalls++;
 | 
			
		||||
    this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        auto tmp = psi._odata[0];
 | 
			
		||||
        if(s==0){
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else if(s==(Ls-1)) {
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+0]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else {
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        }
 | 
			
		||||
        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+shift_s]); }
 | 
			
		||||
        else{ spProj5m(tmp, psi._odata[ss+shift_s]); }
 | 
			
		||||
        chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->M5Dtime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
 | 
			
		||||
  void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi,
 | 
			
		||||
    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
 | 
			
		||||
  {
 | 
			
		||||
    int Ls = this->Ls;
 | 
			
		||||
    GridBase *grid = psi._grid;
 | 
			
		||||
 | 
			
		||||
    assert(phi.checkerboard == psi.checkerboard);
 | 
			
		||||
    chi.checkerboard = psi.checkerboard;
 | 
			
		||||
 | 
			
		||||
    // Flops = 6.0*(Nc*Ns) *Ls*vol
 | 
			
		||||
    this->M5Dcalls++;
 | 
			
		||||
    this->M5Dtime -= usecond();
 | 
			
		||||
 | 
			
		||||
    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
 | 
			
		||||
      auto tmp = psi._odata[0];
 | 
			
		||||
      for(int s=0; s<Ls; s++){
 | 
			
		||||
        if(s==0) {
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+Ls-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else if(s==(Ls-1)) {
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+0]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        } else {
 | 
			
		||||
          spProj5p(tmp, psi._odata[ss+s+1]);
 | 
			
		||||
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
 | 
			
		||||
          spProj5m(tmp, psi._odata[ss+s-1]);
 | 
			
		||||
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    this->M5Dtime += usecond();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  template<class Impl>
  void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
    std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
    std::vector<Coeff_t> &shift_coeffs)
  {
    int Ls = this->Ls;
    int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
    GridBase *grid = psi._grid;

    assert(phi.checkerboard == psi.checkerboard);
    chi.checkerboard = psi.checkerboard;

    // Flops = 6.0*(Nc*Ns) *Ls*vol
    this->M5Dcalls++;
    this->M5Dtime -= usecond();

    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
      chi[ss+Ls-1] = zero;
      auto tmp = psi._odata[0];
      for(int s=0; s<Ls; s++){
        if(s==0) {
          spProj5p(tmp, psi._odata[ss+s+1]);
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
          spProj5m(tmp, psi._odata[ss+Ls-1]);
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
        } else if(s==(Ls-1)) {
          spProj5p(tmp, psi._odata[ss+0]);
          chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
          spProj5m(tmp, psi._odata[ss+s-1]);
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
        } else {
          spProj5p(tmp, psi._odata[ss+s+1]);
          chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
          spProj5m(tmp, psi._odata[ss+s-1]);
          chi[ss+s] = chi[ss+s] + lower[s]*tmp;
        }
        if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+s]); }
        else{ spProj5m(tmp, psi._odata[ss+s]); }
        chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
      }
    }

    this->M5Dtime += usecond();
  }

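  // MooeeInv: inverts the s-tridiagonal Mooee block using the LDU coefficients
  // precomputed elsewhere in this class (lee/leem, dee, uee/ueem). Reading the loops
  // below off directly, the per-site solve is
  //   chi[0]     = psi[0]
  //   chi[s]     = psi[s] - lee[s-1] P_+ chi[s-1]                              (s = 1..Ls-1)
  //   chi[Ls-1] -= leem[s] P_- chi[s]                                          (s = 0..Ls-2)
  //   chi[s]     = (1/dee[s]) chi[s] - (ueem[s]/dee[Ls-1]) P_+ chi[Ls-1]       (s = 0..Ls-2)
  //   chi[Ls-1]  = (1/dee[Ls-1]) chi[Ls-1]
  //   chi[s]    -= uee[s] P_- chi[s+1]                                         (s = Ls-2..0)
  // i.e. forward substitution with (L')^{-1} and L_m^{-1}, a diagonal/U_m^{-1} step,
  // and back substitution with U^{-1}. A nonzero EOFA shift forwards to MooeeInv_shift.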
  template<class Impl>
  void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
  {
    if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }

    GridBase *grid = psi._grid;
    int Ls = this->Ls;

    chi.checkerboard = psi.checkerboard;

    this->MooeeInvCalls++;
    this->MooeeInvTime -= usecond();

    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){

      auto tmp = psi._odata[0];

      // Apply (L^{\prime})^{-1}
      chi[ss] = psi[ss]; // chi[0]=psi[0]
      for(int s=1; s<Ls; s++){
        spProj5p(tmp, chi[ss+s-1]);
        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
      }

      // L_m^{-1}
      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
        spProj5m(tmp, chi[ss+s]);
        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
      }

      // U_m^{-1} D^{-1}
      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
        spProj5p(tmp, chi[ss+Ls-1]);
        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
      }
      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];

      // Apply U^{-1}
      for(int s=Ls-2; s>=0; s--){
        spProj5m(tmp, chi[ss+s+1]);
        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
      }
    }

    this->MooeeInvTime += usecond();
  }

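  // MooeeInv_shift: as MooeeInv, plus the correction for the EOFA shift term. The
  // forward sweep also accumulates tmp2 = sum_j MooeeInv_shift_lc[j]*psi[j]; this is
  // chirally projected (P_+ for pm == +1, P_- for pm == -1) into tmp2_spProj, and
  // MooeeInv_shift_norm[s] times the projected vector is added to every s slice of
  // the result.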
  template<class Impl>
  void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionField &chi)
  {
    GridBase *grid = psi._grid;
    int Ls = this->Ls;

    chi.checkerboard = psi.checkerboard;

    this->MooeeInvCalls++;
    this->MooeeInvTime -= usecond();

    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){

      auto tmp1        = psi._odata[0];
      auto tmp2        = psi._odata[0];
      auto tmp2_spProj = psi._odata[0];

      // Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
      chi[ss] = psi[ss]; // chi[0]=psi[0]
      tmp2 = MooeeInv_shift_lc[0]*psi[ss];
      for(int s=1; s<Ls; s++){
        spProj5p(tmp1, chi[ss+s-1]);
        chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
        tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
      }
      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
      else{ spProj5m(tmp2_spProj, tmp2); }

      // L_m^{-1}
      for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
        spProj5m(tmp1, chi[ss+s]);
        chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
      }

      // U_m^{-1} D^{-1}
      for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
        spProj5p(tmp1, chi[ss+Ls-1]);
        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
      }
      // chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
      spProj5m(tmp1, chi[ss+Ls-1]);
      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;

      // Apply U^{-1} and add shift term
      for(int s=Ls-2; s>=0; s--){
        chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
        spProj5m(tmp1, chi[ss+s]);
        chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
      }
    }

    this->MooeeInvTime += usecond();
  }

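  // MooeeInvDag: adjoint of MooeeInv, using the same LDU coefficients but sweeping
  // through the daggered factors: (U')^{-dag} forward substitution, U_m^{-dag}, the
  // diagonal combined with L_m^{-dag}, and an L^{-dag} back substitution, with the
  // chiral projectors interchanged relative to MooeeInv. A nonzero shift forwards
  // to MooeeInvDag_shift.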
  template<class Impl>
  void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
  {
    if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }

    GridBase *grid = psi._grid;
    int Ls = this->Ls;

    chi.checkerboard = psi.checkerboard;

    this->MooeeInvCalls++;
    this->MooeeInvTime -= usecond();

    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){

      auto tmp = psi._odata[0];

      // Apply (U^{\prime})^{-dag}
      chi[ss] = psi[ss];
      for(int s=1; s<Ls; s++){
        spProj5m(tmp, chi[ss+s-1]);
        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
      }

      // U_m^{-\dag}
      for(int s=0; s<Ls-1; s++){
        spProj5p(tmp, chi[ss+s]);
        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
      }

      // L_m^{-\dag} D^{-dag}
      for(int s=0; s<Ls-1; s++){
        spProj5m(tmp, chi[ss+Ls-1]);
        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
      }
      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];

      // Apply L^{-dag}
      for(int s=Ls-2; s>=0; s--){
        spProj5p(tmp, chi[ss+s+1]);
        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
      }
    }

    this->MooeeInvTime += usecond();
  }

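  // MooeeInvDag_shift: the daggered solve with its shift correction, built from the
  // MooeeInvDag_shift_lc and MooeeInvDag_shift_norm coefficient vectors in the same
  // manner as MooeeInv_shift above.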
  template<class Impl>
  void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, FermionField &chi)
  {
    GridBase *grid = psi._grid;
    int Ls = this->Ls;

    chi.checkerboard = psi.checkerboard;

    this->MooeeInvCalls++;
    this->MooeeInvTime -= usecond();

    parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){

      auto tmp1        = psi._odata[0];
      auto tmp2        = psi._odata[0];
      auto tmp2_spProj = psi._odata[0];

      // Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
      chi[ss] = psi[ss];
      tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
      for(int s=1; s<Ls; s++){
        spProj5m(tmp1, chi[ss+s-1]);
        chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
        tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
      }
      if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
      else{ spProj5m(tmp2_spProj, tmp2); }

      // U_m^{-\dag}
      for(int s=0; s<Ls-1; s++){
        spProj5p(tmp1, chi[ss+s]);
        chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
      }

      // L_m^{-\dag} D^{-dag}
      for(int s=0; s<Ls-1; s++){
        spProj5m(tmp1, chi[ss+Ls-1]);
        chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
      }
      chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
      spProj5p(tmp1, chi[ss+Ls-1]);
      chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;

      // Apply L^{-dag}
      for(int s=Ls-2; s>=0; s--){
        chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
        spProj5p(tmp1, chi[ss+s]);
        chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
      }
    }

    this->MooeeInvTime += usecond();
  }

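  // Explicit template instantiations of these cache-optimised kernels, emitted only
  // when MOBIUS_EOFA_DPERP_CACHE is defined, covering the single- and double-precision
  // Wilson, G-parity Wilson and Z-Wilson implementations together with the FH/DF
  // mixed-precision variants.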
  #ifdef MOBIUS_EOFA_DPERP_CACHE

    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);

    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
    INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
    INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
    INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);

  #endif

}}