Mirror of https://github.com/paboyle/Grid.git, synced 2025-10-31 03:54:33 +00:00

Compare commits: feature/la...feature/mu (183 commits)
The 183 commits in this comparison, by abbreviated SHA1:

ad89abb018, 80c5bce5bb, f68b5de9c8, d0f3d525d5, 3a58217405, c289699d9a, c3b1263e75, 102ea9ae66, 5fa386ddc9, d9cd4f0273,
b49bec0cec, ae56e556c6, 1cdf999668, 11062fb686, 383ca7d392, a446d95c33, be66e7dd95, 6d0d064a6c, bfef525ed2, 0b0cf62193,
7d88198387, 2f619482b8, d6472eda8d, 9e658de238, bcefdd7c4e, fd367d8bfd, 8a3fe60a27, 44051aecd1, 06e6f8de00, dbe4d7850c,
4fe182e5a7, 175f393f9d, 14d53e1c9e, 8bd869da37, c7036f6717, c0485d799d, 7abc5613bd, 237cfd11ab, a4b7dddb67, 5696781862,
c3f0889eda, 0f214ad427, fe4912880d, f038c6babe, 169f4b2711, 2d8aff36fe, 659d7d1a40, dc6f078246, 8a4714a4a6, 40e119c61c,
7b0237b081, b68ad0cc0b, 37263fd9b1, 3d09e3e9e0, 1354b46338, 251a97fe1b, e18929eaa0, f3b0a92e71, a0be3f7330, b5a6e4f1fd,
7a788db3dc, f20eceb6cd, 38325ebbc6, b73bd151bb, 694b305cab, 2d3737a133, ac1f1838bc, 09d09d0fe5, bf630a6821, 8859a151cc,
688a39cfd9, 6f5a5cd9b3, 0933aeefd4, 322f61acee, 08e04b9676, feaa2ac947, 07de925127, a9c816a268, e43a8b6b8a, bf729766dd,
dafb351d38, 0b707b861c, 15e87a4607, 7d7220cbd7, 54e94360ad, 0af740dc15, d2e8372df3, 869b99ec1e, 4372d04ad4, 56abbdf4c2,
af71c63f4c, 0440d4ce66, b22eab8c8b, a7d56523ab, 1e8a2e1621, 7587df831a, 81b18f843a, a833f88c32, 07b2c1b253, 735cbdb983,
2ad54c5a02, 3d04dc33c6, 2490816297, 5f55bca378, f6aa82b7f2, 22749699a3, 0503c028be, 22f4feee7b, 3f858d6755, 35fa3d1dfd,
c4435e6beb, d1ece74137, 43c817cc67, 51bf1501fc, 741bc836f6, 8546d01a4c, 1407418755, a6a0da873f, 7b03d8d087, 4b759b8f2a,
038b6ee9cd, 38806343a8, 831ca4e3bf, eedcaf6470, b39f0d1fb6, 9f1267dfe6, 2e90285232, e254de982e, 28d99b5297, ee93f0218b,
161ed102a5, f65a585236, ae99e99da2, f3ca29af6c, 37988221a8, 7a327a3f28, 92f8950a56, 65987a8a58, 889d828bc2, ad98b6193d,
fc760016b3, 2da86f7dae, 97843e2b58, 82b3f54697, 673994b281, bbc0eff078, 4c60e31070, afbf7d4c37, 8c3cc32364, 4c3fd9fa3f,
17b3a10d46, 149a46b92c, db9c28a773, 9ac3ac41df, 2af9ab9034, 6f1ea96293, 2e3c5890b6, bc6678732f, b10ae00c8a, 6ad73145bc,
f7293f2ddb, 6b8ee7bae0, 739c2308b5, a71b69389b, d49e502f53, 92ec3404f8, f4ebea3381, cf167d0cd1, c363bdd784, c30d96ea50,
7ffe17ada1, 330a9b3f4c, 28ff66a381, 78c7bcee36, 00a7b95631, 94d8321d01, ac24cc9f99, 3ab4c8c0bb, 26d124283e, 0d889b7041,
ab31ad006a, 6e4a06e180, 446c768cd3
							
								
								
									
### .travis.yml (68 lines changed)
@@ -9,68 +9,6 @@ matrix:
     - os:        osx
       osx_image: xcode8.3
       compiler: clang
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.9
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-4.9
-    - compiler: gcc
-      dist: trusty
-      sudo: required
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-5
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: VERSION=-5
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
-    - compiler: clang
-      dist: trusty
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - g++-4.8
-            - libmpfr-dev
-            - libgmp-dev
-            - libmpc-dev
-            - libopenmpi-dev
-            - openmpi-bin
-            - binutils-dev
-      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 
 before_install:
     - export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
     - make -j4
     - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
     - make check
-    - echo make clean
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
-    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
-
-
							
								
								
									
### README.md (279 lines changed)
@@ -1,27 +1,44 @@
-# Grid
-<table>
-<tr>
-    <td>Last stable release</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
-    </td>
-</tr>
-<tr>
-    <td>Development branch</td>
-    <td><a href="https://travis-ci.org/paboyle/Grid">
-    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
-    </td>
-</tr>
-</table>
+# Grid [Teamcity status](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [Travis status](https://travis-ci.org/paboyle/Grid)
 
 **Data parallel C++ mathematical object library.**
 
 License: GPL v2.
 
-Last update Nov 2016.
+Last update June 2017.
 
 _Please do not send pull requests to the `master` branch which is reserved for releases._
 
+### Description
+This library provides data parallel C++ container classes with internal memory layout
+that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
+are provided, similar to HPF and cmfortran, and user control is given over the mapping of
+array indices to both MPI tasks and SIMD processing elements.
+
+* Identically shaped arrays can then be processed with perfect data parallelisation.
+* Such identically shaped arrays are called conformable arrays.
+
+The transformation is based on the observation that Cartesian array processing involves
+identical processing to be performed on different regions of the Cartesian array.
+
+The library will both geometrically decompose into MPI tasks and across SIMD lanes.
+Local vector loops are parallelised with OpenMP pragmas.
+
+Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
+optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
+for most programmers.
+
+The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
+Presently SSE4, ARM NEON (128 bits), AVX, AVX2, QPX (256 bits), IMCI and AVX512 (512 bits) targets are supported.
+
+These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types.
+The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
+
+MPI, OpenMP, and SIMD parallelism are present in the library.
+Please see [this paper](https://arxiv.org/abs/1512.03487) for more detail.
+
 ### Compilers
 
 Intel ICPC v16.0.3 and later
@@ -56,35 +73,25 @@ When you file an issue, please go though the following checklist:
 6. Attach the output of `make V=1`.
 7. Describe the issue and any previous attempt to solve it. If relevant, show how to reproduce the issue using a minimal working example.
 
-### Description
-This library provides data parallel C++ container classes with internal memory layout
-that is transformed to map efficiently to SIMD architectures. CSHIFT facilities
-are provided, similar to HPF and cmfortran, and user control is given over the mapping of
-array indices to both MPI tasks and SIMD processing elements.
-
-* Identically shaped arrays then be processed with perfect data parallelisation.
-* Such identically shaped arrays are called conformable arrays.
-
-The transformation is based on the observation that Cartesian array processing involves
-identical processing to be performed on different regions of the Cartesian array.
-
-The library will both geometrically decompose into MPI tasks and across SIMD lanes.
-Local vector loops are parallelised with OpenMP pragmas.
-
-Data parallel array operations can then be specified with a SINGLE data parallel paradigm, but
-optimally use MPI, OpenMP and SIMD parallelism under the hood. This is a significant simplification
-for most programmers.
-
-The layout transformations are parametrised by the SIMD vector length. This adapts according to the architecture.
-Presently SSE4 (128 bit) AVX, AVX2, QPX (256 bit), IMCI, and AVX512 (512 bit) targets are supported (ARM NEON on the way).
-
-These are presented as `vRealF`, `vRealD`, `vComplexF`, and `vComplexD` internal vector data types. These may be useful in themselves for other programmers.
-The corresponding scalar types are named `RealF`, `RealD`, `ComplexF` and `ComplexD`.
-
-MPI, OpenMP, and SIMD parallelism are present in the library.
-Please see https://arxiv.org/abs/1512.03487 for more detail.
-
+### Required libraries
+Grid requires:
+
+[GMP](https://gmplib.org/),
+
+[MPFR](http://www.mpfr.org/)
+
+Bootstrapping Grid downloads the Eigen library and uses it for internal dense matrix (non-QCD) operations.
+
+Grid optionally uses:
+
+[HDF5](https://support.hdfgroup.org/HDF5/)
+
+[LIME](http://usqcd-software.github.io/c-lime/) for ILDG and SciDAC file format support.
+
+[FFTW](http://www.fftw.org) either generic version or via the Intel MKL library.
+
+LAPACK either generic version or Intel MKL library.
+
 ### Quick start
 First, start by cloning the repository:
@@ -155,7 +162,6 @@ The following options can be use with the `--enable-comms=` option to target dif
 | `none`         | no communications                                             |
 | `mpi[-auto]`   | MPI communications                                            |
 | `mpi3[-auto]`  | MPI communications using MPI 3 shared memory                  |
-| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem `       | Cray SHMEM communications                                     |
 
 For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the information from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan through a list of default names). The `-auto` suffix is not supported by the Cray environment wrapper scripts. Use the standard versions instead.
@@ -173,7 +179,8 @@ The following options can be use with the `--enable-simd=` option to target diff
 | `AVXFMA4`   | AVX (256 bit) + FMA4                   |
 | `AVX2`      | AVX 2 (256 bit)                        |
 | `AVX512`    | AVX 512 bit                            |
-| `QPX`       | QPX (256 bit)                          |
+| `NEONv8`    | [ARM NEON](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.den0024a/ch07s03.html) (128 bit) |
+| `QPX`       | IBM QPX (256 bit)                      |
 
 Alternatively, some CPU codenames can be directly used:
 
@@ -195,21 +202,205 @@ The following configuration is recommended for the Intel Knights Landing platfor
 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=KNL        \
-             --enable-comms=mpi-auto \
-             --with-gmp=<path>        \
-             --with-mpfr=<path>       \
+             --enable-comms=mpi-auto  \
              --enable-mkl             \
              CXX=icpc MPICXX=mpiicpc
 ```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
 
-where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
 
 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=KNL        \
              --enable-comms=mpi       \
-             --with-gmp=<path>        \
-             --with-mpfr=<path>       \
              --enable-mkl             \
              CXX=CC CC=cc
 ```
+
+If GMP and MPFR are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Knights Landing with Intel Omnipath adapters, with two adapters per node,
+presently performs better with more than one rank per node, using shared memory
+for intra-node communication. This is the mpi3 communications implementation.
+We recommend four ranks per node for best performance, but the optimum is local volume dependent.
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=KNL        \
+             --enable-comms=mpi3-auto \
+             --enable-mkl             \
+             CXX=icpc MPICXX=mpiicpc
+```
+
+### Build setup for Intel Haswell Xeon platform
+
+The following configuration is recommended for the Intel Haswell platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3-auto \
+             --enable-mkl             \
+             CXX=icpc MPICXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If GMP and MPFR are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+```
+Since dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+```
+        export I_MPI_PIN=1
+```
+This is the default.
+
+### Build setup for Intel Skylake Xeon platform
+
+The following configuration is recommended for the Intel Skylake platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=mpiicpc
+```
+The MKL flag enables use of BLAS and FFTW from the Intel Math Kernels Library.
+
+If GMP and MPFR are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+If you are working on a Cray machine that does not use the `mpiicpc` wrapper, please use:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX512     \
+             --enable-comms=mpi3      \
+             --enable-mkl             \
+             CXX=CC CC=cc
+```
+Since dual socket nodes are commonplace, we recommend MPI-3 as the default with the use of
+one rank per socket. If using the Intel MPI library, threads should be pinned to NUMA domains using
+```
+        export I_MPI_PIN=1
+```
+This is the default.
+
+#### Expected Skylake Gold 6148 dual socket (single prec, single node 20+20 cores) performance using NUMA MPI mapping:
+
+mpirun -n 2 benchmarks/Benchmark_dwf --grid 16.16.16.16 --mpi 2.1.1.1 --cacheblocking 2.2.2.2 --dslash-asm --shm 1024 --threads 18
+
+TBA
+
+### Build setup for AMD EPYC / RYZEN
+
+The AMD EPYC is a multichip module comprising 32 cores spread over four distinct chips, each with 8 cores.
+So, even with a single socket node there is a quad-chip module. Dual socket nodes with 64 cores total
+are common. Each chip within the module exposes a separate NUMA domain.
+There are four NUMA domains per socket and we recommend one MPI rank per NUMA domain.
+MPI-3 is recommended with the use of four ranks per socket,
+and 8 threads per rank.
+
+The following configuration is recommended for the AMD EPYC platform:
+
+``` bash
+../configure --enable-precision=double\
+             --enable-simd=AVX2       \
+             --enable-comms=mpi3      \
+             CXX=mpicxx
+```
+
+If GMP and MPFR are NOT in standard places (/usr/) these flags may be needed:
+``` bash
+               --with-gmp=<path>        \
+               --with-mpfr=<path>       \
+```
+where `<path>` is the UNIX prefix where GMP and MPFR are installed.
+
+Using MPICH and g++ v4.9.2, best performance can be obtained using explicit GOMP_CPU_AFFINITY flags for each MPI rank.
+This can be done by invoking MPI on a wrapper script, omp_bind.sh, to handle this.
+
+It is recommended to run 8 MPI ranks on a single dual socket AMD EPYC, with 8 threads per rank, using MPI3 and
+shared memory to communicate within this node:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --mpi 2.2.2.1 --dslash-unroll --threads 8 --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+where omp_bind.sh does the following:
+```
+#!/bin/bash
+
+numanode=`expr $PMI_RANK % 8`
+basecore=`expr $numanode \* 16`
+core0=`expr $basecore + 0`
+core1=`expr $basecore + 2`
+core2=`expr $basecore + 4`
+core3=`expr $basecore + 6`
+core4=`expr $basecore + 8`
+core5=`expr $basecore + 10`
+core6=`expr $basecore + 12`
+core7=`expr $basecore + 14`
+
+export GOMP_CPU_AFFINITY="$core0 $core1 $core2 $core3 $core4 $core5 $core6 $core7"
+echo GOMP_CPU_AFFINITY $GOMP_CPU_AFFINITY
+
+"$@"
+```
+
+Performance:
+
+#### Expected AMD EPYC 7601 dual socket (single prec, single node 32+32 cores) performance using NUMA MPI mapping:
+
+mpirun -np 8 ./omp_bind.sh ./Benchmark_dwf --threads 8 --mpi 2.2.2.1 --dslash-unroll --grid 16.16.16.16 --cacheblocking 4.4.4.4
+
+TBA
+
+### Build setup for BlueGene/Q
+
+To be written...
+
+### Build setup for ARM Neon
+
+To be written...
+
+### Build setup for laptops, other compilers, non-cluster builds
+
+Many versions of g++ and clang++ work with Grid: simply replace CXX (and MPICXX) accordingly,
+and omit the --enable-mkl flag.
+
+Single node builds are enabled with
+```
+            --enable-comms=none
+```
+
+FFTW support that is not in the default search path may then be enabled with
+```
+    --with-fftw=<installpath>
+```
+
+BLAS will not be compiled in by default, and Lanczos will default to Eigen diagonalisation.
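The Description above promises that a single data parallel expression uses MPI, OpenMP and SIMD parallelism under the hood. Below is a minimal sketch of what that looks like in user code, assembled from the API calls that appear in benchmarks/Benchmark_ITT.cc later in this compare (grid construction, `GridParallelRNG`, conformable lattice arithmetic); `Grid_init`, `Grid_finalize` and `LatticeComplex` are assumed from Grid's standard usage rather than taken from this diff, so treat it as an illustration, not a verbatim Grid example:

``` c++
#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int main(int argc, char **argv) {
  Grid_init(&argc, &argv);

  // Decomposition: the global lattice is split over MPI ranks and SIMD lanes.
  std::vector<int> latt_size   = GridDefaultLatt();
  std::vector<int> simd_layout = GridDefaultSimd(Nd, vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  GridCartesian    Grid(latt_size, simd_layout, mpi_layout);

  GridParallelRNG RNG(&Grid);
  RNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));

  // x, y, z share one grid, so they are conformable arrays.
  LatticeComplex x(&Grid), y(&Grid), z(&Grid);
  random(RNG, x);
  random(RNG, y);

  ComplexD a(2.0, 0.0);
  z = a * x + y; // one data parallel expression; MPI+OpenMP+SIMD underneath

  Grid_finalize();
  return 0;
}
```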
							
								
								
									
### TODO (16 lines changed)
@@ -2,18 +2,20 @@ TODO:
 ---------------
 
 Large item work list:
-1)- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
 
+1)- BG/Q port and check
 2)- Christoph's local basis expansion Lanczos
-3)- BG/Q port and check
-4)- Precision conversion and sort out localConvert      <-- partial
+3)- Precision conversion and sort out localConvert      <-- partial
   - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
-5)- Physical propagator interface
-6)- Conserved currents
-7)- Multigrid Wilson and DWF, compare to other Multigrid implementations
-8)- HDCR resume
+4)- Physical propagator interface
+5)- Conserved currents
+6)- Multigrid Wilson and DWF, compare to other Multigrid implementations
+7)- HDCR resume
 
 Recent DONE
 
+-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O.  <--- DONE
 -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location                      <-- DONE
 -- Scidac and Ildg metadata handling                   <-- DONE
							
								
								
									
### benchmarks/Benchmark_ITT.cc (new file, 775 lines added)
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./benchmarks/Benchmark_ITT.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;


std::vector<int> L_list;
std::vector<int> Ls_list;
std::vector<double> mflop_list;

double mflop_ref;
double mflop_ref_err;

int NN_global;

struct time_statistics{
  double mean;
  double err;
  double min;
  double max;

  void statistics(std::vector<double> v){
      double sum = std::accumulate(v.begin(), v.end(), 0.0);
      mean = sum / v.size();

      std::vector<double> diff(v.size());
      std::transform(v.begin(), v.end(), diff.begin(), [=](double x) { return x - mean; });
      double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
      err = std::sqrt(sq_sum / (v.size()*(v.size() - 1)));

      auto result = std::minmax_element(v.begin(), v.end());
      min = *result.first;
      max = *result.second;
  }
};
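// Note on the statistics above: 'err' is the standard error of the mean,
// sqrt of (unbiased variance / N), i.e. sq_sum/(N*(N-1)); 'min' and 'max'
// record the fastest and slowest single sample, and the report lines below
// quote them alongside the mean.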

void comms_header(){
  std::cout <<GridLogMessage << " L  "<<"\t"<<" Ls  "<<"\t"
            <<std::setw(11)<<"bytes"<<"MB/s uni (err/min/max)"<<"\t\t"<<"MB/s bidi (err/min/max)"<<std::endl;
};

Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,
  Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ,
  Gamma::Algebra::GammaT
};
struct controls {
  int Opt;
  int CommsOverlap;
  Grid::CartesianCommunicator::CommunicatorPolicy_t CommsAsynch;
  //  int HugePages;
};

class Benchmark {
public:
  static void Decomposition (void ) {

    int threads = GridThread::GetThreads();
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Grid is setup to use "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvReal          : "<<sizeof(vReal )*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vReal::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplex       : "<<sizeof(vComplex )*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplex::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  }

  static void Comms(void)
  {
    int Nloop=200;
    int nmu=0;
    int maxlat=32;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

    std::vector<double> t_time(Nloop);
    time_statistics timestat;

    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
    std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
    comms_header();

    for(int lat=4;lat<=maxlat;lat+=4){
      for(int Ls=8;Ls<=8;Ls*=2){

        std::vector<int> latt_size  ({lat*mpi_layout[0],
              lat*mpi_layout[1],
              lat*mpi_layout[2],
              lat*mpi_layout[3]});

        GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
        RealD Nrank = Grid._Nprocessors;
        RealD Nnode = Grid.NodeCount();
        RealD ppn = Nrank/Nnode;

        std::vector<HalfSpinColourVectorD *> xbuf(8);
        std::vector<HalfSpinColourVectorD *> rbuf(8);
        Grid.ShmBufferFreeAll();
        for(int d=0;d<8;d++){
          xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
          bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
        }

        int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
        int ncomm;
        double dbytes;
        std::vector<double> times(Nloop);
        for(int i=0;i<Nloop;i++){

          double start=usecond();

          dbytes=0;
          ncomm=0;

          parallel_for(int dir=0;dir<8;dir++){

            double tbytes;
            int mu =dir % 4;

            if (mpi_layout[mu]>1 ) {

              int xmit_to_rank;
              int recv_from_rank;
              if ( dir == mu ) {
                int comm_proc=1;
                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
              } else {
                int comm_proc = mpi_layout[mu]-1;
                Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
              }
              tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
                                                 (void *)&rbuf[dir][0], recv_from_rank,
                                                 bytes,dir);

#ifdef GRID_OMP
#pragma omp atomic
#endif
              ncomm++;

#ifdef GRID_OMP
#pragma omp atomic
#endif
              dbytes+=tbytes;
            }
          }
          Grid.Barrier();
          double stop=usecond();
          t_time[i] = stop-start; // microseconds
        }

        timestat.statistics(t_time);
        //      for(int i=0;i<t_time.size();i++){
        //        std::cout << i<<" "<<t_time[i]<<std::endl;
        //      }

        dbytes=dbytes*ppn;
        double xbytes    = dbytes*0.5;
        double rbytes    = dbytes*0.5;
        double bidibytes = dbytes;

        std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
                 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
                 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
                 <<xbytes/timestat.max <<" "<< xbytes/timestat.min
                 << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
                 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;

      }
    }

    return;
  }
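  // Accounting note (interpretation of the code above, not part of the file):
  // dbytes sums the return values of StencilSendToRecvFrom over all exercised
  // directions, which count traffic both ways, so xbytes (= dbytes/2) is
  // quoted as the unidirectional rate and dbytes itself as bidirectional;
  // the ppn factor converts per-rank totals into per-node bandwidth before
  // dividing by the mean/min/max times from time_statistics.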

  static void Memory(void)
  {
    const int Nvec=8;
    typedef Lattice< iVector< vReal,Nvec> > LatticeVec;
    typedef iVector<vReal,Nvec> Vec;

    std::vector<int> simd_layout = GridDefaultSimd(Nd,vReal::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "= Benchmarking a*x + y bandwidth"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl;
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;

    uint64_t lmax=48;
#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)

    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=4){

      std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]});
      int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3];
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      Vec rn ; random(sRNG,rn);

      LatticeVec z(&Grid); z=rn;
      LatticeVec x(&Grid); x=rn;
      LatticeVec y(&Grid); y=rn;
      double a=2.0;

      uint64_t Nloop=NLOOP;

      double start=usecond();
      for(int i=0;i<Nloop;i++){
        z=a*x-y;
        x._odata[0]=z._odata[0]; // force serial dependency to prevent optimise away
        y._odata[4]=z._odata[4];
      }
      double stop=usecond();
      double time = (stop-start)/Nloop*1000;

      double flops=vol*Nvec*2;// mul,add
      double bytes=3.0*vol*Nvec*sizeof(Real);
      std::cout<<GridLogMessage<<std::setprecision(3)
               << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl;

    }
  };
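  // Bandwidth model for the streamed z = a*x - y kernel: three vectors move
  // per site (load x, load y, store z), hence bytes = 3*vol*Nvec*sizeof(Real);
  // each element costs one multiply and one add/subtract, hence
  // flops = vol*Nvec*2. NLOOP shrinks as the volume grows so every lattice
  // size runs for a comparable wall-clock time.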

  static double DWF5(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});

    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
                                                                       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);

    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    ///////// Lattice Init ////////////
    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);

    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);   RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    ///////// Source preparation ////////////
    LatticeFermion src   (sFGrid); random(RNG5,src);
    LatticeFermion tmp   (sFGrid);

    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;

    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);

    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
    LatticeFermion src_e (sFrbGrid);
    LatticeFermion src_o (sFrbGrid);
    LatticeFermion r_e   (sFrbGrid);
    LatticeFermion r_o   (sFrbGrid);
    LatticeFermion r_eo  (sFGrid);
    LatticeFermion err   (sFGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

#if defined(AVX512)
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
      controls Cases [] = {
#ifdef AVX512
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
#endif
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      };

      for(int c=0;c<num_cases;c++) {

        QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
        QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
        if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
        if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

        int nwarm = 100;
        double t0=usecond();
        sFGrid->Barrier();
        for(int i=0;i<nwarm;i++){
          sDw.DhopEO(src_o,r_e,DaggerNo);
        }
        sFGrid->Barrier();
        double t1=usecond();
        //      uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
        //      if (ncall < 500) ncall = 500;
        uint64_t ncall = 500;

        sFGrid->Broadcast(0,&ncall,sizeof(ncall));

        //      std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
        sDw.ZeroCounters();

        time_statistics timestat;
        std::vector<double> t_time(ncall);
        for(uint64_t i=0;i<ncall;i++){
          t0=usecond();
          sDw.DhopEO(src_o,r_e,DaggerNo);
          t1=usecond();
          t_time[i] = t1-t0;
        }
        sFGrid->Barrier();

        double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
        double flops=(1344.0*volume)/2;
        double mf_hi, mf_lo, mf_err;

        timestat.statistics(t_time);
        mf_hi = flops/timestat.min;
        mf_lo = flops/timestat.max;
        mf_err= flops/timestat.min * timestat.err/timestat.mean;

        mflops = flops/timestat.mean;
        mflops_all.push_back(mflops);
        if ( mflops_best == 0   ) mflops_best = mflops;
        if ( mflops_worst== 0   ) mflops_worst= mflops;
        if ( mflops>mflops_best ) mflops_best = mflops;
        if ( mflops<mflops_worst) mflops_worst= mflops;

        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;

        sDw.Report();

      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;

      for(int i=0;i<mflops_all.size();i++){
        std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
    return mflops_best;
  }
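  // Flop normalisation used above: 1344 flops per lattice site for the
  // domain wall dslash as counted here, halved because DhopEO updates a
  // single checkerboard; best/worst over the kernel variants (asm,
  // hand-unrolled, generic; sequential vs overlapped comms) gives the
  // Performance Robustness ratio printed at the end.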
|  |  | ||||||
  static double DWF(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;

    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;

    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});

    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}),
                                                                       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;

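    // SHM = MPI ranks per node; the intranode ranks are unfolded into a 4d
    // decomposition below (only power-of-two counts up to 8 are handled).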
    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);

    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
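    // Illustrative example: 16 ranks on 2 nodes gives SHM=8, internal={2,2,2,1};
    // with mpi={2,2,2,2} that leaves nodes={1,1,1,2}, so an L=24 local volume
    // becomes a 24x24x24x48 global lattice.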

    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    ///////// Lattice Init ////////////
    GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
    GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;

    ///////// Source preparation ////////////
    LatticeFermion src   (FGrid); random(RNG5,src);
    LatticeFermion ref   (FGrid);
    LatticeFermion tmp   (FGrid);

    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;
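    // Unit-norm source: the absolute 1.0e-4 tolerance in the correctness
    // check below is then effectively a relative one.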

    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu);

    DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

    ////////////////////////////////////
    // Naive wilson implementation
    ////////////////////////////////////
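    // Builds ref(x) = -1/2 sum_mu [ (1-gamma_mu) U_mu(x)      psi(x+mu)
    //                             + (1+gamma_mu) U_mu^+(x-mu) psi(x-mu) ]
    // s-slice by s-slice, replicating the 4d gauge field across Ls.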
    {
      LatticeGaugeField Umu5d(FGrid);
      std::vector<LatticeColourMatrix> U(4,FGrid);
      for(int ss=0;ss<Umu._grid->oSites();ss++){
        for(int s=0;s<Ls;s++){
          Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
        }
      }
      ref = zero;
      for(int mu=0;mu<Nd;mu++){
        U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
      }
      for(int mu=0;mu<Nd;mu++){

        tmp = U[mu]*Cshift(src,mu+1,1);
        ref=ref + tmp - Gamma(Gmu[mu])*tmp;

        tmp =adj(U[mu])*src;
        tmp =Cshift(tmp,mu+1,-1);
        ref=ref + tmp + Gamma(Gmu[mu])*tmp;
      }
      ref = -0.5*ref;
    }

    LatticeFermion src_e (FrbGrid);
    LatticeFermion src_o (FrbGrid);
    LatticeFermion r_e   (FrbGrid);
    LatticeFermion r_o   (FrbGrid);
    LatticeFermion r_eo  (FGrid);
    LatticeFermion err   (FGrid);
    {

      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);

#if defined(AVX512)
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
#else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
#endif
      controls Cases [] = {
#ifdef AVX512
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
#endif
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
        { QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      };
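      // Each case pairs a kernel variant with a comms schedule, matching the
      // fmt legend above: A=inline asm, U=hand unrolled, G=generic Nc;
      // S=comms then compute, O=overlapped comms/compute.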

      for(int c=0;c<num_cases;c++) {

        QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
        QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
        CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);

        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
        if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
        if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
        if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
        std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

        int nwarm = 200;
        double t0=usecond();
        FGrid->Barrier();
        for(int i=0;i<nwarm;i++){
          Dw.DhopEO(src_o,r_e,DaggerNo);
        }
        FGrid->Barrier();
        double t1=usecond();
        //      uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
        //      if (ncall < 500) ncall = 500;
        uint64_t ncall = 1000;

        FGrid->Broadcast(0,&ncall,sizeof(ncall));
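        // Rank 0's ncall wins everywhere: every rank must execute the same
        // number of calls, or the barriers below desynchronise.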

        //      std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
        Dw.ZeroCounters();

        time_statistics timestat;
        std::vector<double> t_time(ncall);
        for(uint64_t i=0;i<ncall;i++){
          t0=usecond();
          Dw.DhopEO(src_o,r_e,DaggerNo);
          t1=usecond();
          t_time[i] = t1-t0;
        }
        FGrid->Barrier();

        double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
        double flops=(1344.0*volume)/2;
        double mf_hi, mf_lo, mf_err;

        timestat.statistics(t_time);
        mf_hi = flops/timestat.min;
        mf_lo = flops/timestat.max;
        mf_err= flops/timestat.min * timestat.err/timestat.mean;

        mflops = flops/timestat.mean;
        mflops_all.push_back(mflops);
        if ( mflops_best == 0   ) mflops_best = mflops;
        if ( mflops_worst== 0   ) mflops_worst= mflops;
        if ( mflops>mflops_best ) mflops_best = mflops;
        if ( mflops<mflops_worst) mflops_worst= mflops;

        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank   "<< mflops/NP<<std::endl;
        std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node   "<< mflops/NN<<std::endl;

        Dw.Report();

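        // Correctness gate: recombine the two checkerboards and compare with
        // the naive reference; a fast but wrong dslash must not be reported.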
        Dw.DhopEO(src_o,r_e,DaggerNo);
        Dw.DhopOE(src_e,r_o,DaggerNo);
        setCheckerboard(r_eo,r_o);
        setCheckerboard(r_eo,r_e);
        err = r_eo-ref;
        std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
        assert((norm2(err)<1.0e-4));

      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;

      for(int i=0;i<mflops_all.size();i++){
        std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    }
    return mflops_best;
  }

};

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
#else
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
#endif
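  // Cache blocking for the Lebesgue-order site traversal; KNL builds take a
  // deeper block in the first dimension.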
  Benchmark::Decomposition();

  int do_memory=1;
  int do_comms =1;
  int do_su3   =0;
  int do_wilson=1;
  int do_dwf   =1;

  if ( do_memory ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Memory();
  }

  if ( do_comms ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    Benchmark::Comms();
  }

  if ( do_su3 ) {
    // empty for now
  }

  int sel=2;
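  // Comparison point: L_list[sel]=16, i.e. the per-node DWF4 rate on a 16^4
  // local volume is the single number quoted at the very end.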
  std::vector<int> L_list({8,12,16,24});
  std::vector<double> wilson;
  std::vector<double> dwf4;
  std::vector<double> dwf5;

  if ( do_wilson ) {
    int Ls=1;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      wilson.push_back(Benchmark::DWF(Ls,L_list[l]));
    }
  }

  int Ls=16;
  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      dwf4.push_back(Benchmark::DWF(Ls,L_list[l]));
    }
  }

  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 5D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
    }

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
    for(int l=0;l<L_list.size();l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    int NN=NN_global;
    std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl;
    for(int l=0;l<L_list.size();l++){
      std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l]/NN<<std::endl;
    }
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Comparison point result: "  << dwf4[sel]/NN <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;

  }

  Grid_finalize();
}

| @@ -68,7 +68,7 @@ int main (int argc, char ** argv) | |||||||
|  |  | ||||||
|   int Nloop=100; |   int Nloop=100; | ||||||
|   int nmu=0; |   int nmu=0; | ||||||
|   int maxlat=24; |   int maxlat=32; | ||||||
|   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++; |   for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++; | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; |   std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl; | ||||||
| @@ -80,7 +80,7 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|   header(); |   header(); | ||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=32;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0], |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|       				    lat*mpi_layout[1], |       				    lat*mpi_layout[1], | ||||||
| @@ -92,11 +92,16 @@ int main (int argc, char ** argv) | |||||||
|       RealD Nnode = Grid.NodeCount(); |       RealD Nnode = Grid.NodeCount(); | ||||||
|       RealD ppn = Nrank/Nnode; |       RealD ppn = Nrank/Nnode; | ||||||
|  |  | ||||||
|       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); |       std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	 | ||||||
|       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); |       std::vector<Vector<HalfSpinColourVectorD> > rbuf(8); | ||||||
|  |  | ||||||
|       int ncomm; |       int ncomm; | ||||||
|       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); |       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||||
|  |       for(int mu=0;mu<8;mu++){ | ||||||
|  | 	xbuf[mu].resize(lat*lat*lat*Ls); | ||||||
|  | 	rbuf[mu].resize(lat*lat*lat*Ls); | ||||||
|  | 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl; | ||||||
|  |       } | ||||||
|  |  | ||||||
|       for(int i=0;i<Nloop;i++){ |       for(int i=0;i<Nloop;i++){ | ||||||
|       double start=usecond(); |       double start=usecond(); | ||||||
| @@ -112,7 +117,6 @@ int main (int argc, char ** argv) | |||||||
| 	    int comm_proc=1; | 	    int comm_proc=1; | ||||||
| 	    int xmit_to_rank; | 	    int xmit_to_rank; | ||||||
| 	    int recv_from_rank; | 	    int recv_from_rank; | ||||||
| 	     |  | ||||||
| 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
| 	    Grid.SendToRecvFromBegin(requests, | 	    Grid.SendToRecvFromBegin(requests, | ||||||
| 				   (void *)&xbuf[mu][0], | 				   (void *)&xbuf[mu][0], | ||||||
| @@ -163,7 +167,7 @@ int main (int argc, char ** argv) | |||||||
|   header(); |   header(); | ||||||
|  |  | ||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=32;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat,lat,lat,lat}); |       std::vector<int> latt_size  ({lat,lat,lat,lat}); | ||||||
|  |  | ||||||
| @@ -172,9 +176,14 @@ int main (int argc, char ** argv) | |||||||
|       RealD Nnode = Grid.NodeCount(); |       RealD Nnode = Grid.NodeCount(); | ||||||
|       RealD ppn = Nrank/Nnode; |       RealD ppn = Nrank/Nnode; | ||||||
|  |  | ||||||
|       std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); |       std::vector<Vector<HalfSpinColourVectorD> > xbuf(8); | ||||||
|       std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls)); |       std::vector<Vector<HalfSpinColourVectorD> > rbuf(8); | ||||||
|  |  | ||||||
|  |       for(int mu=0;mu<8;mu++){ | ||||||
|  | 	xbuf[mu].resize(lat*lat*lat*Ls); | ||||||
|  | 	rbuf[mu].resize(lat*lat*lat*Ls); | ||||||
|  | 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl; | ||||||
|  |       } | ||||||
|  |  | ||||||
|       int ncomm; |       int ncomm; | ||||||
|       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); |       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||||
| @@ -249,7 +258,7 @@ int main (int argc, char ** argv) | |||||||
|   header(); |   header(); | ||||||
|  |  | ||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=32;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0], |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|       				    lat*mpi_layout[1], |       				    lat*mpi_layout[1], | ||||||
| @@ -299,7 +308,7 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu][0], | 					      (void *)&rbuf[mu][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu); | ||||||
| 	 | 	 | ||||||
| 	    comm_proc = mpi_layout[mu]-1; | 	    comm_proc = mpi_layout[mu]-1; | ||||||
| 	   | 	   | ||||||
| @@ -310,11 +319,11 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu+4][0], | 					      (void *)&rbuf[mu+4][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu+4); | ||||||
| 	   | 	   | ||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
| 	Grid.StencilSendToRecvFromComplete(requests); | 	Grid.StencilSendToRecvFromComplete(requests,0); | ||||||
| 	Grid.Barrier(); | 	Grid.Barrier(); | ||||||
| 	double stop=usecond(); | 	double stop=usecond(); | ||||||
| 	t_time[i] = stop-start; // microseconds | 	t_time[i] = stop-start; // microseconds | ||||||
| @@ -346,7 +355,7 @@ int main (int argc, char ** argv) | |||||||
|   header(); |   header(); | ||||||
|  |  | ||||||
|   for(int lat=4;lat<=maxlat;lat+=4){ |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|     for(int Ls=8;Ls<=32;Ls*=2){ |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0], |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|       				    lat*mpi_layout[1], |       				    lat*mpi_layout[1], | ||||||
| @@ -393,8 +402,8 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu][0], | 					      (void *)&rbuf[mu][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu); | ||||||
| 	    Grid.StencilSendToRecvFromComplete(requests); | 	    Grid.StencilSendToRecvFromComplete(requests,mu); | ||||||
| 	    requests.resize(0); | 	    requests.resize(0); | ||||||
|  |  | ||||||
| 	    comm_proc = mpi_layout[mu]-1; | 	    comm_proc = mpi_layout[mu]-1; | ||||||
| @@ -406,8 +415,8 @@ int main (int argc, char ** argv) | |||||||
| 					      xmit_to_rank, | 					      xmit_to_rank, | ||||||
| 					      (void *)&rbuf[mu+4][0], | 					      (void *)&rbuf[mu+4][0], | ||||||
| 					      recv_from_rank, | 					      recv_from_rank, | ||||||
| 					      bytes); | 					      bytes,mu+4); | ||||||
| 	    Grid.StencilSendToRecvFromComplete(requests); | 	    Grid.StencilSendToRecvFromComplete(requests,mu+4); | ||||||
| 	    requests.resize(0); | 	    requests.resize(0); | ||||||
| 	   | 	   | ||||||
| 	  } | 	  } | ||||||
| @@ -436,5 +445,97 @@ int main (int argc, char ** argv) | |||||||
|     } |     } | ||||||
|   }     |   }     | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   header(); | ||||||
|  |  | ||||||
|  |   for(int lat=4;lat<=maxlat;lat+=4){ | ||||||
|  |     for(int Ls=8;Ls<=8;Ls*=2){ | ||||||
|  |  | ||||||
|  |       std::vector<int> latt_size  ({lat*mpi_layout[0], | ||||||
|  |       				    lat*mpi_layout[1], | ||||||
|  |       				    lat*mpi_layout[2], | ||||||
|  |       				    lat*mpi_layout[3]}); | ||||||
|  |  | ||||||
|  |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |       RealD Nrank = Grid._Nprocessors; | ||||||
|  |       RealD Nnode = Grid.NodeCount(); | ||||||
|  |       RealD ppn = Nrank/Nnode; | ||||||
|  |  | ||||||
|  |       std::vector<HalfSpinColourVectorD *> xbuf(8); | ||||||
|  |       std::vector<HalfSpinColourVectorD *> rbuf(8); | ||||||
|  |       Grid.ShmBufferFreeAll(); | ||||||
|  |       for(int d=0;d<8;d++){ | ||||||
|  | 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  | 	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD)); | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       int ncomm; | ||||||
|  |       int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD); | ||||||
|  |       double dbytes; | ||||||
|  |       for(int i=0;i<Nloop;i++){ | ||||||
|  | 	double start=usecond(); | ||||||
|  |  | ||||||
|  | 	std::vector<CartesianCommunicator::CommsRequest_t> requests; | ||||||
|  | 	dbytes=0; | ||||||
|  | 	ncomm=0; | ||||||
|  |  | ||||||
|  | 	parallel_for(int dir=0;dir<8;dir++){ | ||||||
|  |  | ||||||
|  | 	  double tbytes; | ||||||
|  | 	  int mu =dir % 4; | ||||||
|  |  | ||||||
|  | 	  if (mpi_layout[mu]>1 ) { | ||||||
|  | 	   | ||||||
|  | 	    ncomm++; | ||||||
|  | 	    int xmit_to_rank; | ||||||
|  | 	    int recv_from_rank; | ||||||
|  | 	    if ( dir == mu ) {  | ||||||
|  | 	      int comm_proc=1; | ||||||
|  | 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	    } else {  | ||||||
|  | 	      int comm_proc = mpi_layout[mu]-1; | ||||||
|  | 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	    } | ||||||
|  |  | ||||||
|  | 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank, | ||||||
|  | 					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir); | ||||||
|  |  | ||||||
|  | #pragma omp atomic | ||||||
|  | 	    dbytes+=tbytes; | ||||||
|  | 	  } | ||||||
|  | 	} | ||||||
|  | 	Grid.Barrier(); | ||||||
|  | 	double stop=usecond(); | ||||||
|  | 	t_time[i] = stop-start; // microseconds | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       timestat.statistics(t_time); | ||||||
|  |  | ||||||
|  |       dbytes=dbytes*ppn; | ||||||
|  |       double xbytes    = dbytes*0.5; | ||||||
|  |       double rbytes    = dbytes*0.5; | ||||||
|  |       double bidibytes = dbytes; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |       std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t" | ||||||
|  |                <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7) | ||||||
|  |                <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " " | ||||||
|  |                <<xbytes/timestat.max <<" "<< xbytes/timestat.min   | ||||||
|  |                << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " " | ||||||
|  |                << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl; | ||||||
|  |   | ||||||
|  |     } | ||||||
|  |   }     | ||||||
|  |  | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl; | ||||||
|  |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -165,7 +165,7 @@ int main (int argc, char ** argv) | |||||||
|   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; |   std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; | ||||||
|  |  | ||||||
|   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); |   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); | ||||||
|   int ncall =1000; |   int ncall =500; | ||||||
|   if (1) { |   if (1) { | ||||||
|     FGrid->Barrier(); |     FGrid->Barrier(); | ||||||
|     Dw.ZeroCounters(); |     Dw.ZeroCounters(); | ||||||
| @@ -303,6 +303,7 @@ int main (int argc, char ** argv) | |||||||
|     } |     } | ||||||
|     assert(sum < 1.0e-4); |     assert(sum < 1.0e-4); | ||||||
|  |  | ||||||
|  |      | ||||||
|     if(1){ |     if(1){ | ||||||
|       std::cout << GridLogMessage<< "*********************************************************" <<std::endl; |       std::cout << GridLogMessage<< "*********************************************************" <<std::endl; | ||||||
|       std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl; |       std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl; | ||||||
| @@ -381,8 +382,23 @@ int main (int argc, char ** argv) | |||||||
|       } |       } | ||||||
|       assert(error<1.0e-4); |       assert(error<1.0e-4); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |   if(0){ | ||||||
|  |     std::cout << "Single cache warm call to sDw.Dhop " <<std::endl; | ||||||
|  |     for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){ | ||||||
|  |       sDw.Dhop(ssrc,sresult,0); | ||||||
|  |       PerformanceCounter Counter(i); | ||||||
|  |       Counter.Start(); | ||||||
|  |       sDw.Dhop(ssrc,sresult,0); | ||||||
|  |       Counter.Stop(); | ||||||
|  |       Counter.Report(); | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   if (1) |   if (1) | ||||||
|   { // Naive wilson dag implementation |   { // Naive wilson dag implementation | ||||||
|     ref = zero; |     ref = zero; | ||||||
| @@ -487,9 +503,9 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; |   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; |   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl; | ||||||
|  |  | ||||||
|   //assert(norm2(src_e)<1.0e-4); |   assert(norm2(src_e)<1.0e-4); | ||||||
|   //assert(norm2(src_o)<1.0e-4); |   assert(norm2(src_o)<1.0e-4); | ||||||
|  |  | ||||||
|   Grid_finalize(); |   Grid_finalize(); | ||||||
|  |   exit(0); | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -55,21 +55,21 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|   uint64_t lmax=64; |   uint64_t lmax=96; | ||||||
| #define NLOOP (100*lmax*lmax*lmax*lmax/vol) | #define NLOOP (10*lmax*lmax*lmax*lmax/vol) | ||||||
|   for(int lat=4;lat<=lmax;lat+=4){ |   for(int lat=8;lat<=lmax;lat+=8){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |  | ||||||
|       uint64_t Nloop=NLOOP; |       uint64_t Nloop=NLOOP; | ||||||
|  |  | ||||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}); |       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |  | ||||||
|       LatticeVec z(&Grid); //random(pRNG,z); |       LatticeVec z(&Grid);// random(pRNG,z); | ||||||
|       LatticeVec x(&Grid); //random(pRNG,x); |       LatticeVec x(&Grid);// random(pRNG,x); | ||||||
|       LatticeVec y(&Grid); //random(pRNG,y); |       LatticeVec y(&Grid);// random(pRNG,y); | ||||||
|       double a=2.0; |       double a=2.0; | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -83,7 +83,7 @@ int main (int argc, char ** argv) | |||||||
|       double time = (stop-start)/Nloop*1000; |       double time = (stop-start)/Nloop*1000; | ||||||
|        |        | ||||||
|       double flops=vol*Nvec*2;// mul,add |       double flops=vol*Nvec*2;// mul,add | ||||||
|       double bytes=3*vol*Nvec*sizeof(Real); |       double bytes=3.0*vol*Nvec*sizeof(Real); | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl; |       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl; | ||||||
|  |  | ||||||
|     } |     } | ||||||
| @@ -94,17 +94,17 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|    |    | ||||||
|   for(int lat=4;lat<=lmax;lat+=4){ |   for(int lat=8;lat<=lmax;lat+=8){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |  | ||||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}); |       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |  | ||||||
|       LatticeVec z(&Grid); //random(pRNG,z); |       LatticeVec z(&Grid);// random(pRNG,z); | ||||||
|       LatticeVec x(&Grid); //random(pRNG,x); |       LatticeVec x(&Grid);// random(pRNG,x); | ||||||
|       LatticeVec y(&Grid); //random(pRNG,y); |       LatticeVec y(&Grid);// random(pRNG,y); | ||||||
|       double a=2.0; |       double a=2.0; | ||||||
|  |  | ||||||
|       uint64_t Nloop=NLOOP; |       uint64_t Nloop=NLOOP; | ||||||
| @@ -119,7 +119,7 @@ int main (int argc, char ** argv) | |||||||
|       double time = (stop-start)/Nloop*1000; |       double time = (stop-start)/Nloop*1000; | ||||||
|       |       | ||||||
|       double flops=vol*Nvec*2;// mul,add |       double flops=vol*Nvec*2;// mul,add | ||||||
|       double bytes=3*vol*Nvec*sizeof(Real); |       double bytes=3.0*vol*Nvec*sizeof(Real); | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl; |       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl; | ||||||
|  |  | ||||||
|     } |     } | ||||||
| @@ -129,20 +129,20 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; | ||||||
|  |  | ||||||
|   for(int lat=4;lat<=lmax;lat+=4){ |   for(int lat=8;lat<=lmax;lat+=8){ | ||||||
|  |  | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|       uint64_t Nloop=NLOOP; |       uint64_t Nloop=NLOOP; | ||||||
|  |  | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |  | ||||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}); |       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |  | ||||||
|       LatticeVec z(&Grid); //random(pRNG,z); |       LatticeVec z(&Grid);// random(pRNG,z); | ||||||
|       LatticeVec x(&Grid); //random(pRNG,x); |       LatticeVec x(&Grid);// random(pRNG,x); | ||||||
|       LatticeVec y(&Grid); //random(pRNG,y); |       LatticeVec y(&Grid);// random(pRNG,y); | ||||||
|       RealD a=2.0; |       RealD a=2.0; | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -154,7 +154,7 @@ int main (int argc, char ** argv) | |||||||
|       double stop=usecond(); |       double stop=usecond(); | ||||||
|       double time = (stop-start)/Nloop*1000; |       double time = (stop-start)/Nloop*1000; | ||||||
|        |        | ||||||
|       double bytes=2*vol*Nvec*sizeof(Real); |       double bytes=2.0*vol*Nvec*sizeof(Real); | ||||||
|       double flops=vol*Nvec*1;// mul |       double flops=vol*Nvec*1;// mul | ||||||
|       std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl; |       std::cout<<GridLogMessage <<std::setprecision(3) << lat<<"\t\t"<<bytes<<"   \t\t"<<bytes/time<<"\t\t"<<flops/time<<"\t\t"<<(stop-start)/1000./1000.<<std::endl; | ||||||
|  |  | ||||||
| @@ -166,17 +166,17 @@ int main (int argc, char ** argv) | |||||||
|   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; |   std::cout<<GridLogMessage << "  L  "<<"\t\t"<<"bytes"<<"\t\t\t"<<"GB/s"<<"\t\t"<<"Gflop/s"<<"\t\t seconds"<<std::endl; | ||||||
|   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; |   std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl; | ||||||
|  |  | ||||||
|   for(int lat=4;lat<=lmax;lat+=4){ |   for(int lat=8;lat<=lmax;lat+=8){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol= latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|       uint64_t Nloop=NLOOP; |       uint64_t Nloop=NLOOP; | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |  | ||||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}); |       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|       LatticeVec z(&Grid); //random(pRNG,z); |       LatticeVec z(&Grid);// random(pRNG,z); | ||||||
|       LatticeVec x(&Grid); //random(pRNG,x); |       LatticeVec x(&Grid);// random(pRNG,x); | ||||||
|       LatticeVec y(&Grid); //random(pRNG,y); |       LatticeVec y(&Grid);// random(pRNG,y); | ||||||
|       RealD a=2.0; |       RealD a=2.0; | ||||||
|       Real nn;       |       Real nn;       | ||||||
|       double start=usecond(); |       double start=usecond(); | ||||||
| @@ -187,7 +187,7 @@ int main (int argc, char ** argv) | |||||||
|       double stop=usecond(); |       double stop=usecond(); | ||||||
|       double time = (stop-start)/Nloop*1000; |       double time = (stop-start)/Nloop*1000; | ||||||
|        |        | ||||||
|       double bytes=vol*Nvec*sizeof(Real); |       double bytes=1.0*vol*Nvec*sizeof(Real); | ||||||
|       double flops=vol*Nvec*2;// mul,add |       double flops=vol*Nvec*2;// mul,add | ||||||
|       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl; |       std::cout<<GridLogMessage<<std::setprecision(3) << lat<<"\t\t"<<bytes<<"  \t\t"<<bytes/time<<"\t\t"<<flops/time<< "\t\t"<<(stop-start)/1000./1000.<< "\t\t " <<std::endl; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -37,12 +37,12 @@ int main (int argc, char ** argv) | |||||||
|   Grid_init(&argc,&argv); |   Grid_init(&argc,&argv); | ||||||
| #define LMAX (64) | #define LMAX (64) | ||||||
|  |  | ||||||
|   int Nloop=20; |   int64_t Nloop=20; | ||||||
|  |  | ||||||
|   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); |   std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd()); | ||||||
|   std::vector<int> mpi_layout  = GridDefaultMpi(); |   std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||||
|  |  | ||||||
|   int threads = GridThread::GetThreads(); |   int64_t threads = GridThread::GetThreads(); | ||||||
|   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; |   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl; | ||||||
|  |  | ||||||
|   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; |   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; | ||||||
| @@ -54,16 +54,16 @@ int main (int argc, char ** argv) | |||||||
|   for(int lat=2;lat<=LMAX;lat+=2){ |   for(int lat=2;lat<=LMAX;lat+=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}); |       GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |  | ||||||
|       LatticeColourMatrix z(&Grid);// random(pRNG,z); |       LatticeColourMatrix z(&Grid); random(pRNG,z); | ||||||
|       LatticeColourMatrix x(&Grid);// random(pRNG,x); |       LatticeColourMatrix x(&Grid); random(pRNG,x); | ||||||
|       LatticeColourMatrix y(&Grid);// random(pRNG,y); |       LatticeColourMatrix y(&Grid); random(pRNG,y); | ||||||
|  |  | ||||||
|       double start=usecond(); |       double start=usecond(); | ||||||
|       for(int i=0;i<Nloop;i++){ |       for(int64_t i=0;i<Nloop;i++){ | ||||||
| 	x=x*y; | 	x=x*y; | ||||||
|       } |       } | ||||||
|       double stop=usecond(); |       double stop=usecond(); | ||||||
| @@ -86,17 +86,17 @@ int main (int argc, char ** argv) | |||||||
|   for(int lat=2;lat<=LMAX;lat+=2){ |   for(int lat=2;lat<=LMAX;lat+=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|  |  | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}); |       GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |  | ||||||
|       LatticeColourMatrix z(&Grid); //random(pRNG,z); |       LatticeColourMatrix z(&Grid); random(pRNG,z); | ||||||
|       LatticeColourMatrix x(&Grid); //random(pRNG,x); |       LatticeColourMatrix x(&Grid); random(pRNG,x); | ||||||
|       LatticeColourMatrix y(&Grid); //random(pRNG,y); |       LatticeColourMatrix y(&Grid); random(pRNG,y); | ||||||
|  |  | ||||||
|       double start=usecond(); |       double start=usecond(); | ||||||
|       for(int i=0;i<Nloop;i++){ |       for(int64_t i=0;i<Nloop;i++){ | ||||||
| 	z=x*y; | 	z=x*y; | ||||||
|       } |       } | ||||||
|       double stop=usecond(); |       double stop=usecond(); | ||||||
| @@ -117,17 +117,17 @@ int main (int argc, char ** argv) | |||||||
|   for(int lat=2;lat<=LMAX;lat+=2){ |   for(int lat=2;lat<=LMAX;lat+=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|  |  | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}); |       GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |  | ||||||
|       LatticeColourMatrix z(&Grid); //random(pRNG,z); |       LatticeColourMatrix z(&Grid); random(pRNG,z); | ||||||
|       LatticeColourMatrix x(&Grid); //random(pRNG,x); |       LatticeColourMatrix x(&Grid); random(pRNG,x); | ||||||
|       LatticeColourMatrix y(&Grid); //random(pRNG,y); |       LatticeColourMatrix y(&Grid); random(pRNG,y); | ||||||
|  |  | ||||||
|       double start=usecond(); |       double start=usecond(); | ||||||
|       for(int i=0;i<Nloop;i++){ |       for(int64_t i=0;i<Nloop;i++){ | ||||||
| 	mult(z,x,y); | 	mult(z,x,y); | ||||||
|       } |       } | ||||||
|       double stop=usecond(); |       double stop=usecond(); | ||||||
| @@ -148,17 +148,17 @@ int main (int argc, char ** argv) | |||||||
|   for(int lat=2;lat<=LMAX;lat+=2){ |   for(int lat=2;lat<=LMAX;lat+=2){ | ||||||
|  |  | ||||||
|       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); |       std::vector<int> latt_size  ({lat*mpi_layout[0],lat*mpi_layout[1],lat*mpi_layout[2],lat*mpi_layout[3]}); | ||||||
|       int vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; |       int64_t vol = latt_size[0]*latt_size[1]*latt_size[2]*latt_size[3]; | ||||||
|  |  | ||||||
|       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); |       GridCartesian     Grid(latt_size,simd_layout,mpi_layout); | ||||||
|       //      GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}); |       GridParallelRNG          pRNG(&Grid);      pRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9})); | ||||||
|  |  | ||||||
|       LatticeColourMatrix z(&Grid); //random(pRNG,z); |       LatticeColourMatrix z(&Grid); random(pRNG,z); | ||||||
|       LatticeColourMatrix x(&Grid); //random(pRNG,x); |       LatticeColourMatrix x(&Grid); random(pRNG,x); | ||||||
|       LatticeColourMatrix y(&Grid); //random(pRNG,y); |       LatticeColourMatrix y(&Grid); random(pRNG,y); | ||||||
|  |  | ||||||
|       double start=usecond(); |       double start=usecond(); | ||||||
|       for(int i=0;i<Nloop;i++){ |       for(int64_t i=0;i<Nloop;i++){ | ||||||
| 	mac(z,x,y); | 	mac(z,x,y); | ||||||
|       } |       } | ||||||
|       double stop=usecond(); |       double stop=usecond(); | ||||||
|   | |||||||

configure.ac

@@ -13,6 +13,10 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 ################ Get git info
 #AC_REVISION([m4_esyscmd_s([./scripts/configure.commit])])
 
+################ Set flags
+# do not move!
+CXXFLAGS="-O3 $CXXFLAGS"
+
 ############### Checks for programs
 AC_PROG_CXX
 AC_PROG_RANLIB
@@ -27,7 +31,6 @@ AX_GXX_VERSION
 AC_DEFINE_UNQUOTED([GXX_VERSION],["$GXX_VERSION"],
       [version of g++ that will compile the code])
 
-CXXFLAGS="-g $CXXFLAGS"
 
 
 ############### Checks for typedefs, structures, and compiler characteristics
@@ -51,9 +54,14 @@ AC_CHECK_HEADERS(malloc/malloc.h)
 AC_CHECK_HEADERS(malloc.h)
 AC_CHECK_HEADERS(endian.h)
 AC_CHECK_HEADERS(execinfo.h)
+AC_CHECK_HEADERS(numaif.h)
 AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
 AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
 
+############## Standard libraries
+AC_CHECK_LIB([m],[cos])
+AC_CHECK_LIB([stdc++],[abort])
+
 ############### GMP and MPFR
 AC_ARG_WITH([gmp],
     [AS_HELP_STRING([--with-gmp=prefix],
@@ -186,9 +194,14 @@ Info at: http://usqcd.jlab.org/usqcd-docs/c-lime/)])
 
 AC_SEARCH_LIBS([crc32], [z],
                [AC_DEFINE([HAVE_ZLIB], [1], [Define to 1 if you have the `LIBZ' library])]
-               [have_zlib=true],
+               [have_zlib=true] [LIBS="${LIBS} -lz"],
 	       [AC_MSG_ERROR(zlib library was not found in your system.)])
 
+AC_SEARCH_LIBS([move_pages], [numa],
+               [AC_DEFINE([HAVE_LIBNUMA], [1], [Define to 1 if you have the `LIBNUMA' library])]
+               [have_libnuma=true] [LIBS="${LIBS} -lnuma"],
+	       [AC_MSG_WARN(libnuma library was not found in your system. Some optimisations will not apply)])
+
 AC_SEARCH_LIBS([H5Fopen], [hdf5_cpp],
                [AC_DEFINE([HAVE_HDF5], [1], [Define to 1 if you have the `HDF5' library])]
                [have_hdf5=true]
@@ -241,6 +254,7 @@ case ${ax_cv_cxx_compiler_vendor} in
         SIMD_FLAGS='';;
       KNL)
         AC_DEFINE([AVX512],[1],[AVX512 intrinsics])
+        AC_DEFINE([KNL],[1],[Knights landing processor])
         SIMD_FLAGS='-march=knl';;
       GEN)
         AC_DEFINE([GEN],[1],[generic vector code])
| @@ -248,6 +262,9 @@ case ${ax_cv_cxx_compiler_vendor} in | |||||||
|                            [generic SIMD vector width (in bytes)]) |                            [generic SIMD vector width (in bytes)]) | ||||||
|         SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" |         SIMD_GEN_WIDTH_MSG=" (width= $ac_gen_simd_width)" | ||||||
|         SIMD_FLAGS='';; |         SIMD_FLAGS='';; | ||||||
|  |       NEONv8) | ||||||
|  |         AC_DEFINE([NEONV8],[1],[ARMv8 NEON]) | ||||||
|  |         SIMD_FLAGS='-march=armv8-a';; | ||||||
|       QPX|BGQ) |       QPX|BGQ) | ||||||
|         AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) |         AC_DEFINE([QPX],[1],[QPX intrinsics for BG/Q]) | ||||||
|         SIMD_FLAGS='';; |         SIMD_FLAGS='';; | ||||||
| @@ -276,6 +293,7 @@ case ${ax_cv_cxx_compiler_vendor} in | |||||||
|         SIMD_FLAGS='';; |         SIMD_FLAGS='';; | ||||||
|       KNL) |       KNL) | ||||||
|         AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing]) |         AC_DEFINE([AVX512],[1],[AVX512 intrinsics for Knights Landing]) | ||||||
|  |         AC_DEFINE([KNL],[1],[Knights landing processor]) | ||||||
|         SIMD_FLAGS='-xmic-avx512';; |         SIMD_FLAGS='-xmic-avx512';; | ||||||
|       GEN) |       GEN) | ||||||
|         AC_DEFINE([GEN],[1],[generic vector code]) |         AC_DEFINE([GEN],[1],[generic vector code]) | ||||||
| @@ -324,14 +342,14 @@ case ${ac_COMMS} in | |||||||
|         AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) |         AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] ) | ||||||
|         comms_type='none' |         comms_type='none' | ||||||
|      ;; |      ;; | ||||||
|      mpi3l*) |  | ||||||
|        AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] ) |  | ||||||
|        comms_type='mpi3l' |  | ||||||
|      ;; |  | ||||||
|      mpi3*) |      mpi3*) | ||||||
|         AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] ) |         AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] ) | ||||||
|         comms_type='mpi3' |         comms_type='mpi3' | ||||||
|      ;; |      ;; | ||||||
|  |      mpit) | ||||||
|  |         AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] ) | ||||||
|  |         comms_type='mpit' | ||||||
|  |      ;; | ||||||
|      mpi*) |      mpi*) | ||||||
|         AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] ) |         AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] ) | ||||||
|         comms_type='mpi' |         comms_type='mpi' | ||||||
| @@ -359,7 +377,7 @@ esac | |||||||
| AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ]) | AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ]) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ]) | AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ]) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] ) | AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] ) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] ) | AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] ) | ||||||
| AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ]) | AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ]) | ||||||
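With mpi3l gone, the new threaded transport is selected like the existing ones at configure time, e.g. (assuming Grid's usual --enable-comms switch and an MPI compiler wrapper):

    ./configure --enable-comms=mpit CXX=mpicxx

Here mpicxx is a placeholder for whichever MPI C++ compiler wrapper the system provides.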
|  |  | ||||||
| ############### RNG selection | ############### RNG selection | ||||||
|   | |||||||
| @@ -41,9 +41,10 @@ using namespace Hadrons; | |||||||
| // constructor ///////////////////////////////////////////////////////////////// | // constructor ///////////////////////////////////////////////////////////////// | ||||||
| Environment::Environment(void) | Environment::Environment(void) | ||||||
| { | { | ||||||
|     nd_ = GridDefaultLatt().size(); |     dim_ = GridDefaultLatt(); | ||||||
|  |     nd_  = dim_.size(); | ||||||
|     grid4d_.reset(SpaceTimeGrid::makeFourDimGrid( |     grid4d_.reset(SpaceTimeGrid::makeFourDimGrid( | ||||||
|         GridDefaultLatt(), GridDefaultSimd(nd_, vComplex::Nsimd()), |         dim_, GridDefaultSimd(nd_, vComplex::Nsimd()), | ||||||
|         GridDefaultMpi())); |         GridDefaultMpi())); | ||||||
|     gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get())); |     gridRb4d_.reset(SpaceTimeGrid::makeFourDimRedBlackGrid(grid4d_.get())); | ||||||
|     auto loc = getGrid()->LocalDimensions(); |     auto loc = getGrid()->LocalDimensions(); | ||||||
| @@ -132,6 +133,16 @@ unsigned int Environment::getNd(void) const | |||||||
|     return nd_; |     return nd_; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | std::vector<int> Environment::getDim(void) const | ||||||
|  | { | ||||||
|  |     return dim_; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | int Environment::getDim(const unsigned int mu) const | ||||||
|  | { | ||||||
|  |     return dim_[mu]; | ||||||
|  | } | ||||||
|  |  | ||||||
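The cached dimensions give modules direct access to lattice extents without going through the grid object; a hedged usage fragment (assumes module context; Tp is Grid's temporal direction index):

    std::vector<int> dim = env().getDim();   // extent in every direction
    int              nt  = env().getDim(Tp); // temporal extent only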
| // random number generator ///////////////////////////////////////////////////// | // random number generator ///////////////////////////////////////////////////// | ||||||
| void Environment::setSeed(const std::vector<int> &seed) | void Environment::setSeed(const std::vector<int> &seed) | ||||||
| { | { | ||||||
| @@ -271,6 +282,21 @@ std::string Environment::getModuleType(const std::string name) const | |||||||
|     return getModuleType(getModuleAddress(name)); |     return getModuleType(getModuleAddress(name)); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | std::string Environment::getModuleNamespace(const unsigned int address) const | ||||||
|  | { | ||||||
 |     std::string type = getModuleType(address); | ||||||
|  |      | ||||||
|  |     auto pos2 = type.rfind("::"); | ||||||
|  |     auto pos1 = type.rfind("::", pos2 - 2); | ||||||
|  |      | ||||||
|  |     return type.substr(pos1 + 2, pos2 - pos1 - 2); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | std::string Environment::getModuleNamespace(const std::string name) const | ||||||
|  | { | ||||||
|  |     return getModuleNamespace(getModuleAddress(name)); | ||||||
|  | } | ||||||
|  |  | ||||||
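The extraction above relies on module type names having the fixed shape Hadrons::<namespace>::<class>. A self-contained illustration of the two rfind calls on a hypothetical type name:

    #include <iostream>
    #include <string>

    int main(void)
    {
        std::string type = "Hadrons::MSource::TPoint"; // example input

        auto pos2 = type.rfind("::");           // start of "::TPoint"
        auto pos1 = type.rfind("::", pos2 - 2); // start of "::MSource"

        // characters strictly between the two separators
        std::cout << type.substr(pos1 + 2, pos2 - pos1 - 2) << std::endl; // prints "MSource"

        return 0;
    }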
| bool Environment::hasModule(const unsigned int address) const | bool Environment::hasModule(const unsigned int address) const | ||||||
| { | { | ||||||
|     return (address < module_.size()); |     return (address < module_.size()); | ||||||
| @@ -492,7 +518,14 @@ std::string Environment::getObjectType(const unsigned int address) const | |||||||
| { | { | ||||||
|     if (hasRegisteredObject(address)) |     if (hasRegisteredObject(address)) | ||||||
|     { |     { | ||||||
|         return typeName(object_[address].type); |         if (object_[address].type) | ||||||
|  |         { | ||||||
|  |             return typeName(object_[address].type); | ||||||
|  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |             return "<no type>"; | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|     else if (hasObject(address)) |     else if (hasObject(address)) | ||||||
|     { |     { | ||||||
| @@ -532,6 +565,23 @@ Environment::Size Environment::getObjectSize(const std::string name) const | |||||||
|     return getObjectSize(getObjectAddress(name)); |     return getObjectSize(getObjectAddress(name)); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | unsigned int Environment::getObjectModule(const unsigned int address) const | ||||||
|  | { | ||||||
|  |     if (hasObject(address)) | ||||||
|  |     { | ||||||
|  |         return object_[address].module; | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |         HADRON_ERROR("no object with address " + std::to_string(address)); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | unsigned int Environment::getObjectModule(const std::string name) const | ||||||
|  | { | ||||||
|  |     return getObjectModule(getObjectAddress(name)); | ||||||
|  | } | ||||||
|  |  | ||||||
| unsigned int Environment::getObjectLs(const unsigned int address) const | unsigned int Environment::getObjectLs(const unsigned int address) const | ||||||
| { | { | ||||||
|     if (hasRegisteredObject(address)) |     if (hasRegisteredObject(address)) | ||||||
|   | |||||||
| @@ -106,6 +106,8 @@ public: | |||||||
|     void                    createGrid(const unsigned int Ls); |     void                    createGrid(const unsigned int Ls); | ||||||
|     GridCartesian *         getGrid(const unsigned int Ls = 1) const; |     GridCartesian *         getGrid(const unsigned int Ls = 1) const; | ||||||
|     GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const; |     GridRedBlackCartesian * getRbGrid(const unsigned int Ls = 1) const; | ||||||
|  |     std::vector<int>        getDim(void) const; | ||||||
|  |     int                     getDim(const unsigned int mu) const; | ||||||
|     unsigned int            getNd(void) const; |     unsigned int            getNd(void) const; | ||||||
|     // random number generator |     // random number generator | ||||||
|     void                    setSeed(const std::vector<int> &seed); |     void                    setSeed(const std::vector<int> &seed); | ||||||
| @@ -131,6 +133,8 @@ public: | |||||||
|     std::string             getModuleName(const unsigned int address) const; |     std::string             getModuleName(const unsigned int address) const; | ||||||
|     std::string             getModuleType(const unsigned int address) const; |     std::string             getModuleType(const unsigned int address) const; | ||||||
|     std::string             getModuleType(const std::string name) const; |     std::string             getModuleType(const std::string name) const; | ||||||
|  |     std::string             getModuleNamespace(const unsigned int address) const; | ||||||
|  |     std::string             getModuleNamespace(const std::string name) const; | ||||||
|     bool                    hasModule(const unsigned int address) const; |     bool                    hasModule(const unsigned int address) const; | ||||||
|     bool                    hasModule(const std::string name) const; |     bool                    hasModule(const std::string name) const; | ||||||
|     Graph<unsigned int>     makeModuleGraph(void) const; |     Graph<unsigned int>     makeModuleGraph(void) const; | ||||||
| @@ -171,6 +175,8 @@ public: | |||||||
|     std::string             getObjectType(const std::string name) const; |     std::string             getObjectType(const std::string name) const; | ||||||
|     Size                    getObjectSize(const unsigned int address) const; |     Size                    getObjectSize(const unsigned int address) const; | ||||||
|     Size                    getObjectSize(const std::string name) const; |     Size                    getObjectSize(const std::string name) const; | ||||||
|  |     unsigned int            getObjectModule(const unsigned int address) const; | ||||||
|  |     unsigned int            getObjectModule(const std::string name) const; | ||||||
|     unsigned int            getObjectLs(const unsigned int address) const; |     unsigned int            getObjectLs(const unsigned int address) const; | ||||||
|     unsigned int            getObjectLs(const std::string name) const; |     unsigned int            getObjectLs(const std::string name) const; | ||||||
|     bool                    hasObject(const unsigned int address) const; |     bool                    hasObject(const unsigned int address) const; | ||||||
| @@ -181,6 +187,10 @@ public: | |||||||
|     bool                    hasCreatedObject(const std::string name) const; |     bool                    hasCreatedObject(const std::string name) const; | ||||||
|     bool                    isObject5d(const unsigned int address) const; |     bool                    isObject5d(const unsigned int address) const; | ||||||
|     bool                    isObject5d(const std::string name) const; |     bool                    isObject5d(const std::string name) const; | ||||||
|  |     template <typename T> | ||||||
|  |     bool                    isObjectOfType(const unsigned int address) const; | ||||||
|  |     template <typename T> | ||||||
|  |     bool                    isObjectOfType(const std::string name) const; | ||||||
|     Environment::Size       getTotalSize(void) const; |     Environment::Size       getTotalSize(void) const; | ||||||
|     void                    addOwnership(const unsigned int owner, |     void                    addOwnership(const unsigned int owner, | ||||||
|                                          const unsigned int property); |                                          const unsigned int property); | ||||||
| @@ -197,6 +207,7 @@ private: | |||||||
|     bool                                   dryRun_{false}; |     bool                                   dryRun_{false}; | ||||||
|     unsigned int                           traj_, locVol_; |     unsigned int                           traj_, locVol_; | ||||||
|     // grids |     // grids | ||||||
|  |     std::vector<int>                       dim_; | ||||||
|     GridPt                                 grid4d_; |     GridPt                                 grid4d_; | ||||||
|     std::map<unsigned int, GridPt>         grid5d_; |     std::map<unsigned int, GridPt>         grid5d_; | ||||||
|     GridRbPt                               gridRb4d_; |     GridRbPt                               gridRb4d_; | ||||||
| @@ -343,7 +354,7 @@ T * Environment::getObject(const unsigned int address) const | |||||||
|         else |         else | ||||||
|         { |         { | ||||||
|             HADRON_ERROR("object with address " + std::to_string(address) + |             HADRON_ERROR("object with address " + std::to_string(address) + | ||||||
|                          " does not have type '" + typeid(T).name() + |                          " does not have type '" + typeName(&typeid(T)) + | ||||||
|                          "' (has type '" + getObjectType(address) + "')"); |                          "' (has type '" + getObjectType(address) + "')"); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -380,6 +391,37 @@ T * Environment::createLattice(const std::string name) | |||||||
|     return createLattice<T>(getObjectAddress(name)); |     return createLattice<T>(getObjectAddress(name)); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | template <typename T> | ||||||
|  | bool Environment::isObjectOfType(const unsigned int address) const | ||||||
|  | { | ||||||
|  |     if (hasRegisteredObject(address)) | ||||||
|  |     { | ||||||
|  |         if (auto h = dynamic_cast<Holder<T> *>(object_[address].data.get())) | ||||||
|  |         { | ||||||
|  |             return true; | ||||||
|  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |             return false; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     else if (hasObject(address)) | ||||||
|  |     { | ||||||
|  |         HADRON_ERROR("object with address " + std::to_string(address) + | ||||||
|  |                      " exists but is not registered"); | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |         HADRON_ERROR("no object with address " + std::to_string(address)); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template <typename T> | ||||||
|  | bool Environment::isObjectOfType(const std::string name) const | ||||||
|  | { | ||||||
|  |     return isObjectOfType<T>(getObjectAddress(name)); | ||||||
|  | } | ||||||
|  |  | ||||||
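isObjectOfType works because objects live behind a type-erased holder, so a dynamic_cast on the stored pointer recovers the concrete type. A stripped-down, self-contained sketch of the pattern (HolderBase/Holder are stand-ins here, not the actual Hadrons classes):

    #include <iostream>
    #include <memory>
    #include <vector>

    struct HolderBase
    {
        virtual ~HolderBase(void) = default;
    };

    template <typename T>
    struct Holder: HolderBase
    {
        T value;
    };

    int main(void)
    {
        std::vector<std::unique_ptr<HolderBase>> object;

        object.emplace_back(new Holder<int>);
        // succeeds only if the erased object really holds an int
        bool isInt = (dynamic_cast<Holder<int> *>(object[0].get()) != nullptr);
        std::cout << (isInt ? "int" : "not int") << std::endl;

        return 0;
    }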
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_Environment_hpp_ | #endif // Hadrons_Environment_hpp_ | ||||||
|   | |||||||
| @@ -51,23 +51,43 @@ using Grid::operator<<; | |||||||
|  * error with GCC 5 (clang & GCC 6 compile fine without it). |  * error with GCC 5 (clang & GCC 6 compile fine without it). | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| // FIXME: find a way to do that in a more general fashion |  | ||||||
| #ifndef FIMPL | #ifndef FIMPL | ||||||
| #define FIMPL WilsonImplR | #define FIMPL WilsonImplR | ||||||
| #endif | #endif | ||||||
|  | #ifndef SIMPL | ||||||
|  | #define SIMPL ScalarImplCR | ||||||
|  | #endif | ||||||
|  |  | ||||||
| BEGIN_HADRONS_NAMESPACE | BEGIN_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| // type aliases | // type aliases | ||||||
| #define TYPE_ALIASES(FImpl, suffix)\ | #define FERM_TYPE_ALIASES(FImpl, suffix)\ | ||||||
| typedef FermionOperator<FImpl>                       FMat##suffix;             \ | typedef FermionOperator<FImpl>                       FMat##suffix;             \ | ||||||
| typedef typename FImpl::FermionField                 FermionField##suffix;     \ | typedef typename FImpl::FermionField                 FermionField##suffix;     \ | ||||||
| typedef typename FImpl::PropagatorField              PropagatorField##suffix;  \ | typedef typename FImpl::PropagatorField              PropagatorField##suffix;  \ | ||||||
| typedef typename FImpl::SitePropagator               SitePropagator##suffix;   \ | typedef typename FImpl::SitePropagator               SitePropagator##suffix;   \ | ||||||
| typedef typename FImpl::DoubledGaugeField            DoubledGaugeField##suffix;\ | typedef std::vector<typename FImpl::SitePropagator::scalar_object>             \ | ||||||
| typedef std::function<void(FermionField##suffix &,                             \ |                                                      SlicedPropagator##suffix; | ||||||
|  |  | ||||||
|  | #define GAUGE_TYPE_ALIASES(FImpl, suffix)\ | ||||||
|  | typedef typename FImpl::DoubledGaugeField DoubledGaugeField##suffix; | ||||||
|  |  | ||||||
|  | #define SCALAR_TYPE_ALIASES(SImpl, suffix)\ | ||||||
|  | typedef typename SImpl::Field ScalarField##suffix;\ | ||||||
|  | typedef typename SImpl::Field PropagatorField##suffix; | ||||||
|  |  | ||||||
|  | #define SOLVER_TYPE_ALIASES(FImpl, suffix)\ | ||||||
|  | typedef std::function<void(FermionField##suffix &,\ | ||||||
|                       const FermionField##suffix &)> SolverFn##suffix; |                       const FermionField##suffix &)> SolverFn##suffix; | ||||||
|  |  | ||||||
|  | #define SINK_TYPE_ALIASES(suffix)\ | ||||||
|  | typedef std::function<SlicedPropagator##suffix(const PropagatorField##suffix &)> SinkFn##suffix; | ||||||
|  |  | ||||||
|  | #define FGS_TYPE_ALIASES(FImpl, suffix)\ | ||||||
|  | FERM_TYPE_ALIASES(FImpl, suffix)\ | ||||||
|  | GAUGE_TYPE_ALIASES(FImpl, suffix)\ | ||||||
|  | SOLVER_TYPE_ALIASES(FImpl, suffix) | ||||||
|  |  | ||||||
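For orientation, this is roughly what the composite macro expands to with FImpl = WilsonImplR and an empty suffix; an illustrative expansion using Grid types, not generated code:

    typedef FermionOperator<WilsonImplR>                  FMat;
    typedef WilsonImplR::FermionField                     FermionField;
    typedef WilsonImplR::PropagatorField                  PropagatorField;
    typedef WilsonImplR::SitePropagator                   SitePropagator;
    typedef std::vector<WilsonImplR::SitePropagator::scalar_object>
                                                          SlicedPropagator;
    typedef WilsonImplR::DoubledGaugeField                DoubledGaugeField;
    typedef std::function<void(FermionField &,
                               const FermionField &)>     SolverFn;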
| // logger | // logger | ||||||
| class HadronsLogger: public Logger | class HadronsLogger: public Logger | ||||||
| { | { | ||||||
|   | |||||||
| @@ -1,31 +1,3 @@ | |||||||
| /************************************************************************************* |  | ||||||
|  |  | ||||||
| Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
|  |  | ||||||
| Source file: extras/Hadrons/Modules.hpp |  | ||||||
|  |  | ||||||
| Copyright (C) 2015 |  | ||||||
| Copyright (C) 2016 |  | ||||||
|  |  | ||||||
| Author: Antonin Portelli <antonin.portelli@me.com> |  | ||||||
|  |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
|  |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
|  |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
|  |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| #include <Grid/Hadrons/Modules/MAction/DWF.hpp> | #include <Grid/Hadrons/Modules/MAction/DWF.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MAction/Wilson.hpp> | #include <Grid/Hadrons/Modules/MAction/Wilson.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MContraction/Baryon.hpp> | #include <Grid/Hadrons/Modules/MContraction/Baryon.hpp> | ||||||
| @@ -36,13 +8,18 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp> | #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianEye.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp> | #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonianNonEye.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp> | #include <Grid/Hadrons/Modules/MContraction/WeakNeutral4ptDisc.hpp> | ||||||
|  | #include <Grid/Hadrons/Modules/MFermion/GaugeProp.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MGauge/Load.hpp> | #include <Grid/Hadrons/Modules/MGauge/Load.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MGauge/Random.hpp> | #include <Grid/Hadrons/Modules/MGauge/Random.hpp> | ||||||
|  | #include <Grid/Hadrons/Modules/MGauge/StochEm.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MGauge/Unit.hpp> | #include <Grid/Hadrons/Modules/MGauge/Unit.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp> | #include <Grid/Hadrons/Modules/MLoop/NoiseLoop.hpp> | ||||||
|  | #include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp> | ||||||
|  | #include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp> | ||||||
|  | #include <Grid/Hadrons/Modules/MScalar/Scalar.hpp> | ||||||
|  | #include <Grid/Hadrons/Modules/MSink/Point.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp> | #include <Grid/Hadrons/Modules/MSolver/RBPrecCG.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MSource/Point.hpp> | #include <Grid/Hadrons/Modules/MSource/Point.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp> | #include <Grid/Hadrons/Modules/MSource/SeqGamma.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MSource/Wall.hpp> | #include <Grid/Hadrons/Modules/MSource/Wall.hpp> | ||||||
| #include <Grid/Hadrons/Modules/MSource/Z2.hpp> | #include <Grid/Hadrons/Modules/MSource/Z2.hpp> | ||||||
| #include <Grid/Hadrons/Modules/Quark.hpp> |  | ||||||
|   | |||||||
| @@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_DWF_hpp_ | #ifndef Hadrons_MAction_DWF_hpp_ | ||||||
| #define Hadrons_DWF_hpp_ | #define Hadrons_MAction_DWF_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -56,7 +56,7 @@ template <typename FImpl> | |||||||
| class TDWF: public Module<DWFPar> | class TDWF: public Module<DWFPar> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl,); |     FGS_TYPE_ALIASES(FImpl,); | ||||||
| public: | public: | ||||||
|     // constructor |     // constructor | ||||||
|     TDWF(const std::string name); |     TDWF(const std::string name); | ||||||
| @@ -137,4 +137,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_DWF_hpp_ | #endif // Hadrons_MAction_DWF_hpp_ | ||||||
|   | |||||||
| @@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_Wilson_hpp_ | #ifndef Hadrons_MAction_Wilson_hpp_ | ||||||
| #define Hadrons_Wilson_hpp_ | #define Hadrons_MAction_Wilson_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -54,7 +54,7 @@ template <typename FImpl> | |||||||
| class TWilson: public Module<WilsonPar> | class TWilson: public Module<WilsonPar> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl,); |     FGS_TYPE_ALIASES(FImpl,); | ||||||
| public: | public: | ||||||
|     // constructor |     // constructor | ||||||
|     TWilson(const std::string name); |     TWilson(const std::string name); | ||||||
|   | |||||||
| @@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_Baryon_hpp_ | #ifndef Hadrons_MContraction_Baryon_hpp_ | ||||||
| #define Hadrons_Baryon_hpp_ | #define Hadrons_MContraction_Baryon_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -55,9 +55,9 @@ template <typename FImpl1, typename FImpl2, typename FImpl3> | |||||||
| class TBaryon: public Module<BaryonPar> | class TBaryon: public Module<BaryonPar> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl1, 1); |     FERM_TYPE_ALIASES(FImpl1, 1); | ||||||
|     TYPE_ALIASES(FImpl2, 2); |     FERM_TYPE_ALIASES(FImpl2, 2); | ||||||
|     TYPE_ALIASES(FImpl3, 3); |     FERM_TYPE_ALIASES(FImpl3, 3); | ||||||
|     class Result: Serializable |     class Result: Serializable | ||||||
|     { |     { | ||||||
|     public: |     public: | ||||||
| @@ -121,11 +121,11 @@ void TBaryon<FImpl1, FImpl2, FImpl3>::execute(void) | |||||||
|      |      | ||||||
|     // FIXME: do contractions |     // FIXME: do contractions | ||||||
|      |      | ||||||
|     write(writer, "meson", result); |     // write(writer, "meson", result); | ||||||
| } | } | ||||||
|  |  | ||||||
| END_MODULE_NAMESPACE | END_MODULE_NAMESPACE | ||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_Baryon_hpp_ | #endif // Hadrons_MContraction_Baryon_hpp_ | ||||||
|   | |||||||
| @@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_DiscLoop_hpp_ | #ifndef Hadrons_MContraction_DiscLoop_hpp_ | ||||||
| #define Hadrons_DiscLoop_hpp_ | #define Hadrons_MContraction_DiscLoop_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -52,7 +52,7 @@ public: | |||||||
| template <typename FImpl> | template <typename FImpl> | ||||||
| class TDiscLoop: public Module<DiscLoopPar> | class TDiscLoop: public Module<DiscLoopPar> | ||||||
| { | { | ||||||
|     TYPE_ALIASES(FImpl,); |     FERM_TYPE_ALIASES(FImpl,); | ||||||
|     class Result: Serializable |     class Result: Serializable | ||||||
|     { |     { | ||||||
|     public: |     public: | ||||||
| @@ -141,4 +141,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_DiscLoop_hpp_ | #endif // Hadrons_MContraction_DiscLoop_hpp_ | ||||||
|   | |||||||
| @@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_Gamma3pt_hpp_ | #ifndef Hadrons_MContraction_Gamma3pt_hpp_ | ||||||
| #define Hadrons_Gamma3pt_hpp_ | #define Hadrons_MContraction_Gamma3pt_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -72,9 +72,9 @@ public: | |||||||
| template <typename FImpl1, typename FImpl2, typename FImpl3> | template <typename FImpl1, typename FImpl2, typename FImpl3> | ||||||
| class TGamma3pt: public Module<Gamma3ptPar> | class TGamma3pt: public Module<Gamma3ptPar> | ||||||
| { | { | ||||||
|     TYPE_ALIASES(FImpl1, 1); |     FERM_TYPE_ALIASES(FImpl1, 1); | ||||||
|     TYPE_ALIASES(FImpl2, 2); |     FERM_TYPE_ALIASES(FImpl2, 2); | ||||||
|     TYPE_ALIASES(FImpl3, 3); |     FERM_TYPE_ALIASES(FImpl3, 3); | ||||||
|     class Result: Serializable |     class Result: Serializable | ||||||
|     { |     { | ||||||
|     public: |     public: | ||||||
| @@ -167,4 +167,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_Gamma3pt_hpp_ | #endif // Hadrons_MContraction_Gamma3pt_hpp_ | ||||||
|   | |||||||
| @@ -29,8 +29,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_Meson_hpp_ | #ifndef Hadrons_MContraction_Meson_hpp_ | ||||||
| #define Hadrons_Meson_hpp_ | #define Hadrons_MContraction_Meson_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -69,7 +69,7 @@ public: | |||||||
|                                     std::string, q1, |                                     std::string, q1, | ||||||
|                                     std::string, q2, |                                     std::string, q2, | ||||||
|                                     std::string, gammas, |                                     std::string, gammas, | ||||||
|                                     std::string, mom, |                                     std::string, sink, | ||||||
|                                     std::string, output); |                                     std::string, output); | ||||||
| }; | }; | ||||||
|  |  | ||||||
| @@ -77,8 +77,10 @@ template <typename FImpl1, typename FImpl2> | |||||||
| class TMeson: public Module<MesonPar> | class TMeson: public Module<MesonPar> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl1, 1); |     FERM_TYPE_ALIASES(FImpl1, 1); | ||||||
|     TYPE_ALIASES(FImpl2, 2); |     FERM_TYPE_ALIASES(FImpl2, 2); | ||||||
|  |     FERM_TYPE_ALIASES(ScalarImplCR, Scalar); | ||||||
|  |     SINK_TYPE_ALIASES(Scalar); | ||||||
|     class Result: Serializable |     class Result: Serializable | ||||||
|     { |     { | ||||||
|     public: |     public: | ||||||
| @@ -115,7 +117,7 @@ TMeson<FImpl1, FImpl2>::TMeson(const std::string name) | |||||||
| template <typename FImpl1, typename FImpl2> | template <typename FImpl1, typename FImpl2> | ||||||
| std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void) | std::vector<std::string> TMeson<FImpl1, FImpl2>::getInput(void) | ||||||
| { | { | ||||||
|     std::vector<std::string> input = {par().q1, par().q2}; |     std::vector<std::string> input = {par().q1, par().q2, par().sink}; | ||||||
|      |      | ||||||
|     return input; |     return input; | ||||||
| } | } | ||||||
| @@ -154,6 +156,9 @@ void TMeson<FImpl1, FImpl2>::parseGammaString(std::vector<GammaPair> &gammaList) | |||||||
|  |  | ||||||
|  |  | ||||||
| // execution /////////////////////////////////////////////////////////////////// | // execution /////////////////////////////////////////////////////////////////// | ||||||
|  | #define mesonConnected(q1, q2, gSnk, gSrc) \ | ||||||
|  | (g5*(gSnk))*(q1)*(adj(gSrc)*g5)*adj(q2) | ||||||
|  |  | ||||||
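In equations, the macro assembles the standard connected meson two-point trace; using gamma5-hermiticity, $S_2(0,x) = \gamma_5 S_2(x,0)^\dagger \gamma_5$, to express the backward propagator through adj(q2), the correlator computed below is

    C(t) = \sum_{\vec{x}} \mathrm{tr}\left[ (\gamma_5 \Gamma_{\mathrm{snk}})\, S_1(x,0)\, (\Gamma_{\mathrm{src}}^\dagger \gamma_5)\, S_2(x,0)^\dagger \right]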
| template <typename FImpl1, typename FImpl2> | template <typename FImpl1, typename FImpl2> | ||||||
| void TMeson<FImpl1, FImpl2>::execute(void) | void TMeson<FImpl1, FImpl2>::execute(void) | ||||||
| { | { | ||||||
| @@ -161,43 +166,72 @@ void TMeson<FImpl1, FImpl2>::execute(void) | |||||||
|                  << " quarks '" << par().q1 << "' and '" << par().q2 << "'" |                  << " quarks '" << par().q1 << "' and '" << par().q2 << "'" | ||||||
|                  << std::endl; |                  << std::endl; | ||||||
|      |      | ||||||
|     CorrWriter              writer(par().output); |     CorrWriter             writer(par().output); | ||||||
|     PropagatorField1       &q1 = *env().template getObject<PropagatorField1>(par().q1); |  | ||||||
|     PropagatorField2       &q2 = *env().template getObject<PropagatorField2>(par().q2); |  | ||||||
|     LatticeComplex         c(env().getGrid()); |  | ||||||
|     Gamma                  g5(Gamma::Algebra::Gamma5); |  | ||||||
|     std::vector<GammaPair> gammaList; |  | ||||||
|     std::vector<TComplex>  buf; |     std::vector<TComplex>  buf; | ||||||
|     std::vector<Result>    result; |     std::vector<Result>    result; | ||||||
|     std::vector<Real>      p; |     Gamma                  g5(Gamma::Algebra::Gamma5); | ||||||
|  |     std::vector<GammaPair> gammaList; | ||||||
|     p  = strToVec<Real>(par().mom); |     int                    nt = env().getDim(Tp); | ||||||
|     LatticeComplex         ph(env().getGrid()), coor(env().getGrid()); |  | ||||||
|     Complex                i(0.0,1.0); |  | ||||||
|     ph = zero; |  | ||||||
|     for(unsigned int mu = 0; mu < env().getNd(); mu++) |  | ||||||
|     { |  | ||||||
|         LatticeCoordinate(coor, mu); |  | ||||||
|         ph = ph + p[mu]*coor*((1./(env().getGrid()->_fdimensions[mu]))); |  | ||||||
|     } |  | ||||||
|     ph = exp((Real)(2*M_PI)*i*ph); |  | ||||||
|      |      | ||||||
|     parseGammaString(gammaList); |     parseGammaString(gammaList); | ||||||
|  |  | ||||||
|     result.resize(gammaList.size()); |     result.resize(gammaList.size()); | ||||||
|     for (unsigned int i = 0; i < result.size(); ++i) |     for (unsigned int i = 0; i < result.size(); ++i) | ||||||
|     { |     { | ||||||
|         Gamma gSnk(gammaList[i].first); |  | ||||||
|         Gamma gSrc(gammaList[i].second); |  | ||||||
|         c = trace((g5*gSnk)*q1*(adj(gSrc)*g5)*adj(q2))*ph; |  | ||||||
|         sliceSum(c, buf, Tp); |  | ||||||
|  |  | ||||||
|         result[i].gamma_snk = gammaList[i].first; |         result[i].gamma_snk = gammaList[i].first; | ||||||
|         result[i].gamma_src = gammaList[i].second; |         result[i].gamma_src = gammaList[i].second; | ||||||
|         result[i].corr.resize(buf.size()); |         result[i].corr.resize(nt); | ||||||
|         for (unsigned int t = 0; t < buf.size(); ++t) |     } | ||||||
|  |     if (env().template isObjectOfType<SlicedPropagator1>(par().q1) and | ||||||
|  |         env().template isObjectOfType<SlicedPropagator2>(par().q2)) | ||||||
|  |     { | ||||||
|  |         SlicedPropagator1 &q1 = *env().template getObject<SlicedPropagator1>(par().q1); | ||||||
|  |         SlicedPropagator2 &q2 = *env().template getObject<SlicedPropagator2>(par().q2); | ||||||
|  |          | ||||||
|  |         LOG(Message) << "(propagator already sinked)" << std::endl; | ||||||
|  |         for (unsigned int i = 0; i < result.size(); ++i) | ||||||
|         { |         { | ||||||
|             result[i].corr[t] = TensorRemove(buf[t]); |             Gamma gSnk(gammaList[i].first); | ||||||
|  |             Gamma gSrc(gammaList[i].second); | ||||||
|  |              | ||||||
 |             for (unsigned int t = 0; t < nt; ++t) | ||||||
|  |             { | ||||||
|  |                 result[i].corr[t] = TensorRemove(trace(mesonConnected(q1[t], q2[t], gSnk, gSrc))); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |         PropagatorField1 &q1   = *env().template getObject<PropagatorField1>(par().q1); | ||||||
|  |         PropagatorField2 &q2   = *env().template getObject<PropagatorField2>(par().q2); | ||||||
|  |         LatticeComplex   c(env().getGrid()); | ||||||
|  |          | ||||||
|  |         LOG(Message) << "(using sink '" << par().sink << "')" << std::endl; | ||||||
|  |         for (unsigned int i = 0; i < result.size(); ++i) | ||||||
|  |         { | ||||||
|  |             Gamma       gSnk(gammaList[i].first); | ||||||
|  |             Gamma       gSrc(gammaList[i].second); | ||||||
|  |             std::string ns; | ||||||
|  |                  | ||||||
|  |             ns = env().getModuleNamespace(env().getObjectModule(par().sink)); | ||||||
|  |             if (ns == "MSource") | ||||||
|  |             { | ||||||
|  |                 PropagatorField1 &sink = | ||||||
|  |                     *env().template getObject<PropagatorField1>(par().sink); | ||||||
|  |                  | ||||||
|  |                 c = trace(mesonConnected(q1, q2, gSnk, gSrc)*sink); | ||||||
|  |                 sliceSum(c, buf, Tp); | ||||||
|  |             } | ||||||
|  |             else if (ns == "MSink") | ||||||
|  |             { | ||||||
|  |                 SinkFnScalar &sink = *env().template getObject<SinkFnScalar>(par().sink); | ||||||
|  |                  | ||||||
|  |                 c   = trace(mesonConnected(q1, q2, gSnk, gSrc)); | ||||||
|  |                 buf = sink(c); | ||||||
|  |             } | ||||||
|  |             for (unsigned int t = 0; t < buf.size(); ++t) | ||||||
|  |             { | ||||||
|  |                 result[i].corr[t] = TensorRemove(buf[t]); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|     write(writer, "meson", result); |     write(writer, "meson", result); | ||||||
| @@ -207,4 +241,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_Meson_hpp_ | #endif // Hadrons_MContraction_Meson_hpp_ | ||||||
|   | |||||||
| @@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_WeakHamiltonian_hpp_ | #ifndef Hadrons_MContraction_WeakHamiltonian_hpp_ | ||||||
| #define Hadrons_WeakHamiltonian_hpp_ | #define Hadrons_MContraction_WeakHamiltonian_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -83,7 +83,7 @@ public: | |||||||
| class T##modname: public Module<WeakHamiltonianPar>\ | class T##modname: public Module<WeakHamiltonianPar>\ | ||||||
| {\ | {\ | ||||||
| public:\ | public:\ | ||||||
|     TYPE_ALIASES(FIMPL,)\ |     FERM_TYPE_ALIASES(FIMPL,)\ | ||||||
|     class Result: Serializable\ |     class Result: Serializable\ | ||||||
|     {\ |     {\ | ||||||
|     public:\ |     public:\ | ||||||
| @@ -111,4 +111,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_WeakHamiltonian_hpp_ | #endif // Hadrons_MContraction_WeakHamiltonian_hpp_ | ||||||
|   | |||||||
| @@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_WeakHamiltonianEye_hpp_ | #ifndef Hadrons_MContraction_WeakHamiltonianEye_hpp_ | ||||||
| #define Hadrons_WeakHamiltonianEye_hpp_ | #define Hadrons_MContraction_WeakHamiltonianEye_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | ||||||
|  |  | ||||||
| @@ -55,4 +55,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_WeakHamiltonianEye_hpp_ | #endif // Hadrons_MContraction_WeakHamiltonianEye_hpp_ | ||||||
|   | |||||||
| @@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_WeakHamiltonianNonEye_hpp_ | #ifndef Hadrons_MContraction_WeakHamiltonianNonEye_hpp_ | ||||||
| #define Hadrons_WeakHamiltonianNonEye_hpp_ | #define Hadrons_MContraction_WeakHamiltonianNonEye_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | ||||||
|  |  | ||||||
| @@ -54,4 +54,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_WeakHamiltonianNonEye_hpp_ | #endif // Hadrons_MContraction_WeakHamiltonianNonEye_hpp_ | ||||||
|   | |||||||
| @@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_WeakNeutral4ptDisc_hpp_ | #ifndef Hadrons_MContraction_WeakNeutral4ptDisc_hpp_ | ||||||
| #define Hadrons_WeakNeutral4ptDisc_hpp_ | #define Hadrons_MContraction_WeakNeutral4ptDisc_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | #include <Grid/Hadrons/Modules/MContraction/WeakHamiltonian.hpp> | ||||||
|  |  | ||||||
| @@ -56,4 +56,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_WeakNeutral4ptDisc_hpp_ | #endif // Hadrons_MContraction_WeakNeutral4ptDisc_hpp_ | ||||||
|   | |||||||
| @@ -1,34 +1,5 @@ | |||||||
| /*************************************************************************************
 | #ifndef Hadrons_MFermion_GaugeProp_hpp_ | ||||||
| 
 | #define Hadrons_MFermion_GaugeProp_hpp_ | ||||||
| Grid physics library, www.github.com/paboyle/Grid  |  | ||||||
| 
 |  | ||||||
| Source file: extras/Hadrons/Modules/Quark.hpp |  | ||||||
| 
 |  | ||||||
| Copyright (C) 2015 |  | ||||||
| Copyright (C) 2016 |  | ||||||
| 
 |  | ||||||
| Author: Antonin Portelli <antonin.portelli@me.com> |  | ||||||
| 
 |  | ||||||
| This program is free software; you can redistribute it and/or modify |  | ||||||
| it under the terms of the GNU General Public License as published by |  | ||||||
| the Free Software Foundation; either version 2 of the License, or |  | ||||||
| (at your option) any later version. |  | ||||||
| 
 |  | ||||||
| This program is distributed in the hope that it will be useful, |  | ||||||
| but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| GNU General Public License for more details. |  | ||||||
| 
 |  | ||||||
| You should have received a copy of the GNU General Public License along |  | ||||||
| with this program; if not, write to the Free Software Foundation, Inc., |  | ||||||
| 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |  | ||||||
| 
 |  | ||||||
| See the full license in the file "LICENSE" in the top level distribution directory |  | ||||||
| *************************************************************************************/ |  | ||||||
| /*  END LEGAL */ |  | ||||||
| 
 |  | ||||||
| #ifndef Hadrons_Quark_hpp_ |  | ||||||
| #define Hadrons_Quark_hpp_ |  | ||||||
| 
 | 
 | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -37,27 +8,29 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
BEGIN_HADRONS_NAMESPACE | BEGIN_HADRONS_NAMESPACE | ||||||
 |  | ||||||
/****************************************************************************** | /****************************************************************************** | ||||||
 *                               TQuark                                       * |  *                                GaugeProp                                   * | ||||||
 ******************************************************************************/ |  ******************************************************************************/ | ||||||
class QuarkPar: Serializable | BEGIN_MODULE_NAMESPACE(MFermion) | ||||||
 |  | ||||||
 | class GaugePropPar: Serializable | ||||||
{ | { | ||||||
public: | public: | ||||||
    GRID_SERIALIZABLE_CLASS_MEMBERS(QuarkPar, |     GRID_SERIALIZABLE_CLASS_MEMBERS(GaugePropPar, | ||||||
                                    std::string, source, |                                     std::string, source, | ||||||
                                    std::string, solver); |                                     std::string, solver); | ||||||
}; | }; | ||||||
 |  | ||||||
template <typename FImpl> | template <typename FImpl> | ||||||
class TQuark: public Module<QuarkPar> | class TGaugeProp: public Module<GaugePropPar> | ||||||
{ | { | ||||||
public: | public: | ||||||
    TYPE_ALIASES(FImpl,); |     FGS_TYPE_ALIASES(FImpl,); | ||||||
public: | public: | ||||||
    // constructor |     // constructor | ||||||
    TQuark(const std::string name); |     TGaugeProp(const std::string name); | ||||||
    // destructor |     // destructor | ||||||
    virtual ~TQuark(void) = default; |     virtual ~TGaugeProp(void) = default; | ||||||
    // dependencies/products |     // dependency relation | ||||||
    virtual std::vector<std::string> getInput(void); |     virtual std::vector<std::string> getInput(void); | ||||||
    virtual std::vector<std::string> getOutput(void); |     virtual std::vector<std::string> getOutput(void); | ||||||
    // setup |     // setup | ||||||
@@ -69,20 +42,20 @@ private: | |||||||
    SolverFn     *solver_{nullptr}; |     SolverFn     *solver_{nullptr}; | ||||||
}; | }; | ||||||
 |  | ||||||
MODULE_REGISTER(Quark, TQuark<FIMPL>); | MODULE_REGISTER_NS(GaugeProp, TGaugeProp<FIMPL>, MFermion); | ||||||
 |  | ||||||
/****************************************************************************** | /****************************************************************************** | ||||||
 *                          TQuark implementation                             * |  *                      TGaugeProp implementation                             * | ||||||
 ******************************************************************************/ |  ******************************************************************************/ | ||||||
// constructor ///////////////////////////////////////////////////////////////// | // constructor ///////////////////////////////////////////////////////////////// | ||||||
template <typename FImpl> | template <typename FImpl> | ||||||
TQuark<FImpl>::TQuark(const std::string name) | TGaugeProp<FImpl>::TGaugeProp(const std::string name) | ||||||
: Module(name) | : Module<GaugePropPar>(name) | ||||||
{} | {} | ||||||
 |  | ||||||
// dependencies/products /////////////////////////////////////////////////////// | // dependencies/products /////////////////////////////////////////////////////// | ||||||
template <typename FImpl> | template <typename FImpl> | ||||||
std::vector<std::string> TQuark<FImpl>::getInput(void) | std::vector<std::string> TGaugeProp<FImpl>::getInput(void) | ||||||
{ | { | ||||||
    std::vector<std::string> in = {par().source, par().solver}; |     std::vector<std::string> in = {par().source, par().solver}; | ||||||
     |      | ||||||
@@ -90,7 +63,7 @@ std::vector<std::string> TQuark<FImpl>::getInput(void) | |||||||
} | } | ||||||
 |  | ||||||
template <typename FImpl> | template <typename FImpl> | ||||||
std::vector<std::string> TQuark<FImpl>::getOutput(void) | std::vector<std::string> TGaugeProp<FImpl>::getOutput(void) | ||||||
{ | { | ||||||
    std::vector<std::string> out = {getName(), getName() + "_5d"}; |     std::vector<std::string> out = {getName(), getName() + "_5d"}; | ||||||
     |      | ||||||
@@ -99,7 +72,7 @@ std::vector<std::string> TQuark<FImpl>::getOutput(void) | |||||||
 |  | ||||||
// setup /////////////////////////////////////////////////////////////////////// | // setup /////////////////////////////////////////////////////////////////////// | ||||||
template <typename FImpl> | template <typename FImpl> | ||||||
void TQuark<FImpl>::setup(void) | void TGaugeProp<FImpl>::setup(void) | ||||||
{ | { | ||||||
    Ls_ = env().getObjectLs(par().solver); |     Ls_ = env().getObjectLs(par().solver); | ||||||
    env().template registerLattice<PropagatorField>(getName()); |     env().template registerLattice<PropagatorField>(getName()); | ||||||
@@ -111,13 +84,13 @@ void TQuark<FImpl>::setup(void) | |||||||
 |  | ||||||
// execution /////////////////////////////////////////////////////////////////// | // execution /////////////////////////////////////////////////////////////////// | ||||||
template <typename FImpl> | template <typename FImpl> | ||||||
void TQuark<FImpl>::execute(void) | void TGaugeProp<FImpl>::execute(void) | ||||||
{ | { | ||||||
    LOG(Message) << "Computing quark propagator '" << getName() << "'" |     LOG(Message) << "Computing quark propagator '" << getName() << "'" | ||||||
                 << std::endl; |     << std::endl; | ||||||
     |      | ||||||
    FermionField    source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)), |     FermionField    source(env().getGrid(Ls_)), sol(env().getGrid(Ls_)), | ||||||
                    tmp(env().getGrid()); |     tmp(env().getGrid()); | ||||||
    std::string     propName = (Ls_ == 1) ? getName() : (getName() + "_5d"); |     std::string     propName = (Ls_ == 1) ? getName() : (getName() + "_5d"); | ||||||
    PropagatorField &prop    = *env().template createLattice<PropagatorField>(propName); |     PropagatorField &prop    = *env().template createLattice<PropagatorField>(propName); | ||||||
    PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source); |     PropagatorField &fullSrc = *env().template getObject<PropagatorField>(par().source); | ||||||
@@ -128,7 +101,7 @@ void TQuark<FImpl>::execute(void) | |||||||
    } |     } | ||||||
     |      | ||||||
    LOG(Message) << "Inverting using solver '" << par().solver |     LOG(Message) << "Inverting using solver '" << par().solver | ||||||
                 << "' on source '" << par().source << "'" << std::endl; |     << "' on source '" << par().source << "'" << std::endl; | ||||||
    for (unsigned int s = 0; s < Ns; ++s) |     for (unsigned int s = 0; s < Ns; ++s) | ||||||
    for (unsigned int c = 0; c < Nc; ++c) |     for (unsigned int c = 0; c < Nc; ++c) | ||||||
    { |     { | ||||||
@@ -170,7 +143,7 @@ void TQuark<FImpl>::execute(void) | |||||||
        if (Ls_ > 1) |         if (Ls_ > 1) | ||||||
        { |         { | ||||||
            PropagatorField &p4d = |             PropagatorField &p4d = | ||||||
                *env().template getObject<PropagatorField>(getName()); |             *env().template getObject<PropagatorField>(getName()); | ||||||
             |              | ||||||
            axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0); |             axpby_ssp_pminus(sol, 0., sol, 1., sol, 0, 0); | ||||||
            axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1); |             axpby_ssp_pplus(sol, 1., sol, 1., sol, 0, Ls_-1); | ||||||
@@ -180,6 +153,8 @@ void TQuark<FImpl>::execute(void) | |||||||
    } |     } | ||||||
} | } | ||||||
 |  | ||||||
 | END_MODULE_NAMESPACE | ||||||
 |  | ||||||
END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
 |  | ||||||
#endif // Hadrons_Quark_hpp_ | #endif // Hadrons_MFermion_GaugeProp_hpp_ | ||||||
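For the Ls_ > 1 branch above (unchanged by the rename), the two axpby_ssp calls implement the standard domain-wall reconstruction of the physical 4d propagator from the chirality walls of the 5d solution:

    q(x) = P_- \psi(x, s = 0) + P_+ \psi(x, s = L_s - 1),
    \qquad P_\pm = \tfrac{1}{2}(1 \pm \gamma_5)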
| @@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_Load_hpp_ | #ifndef Hadrons_MGauge_Load_hpp_ | ||||||
| #define Hadrons_Load_hpp_ | #define Hadrons_MGauge_Load_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -70,4 +70,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_Load_hpp_ | #endif // Hadrons_MGauge_Load_hpp_ | ||||||
|   | |||||||
| @@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_Random_hpp_ | #ifndef Hadrons_MGauge_Random_hpp_ | ||||||
| #define Hadrons_Random_hpp_ | #define Hadrons_MGauge_Random_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -63,4 +63,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_Random_hpp_ | #endif // Hadrons_MGauge_Random_hpp_ | ||||||
  | |||||||
extras/Hadrons/Modules/MGauge/StochEm.cc (new file, +88 lines) | |||||||
| @@ -0,0 +1,88 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  | Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  | Source file: extras/Hadrons/Modules/MGauge/StochEm.cc | ||||||
|  |  | ||||||
|  | Copyright (C) 2015 | ||||||
|  | Copyright (C) 2016 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  |  | ||||||
|  | You should have received a copy of the GNU General Public License along | ||||||
|  | with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  | See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #include <Grid/Hadrons/Modules/MGauge/StochEm.hpp> | ||||||
|  |  | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace Hadrons; | ||||||
|  | using namespace MGauge; | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  | *                  TStochEm implementation                             * | ||||||
|  | ******************************************************************************/ | ||||||
|  | // constructor ///////////////////////////////////////////////////////////////// | ||||||
|  | TStochEm::TStochEm(const std::string name) | ||||||
|  | : Module<StochEmPar>(name) | ||||||
|  | {} | ||||||
|  |  | ||||||
|  | // dependencies/products /////////////////////////////////////////////////////// | ||||||
|  | std::vector<std::string> TStochEm::getInput(void) | ||||||
|  | { | ||||||
|  |     std::vector<std::string> in; | ||||||
|  |      | ||||||
|  |     return in; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | std::vector<std::string> TStochEm::getOutput(void) | ||||||
|  | { | ||||||
|  |     std::vector<std::string> out = {getName()}; | ||||||
|  |      | ||||||
|  |     return out; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // setup /////////////////////////////////////////////////////////////////////// | ||||||
|  | void TStochEm::setup(void) | ||||||
|  | { | ||||||
|  |     if (!env().hasRegisteredObject("_" + getName() + "_weight")) | ||||||
|  |     { | ||||||
|  |         env().registerLattice<EmComp>("_" + getName() + "_weight"); | ||||||
|  |     } | ||||||
|  |     env().registerLattice<EmField>(getName()); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // execution /////////////////////////////////////////////////////////////////// | ||||||
|  | void TStochEm::execute(void) | ||||||
|  | { | ||||||
|  |     PhotonR photon(par().gauge, par().zmScheme); | ||||||
|  |     EmField &a = *env().createLattice<EmField>(getName()); | ||||||
|  |     EmComp  *w; | ||||||
|  |      | ||||||
|  |     if (!env().hasCreatedObject("_" + getName() + "_weight")) | ||||||
|  |     { | ||||||
|  |         LOG(Message) << "Caching stochastic EM potential weight (gauge: " | ||||||
|  |                      << par().gauge << ", zero-mode scheme: " | ||||||
|  |                      << par().zmScheme << ")..." << std::endl; | ||||||
|  |         w = env().createLattice<EmComp>("_" + getName() + "_weight"); | ||||||
|  |         photon.StochasticWeight(*w); | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |         w = env().getObject<EmComp>("_" + getName() + "_weight"); | ||||||
|  |     } | ||||||
|  |     LOG(Message) << "Generating stochastic EM potential..." << std::endl; | ||||||
|  |     photon.StochasticField(a, *env().get4dRng(), *w); | ||||||
|  | } | ||||||
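As a usage sketch, this module would typically be instantiated from a Hadrons application in the style of the Grid test programs. The parameter fields follow StochEmPar declared in StochEm.hpp below; the Application::createModule call, the module name "photon", and the enum values feynman and qedL are assumptions for illustration, not taken from this diff:

// Hypothetical driver fragment (assumes the Hadrons Application API).
#include <Grid/Hadrons/Application.hpp>

using namespace Grid;
using namespace Hadrons;

void addPhoton(Application &application)
{
    MGauge::StochEm::Par photonPar;                // i.e. StochEmPar

    photonPar.gauge    = PhotonR::Gauge::feynman;  // assumed enum value
    photonPar.zmScheme = PhotonR::ZmScheme::qedL;  // assumed enum value
    application.createModule<MGauge::StochEm>("photon", photonPar);
}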
							
								
								
									
| 75 | extras/Hadrons/Modules/MGauge/StochEm.hpp | Normal file |
| @@ -0,0 +1,75 @@ | |||||||
|  | /************************************************************************************* | ||||||
|  |  | ||||||
|  | Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  | Source file: extras/Hadrons/Modules/MGauge/StochEm.hpp | ||||||
|  |  | ||||||
|  | Copyright (C) 2015 | ||||||
|  | Copyright (C) 2016 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | This program is free software; you can redistribute it and/or modify | ||||||
|  | it under the terms of the GNU General Public License as published by | ||||||
|  | the Free Software Foundation; either version 2 of the License, or | ||||||
|  | (at your option) any later version. | ||||||
|  |  | ||||||
|  | This program is distributed in the hope that it will be useful, | ||||||
|  | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  | GNU General Public License for more details. | ||||||
|  |  | ||||||
|  | You should have received a copy of the GNU General Public License along | ||||||
|  | with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  | See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  | *************************************************************************************/ | ||||||
|  | /*  END LEGAL */ | ||||||
|  | #ifndef Hadrons_MGauge_StochEm_hpp_ | ||||||
|  | #define Hadrons_MGauge_StochEm_hpp_ | ||||||
|  |  | ||||||
|  | #include <Grid/Hadrons/Global.hpp> | ||||||
|  | #include <Grid/Hadrons/Module.hpp> | ||||||
|  | #include <Grid/Hadrons/ModuleFactory.hpp> | ||||||
|  |  | ||||||
|  | BEGIN_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  |  *                         StochEm                                 * | ||||||
|  |  ******************************************************************************/ | ||||||
|  | BEGIN_MODULE_NAMESPACE(MGauge) | ||||||
|  |  | ||||||
|  | class StochEmPar: Serializable | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(StochEmPar, | ||||||
|  |                                     PhotonR::Gauge,    gauge, | ||||||
|  |                                     PhotonR::ZmScheme, zmScheme); | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | class TStochEm: public Module<StochEmPar> | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     typedef PhotonR::GaugeField     EmField; | ||||||
|  |     typedef PhotonR::GaugeLinkField EmComp; | ||||||
|  | public: | ||||||
|  |     // constructor | ||||||
|  |     TStochEm(const std::string name); | ||||||
|  |     // destructor | ||||||
|  |     virtual ~TStochEm(void) = default; | ||||||
|  |     // dependency relation | ||||||
|  |     virtual std::vector<std::string> getInput(void); | ||||||
|  |     virtual std::vector<std::string> getOutput(void); | ||||||
|  |     // setup | ||||||
|  |     virtual void setup(void); | ||||||
|  |     // execution | ||||||
|  |     virtual void execute(void); | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | MODULE_REGISTER_NS(StochEm, TStochEm, MGauge); | ||||||
|  |  | ||||||
|  | END_MODULE_NAMESPACE | ||||||
|  |  | ||||||
|  | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | #endif // Hadrons_MGauge_StochEm_hpp_ | ||||||
| @@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_Unit_hpp_ | #ifndef Hadrons_MGauge_Unit_hpp_ | ||||||
| #define Hadrons_Unit_hpp_ | #define Hadrons_MGauge_Unit_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -63,4 +63,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_Unit_hpp_ | #endif // Hadrons_MGauge_Unit_hpp_ | ||||||
|   | |||||||
| @@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_NoiseLoop_hpp_ | #ifndef Hadrons_MLoop_NoiseLoop_hpp_ | ||||||
| #define Hadrons_NoiseLoop_hpp_ | #define Hadrons_MLoop_NoiseLoop_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -65,7 +65,7 @@ template <typename FImpl> | |||||||
| class TNoiseLoop: public Module<NoiseLoopPar> | class TNoiseLoop: public Module<NoiseLoopPar> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl,); |     FERM_TYPE_ALIASES(FImpl,); | ||||||
| public: | public: | ||||||
|     // constructor |     // constructor | ||||||
|     TNoiseLoop(const std::string name); |     TNoiseLoop(const std::string name); | ||||||
| @@ -129,4 +129,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_NoiseLoop_hpp_ | #endif // Hadrons_MLoop_NoiseLoop_hpp_ | ||||||
|   | |||||||
							
								
								
									
| 226 | extras/Hadrons/Modules/MScalar/ChargedProp.cc | Normal file |
| @@ -0,0 +1,226 @@ | |||||||
|  | #include <Grid/Hadrons/Modules/MScalar/ChargedProp.hpp> | ||||||
|  | #include <Grid/Hadrons/Modules/MScalar/Scalar.hpp> | ||||||
|  |  | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace Hadrons; | ||||||
|  | using namespace MScalar; | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  | *                     TChargedProp implementation                             * | ||||||
|  | ******************************************************************************/ | ||||||
|  | // constructor ///////////////////////////////////////////////////////////////// | ||||||
|  | TChargedProp::TChargedProp(const std::string name) | ||||||
|  | : Module<ChargedPropPar>(name) | ||||||
|  | {} | ||||||
|  |  | ||||||
|  | // dependencies/products /////////////////////////////////////////////////////// | ||||||
|  | std::vector<std::string> TChargedProp::getInput(void) | ||||||
|  | { | ||||||
|  |     std::vector<std::string> in = {par().source, par().emField}; | ||||||
|  |      | ||||||
|  |     return in; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | std::vector<std::string> TChargedProp::getOutput(void) | ||||||
|  | { | ||||||
|  |     std::vector<std::string> out = {getName()}; | ||||||
|  |      | ||||||
|  |     return out; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // setup /////////////////////////////////////////////////////////////////////// | ||||||
|  | void TChargedProp::setup(void) | ||||||
|  | { | ||||||
|  |     freeMomPropName_ = FREEMOMPROP(par().mass); | ||||||
|  |     phaseName_.clear(); | ||||||
|  |     for (unsigned int mu = 0; mu < env().getNd(); ++mu) | ||||||
|  |     { | ||||||
|  |         phaseName_.push_back("_shiftphase_" + std::to_string(mu)); | ||||||
|  |     } | ||||||
|  |     GFSrcName_ = "_" + getName() + "_DinvSrc"; | ||||||
|  |     if (!env().hasRegisteredObject(freeMomPropName_)) | ||||||
|  |     { | ||||||
|  |         env().registerLattice<ScalarField>(freeMomPropName_); | ||||||
|  |     } | ||||||
|  |     if (!env().hasRegisteredObject(phaseName_[0])) | ||||||
|  |     { | ||||||
|  |         for (unsigned int mu = 0; mu < env().getNd(); ++mu) | ||||||
|  |         { | ||||||
|  |             env().registerLattice<ScalarField>(phaseName_[mu]); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     if (!env().hasRegisteredObject(GFSrcName_)) | ||||||
|  |     { | ||||||
|  |         env().registerLattice<ScalarField>(GFSrcName_); | ||||||
|  |     } | ||||||
|  |     env().registerLattice<ScalarField>(getName()); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // execution /////////////////////////////////////////////////////////////////// | ||||||
|  | void TChargedProp::execute(void) | ||||||
|  | { | ||||||
|  |     // CACHING ANALYTIC EXPRESSIONS | ||||||
|  |     ScalarField &source = *env().getObject<ScalarField>(par().source); | ||||||
|  |     Complex     ci(0.0,1.0); | ||||||
|  |     FFT         fft(env().getGrid()); | ||||||
|  |      | ||||||
|  |     // cache free scalar propagator | ||||||
|  |     if (!env().hasCreatedObject(freeMomPropName_)) | ||||||
|  |     { | ||||||
|  |         LOG(Message) << "Caching momentum space free scalar propagator" | ||||||
|  |                      << " (mass= " << par().mass << ")..." << std::endl; | ||||||
|  |         freeMomProp_ = env().createLattice<ScalarField>(freeMomPropName_); | ||||||
|  |         SIMPL::MomentumSpacePropagator(*freeMomProp_, par().mass); | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |         freeMomProp_ = env().getObject<ScalarField>(freeMomPropName_); | ||||||
|  |     } | ||||||
|  |     // cache G*F*src | ||||||
|  |     if (!env().hasCreatedObject(GFSrcName_)) | ||||||
|  |     { | ||||||
|  |         GFSrc_ = env().createLattice<ScalarField>(GFSrcName_); | ||||||
|  |         fft.FFT_all_dim(*GFSrc_, source, FFT::forward); | ||||||
|  |         *GFSrc_ = (*freeMomProp_)*(*GFSrc_); | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |         GFSrc_ = env().getObject<ScalarField>(GFSrcName_); | ||||||
|  |     } | ||||||
|  |     // cache phases | ||||||
|  |     if (!env().hasCreatedObject(phaseName_[0])) | ||||||
|  |     { | ||||||
|  |         std::vector<int> &l = env().getGrid()->_fdimensions; | ||||||
|  |          | ||||||
|  |         LOG(Message) << "Caching shift phases..." << std::endl; | ||||||
|  |         for (unsigned int mu = 0; mu < env().getNd(); ++mu) | ||||||
|  |         { | ||||||
|  |             Real    twoPiL = M_PI*2./l[mu]; | ||||||
|  |              | ||||||
|  |             phase_.push_back(env().createLattice<ScalarField>(phaseName_[mu])); | ||||||
|  |             LatticeCoordinate(*(phase_[mu]), mu); | ||||||
|  |             *(phase_[mu]) = exp(ci*twoPiL*(*(phase_[mu]))); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |         for (unsigned int mu = 0; mu < env().getNd(); ++mu) | ||||||
|  |         { | ||||||
|  |             phase_.push_back(env().getObject<ScalarField>(phaseName_[mu])); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // PROPAGATOR CALCULATION | ||||||
|  |     LOG(Message) << "Computing charged scalar propagator" | ||||||
|  |                  << " (mass= " << par().mass | ||||||
|  |                  << ", charge= " << par().charge << ")..." << std::endl; | ||||||
|  |      | ||||||
|  |     ScalarField &prop   = *env().createLattice<ScalarField>(getName()); | ||||||
|  |     ScalarField buf(env().getGrid()); | ||||||
|  |     ScalarField &GFSrc = *GFSrc_, &G = *freeMomProp_; | ||||||
|  |     double      q = par().charge; | ||||||
|  |      | ||||||
|  |     // G*F*Src | ||||||
|  |     prop = GFSrc; | ||||||
|  |  | ||||||
|  |     // - q*G*momD1*G*F*Src (momD1 = F*D1*Finv) | ||||||
|  |     buf = GFSrc; | ||||||
|  |     momD1(buf, fft); | ||||||
|  |     buf = G*buf; | ||||||
|  |     prop = prop - q*buf; | ||||||
|  |  | ||||||
|  |     // + q^2*G*momD1*G*momD1*G*F*Src (here buf = G*momD1*G*F*Src) | ||||||
|  |     momD1(buf, fft); | ||||||
|  |     prop = prop + q*q*G*buf; | ||||||
|  |  | ||||||
|  |     // - q^2*G*momD2*G*F*Src (momD2 = F*D2*Finv) | ||||||
|  |     buf = GFSrc; | ||||||
|  |     momD2(buf, fft); | ||||||
|  |     prop = prop - q*q*G*buf; | ||||||
|  |  | ||||||
|  |     // final FT | ||||||
|  |     fft.FFT_all_dim(prop, prop, FFT::backward); | ||||||
|  |      | ||||||
|  |     // OUTPUT IF NECESSARY | ||||||
|  |     if (!par().output.empty()) | ||||||
|  |     { | ||||||
|  |         std::string           filename = par().output + "." + | ||||||
|  |                                          std::to_string(env().getTrajectory()); | ||||||
|  |          | ||||||
|  |         LOG(Message) << "Saving zero-momentum projection to '" | ||||||
|  |                      << filename << "'..." << std::endl; | ||||||
|  |          | ||||||
|  |         CorrWriter            writer(filename); | ||||||
|  |         std::vector<TComplex> vecBuf; | ||||||
|  |         std::vector<Complex>  result; | ||||||
|  |          | ||||||
|  |         sliceSum(prop, vecBuf, Tp); | ||||||
|  |         result.resize(vecBuf.size()); | ||||||
|  |         for (unsigned int t = 0; t < vecBuf.size(); ++t) | ||||||
|  |         { | ||||||
|  |             result[t] = TensorRemove(vecBuf[t]); | ||||||
|  |         } | ||||||
|  |         write(writer, "charge", q); | ||||||
|  |         write(writer, "prop", result); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void TChargedProp::momD1(ScalarField &s, FFT &fft) | ||||||
|  | { | ||||||
|  |     EmField     &A = *env().getObject<EmField>(par().emField); | ||||||
|  |     ScalarField buf(env().getGrid()), result(env().getGrid()), | ||||||
|  |                 Amu(env().getGrid()); | ||||||
|  |     Complex     ci(0.0,1.0); | ||||||
|  |  | ||||||
|  |     result = zero; | ||||||
|  |  | ||||||
|  |     for (unsigned int mu = 0; mu < env().getNd(); ++mu) | ||||||
|  |     { | ||||||
|  |         Amu = peekLorentz(A, mu); | ||||||
|  |         buf = (*phase_[mu])*s; | ||||||
|  |         fft.FFT_all_dim(buf, buf, FFT::backward); | ||||||
|  |         buf = Amu*buf; | ||||||
|  |         fft.FFT_all_dim(buf, buf, FFT::forward); | ||||||
|  |         result = result - ci*buf; | ||||||
|  |     } | ||||||
|  |     fft.FFT_all_dim(s, s, FFT::backward); | ||||||
|  |     for (unsigned int mu = 0; mu < env().getNd(); ++mu) | ||||||
|  |     { | ||||||
|  |         Amu = peekLorentz(A, mu); | ||||||
|  |         buf = Amu*s; | ||||||
|  |         fft.FFT_all_dim(buf, buf, FFT::forward); | ||||||
|  |         result = result + ci*adj(*phase_[mu])*buf; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     s = result; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void TChargedProp::momD2(ScalarField &s, FFT &fft) | ||||||
|  | { | ||||||
|  |     EmField     &A = *env().getObject<EmField>(par().emField); | ||||||
|  |     ScalarField buf(env().getGrid()), result(env().getGrid()), | ||||||
|  |                 Amu(env().getGrid()); | ||||||
|  |  | ||||||
|  |     result = zero; | ||||||
|  |      | ||||||
|  |     for (unsigned int mu = 0; mu < env().getNd(); ++mu) | ||||||
|  |     { | ||||||
|  |         Amu = peekLorentz(A, mu); | ||||||
|  |         buf = (*phase_[mu])*s; | ||||||
|  |         fft.FFT_all_dim(buf, buf, FFT::backward); | ||||||
|  |         buf = Amu*Amu*buf; | ||||||
|  |         fft.FFT_all_dim(buf, buf, FFT::forward); | ||||||
|  |         result = result + .5*buf; | ||||||
|  |     } | ||||||
|  |     fft.FFT_all_dim(s, s, FFT::backward); | ||||||
|  |     for (unsigned int mu = 0; mu < env().getNd(); ++mu) | ||||||
|  |     { | ||||||
|  |         Amu = peekLorentz(A, mu);         | ||||||
|  |         buf = Amu*Amu*s; | ||||||
|  |         fft.FFT_all_dim(buf, buf, FFT::forward); | ||||||
|  |         result = result + .5*adj(*phase_[mu])*buf; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     s = result; | ||||||
|  | } | ||||||
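To spell out what execute(), momD1() and momD2() compute: writing F for the forward FFT, G for the free momentum-space propagator, and reading the FFT sequences directly off the code (the cached phase field is phase_mu(k) = e^{i 2\pi k_\mu / L_\mu}, so multiplication by it acts as e^{i k_\mu}), the two vertex operators are

\[ \widetilde D_1 = -i \sum_\mu \left( F A_\mu F^{-1}\, e^{i k_\mu} \;-\; e^{-i k_\mu}\, F A_\mu F^{-1} \right), \qquad \widetilde D_2 = \frac{1}{2} \sum_\mu \left( F A_\mu^2 F^{-1}\, e^{i k_\mu} \;+\; e^{-i k_\mu}\, F A_\mu^2 F^{-1} \right), \]

and the propagator is assembled as the second-order expansion in the charge q indicated by the comments in execute(),

\[ G_q\, F\,\mathrm{src} \;=\; \left[\, G \;-\; q\, G \widetilde D_1 G \;+\; q^2 \left( G \widetilde D_1 G \widetilde D_1 G \;-\; G \widetilde D_2 G \right) \right] F\,\mathrm{src} \;+\; O(q^3), \]

followed by a single backward FFT returning the result to position space.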
							
								
								
									
| 61 | extras/Hadrons/Modules/MScalar/ChargedProp.hpp | Normal file |
| @@ -0,0 +1,61 @@ | |||||||
|  | #ifndef Hadrons_MScalar_ChargedProp_hpp_ | ||||||
|  | #define Hadrons_MScalar_ChargedProp_hpp_ | ||||||
|  |  | ||||||
|  | #include <Grid/Hadrons/Global.hpp> | ||||||
|  | #include <Grid/Hadrons/Module.hpp> | ||||||
|  | #include <Grid/Hadrons/ModuleFactory.hpp> | ||||||
|  |  | ||||||
|  | BEGIN_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  |  *                       Charged scalar propagator                            * | ||||||
|  |  ******************************************************************************/ | ||||||
|  | BEGIN_MODULE_NAMESPACE(MScalar) | ||||||
|  |  | ||||||
|  | class ChargedPropPar: Serializable | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(ChargedPropPar, | ||||||
|  |                                     std::string, emField, | ||||||
|  |                                     std::string, source, | ||||||
|  |                                     double,      mass, | ||||||
|  |                                     double,      charge, | ||||||
|  |                                     std::string, output); | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | class TChargedProp: public Module<ChargedPropPar> | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     SCALAR_TYPE_ALIASES(SIMPL,); | ||||||
|  |     typedef PhotonR::GaugeField     EmField; | ||||||
|  |     typedef PhotonR::GaugeLinkField EmComp; | ||||||
|  | public: | ||||||
|  |     // constructor | ||||||
|  |     TChargedProp(const std::string name); | ||||||
|  |     // destructor | ||||||
|  |     virtual ~TChargedProp(void) = default; | ||||||
|  |     // dependency relation | ||||||
|  |     virtual std::vector<std::string> getInput(void); | ||||||
|  |     virtual std::vector<std::string> getOutput(void); | ||||||
|  |     // setup | ||||||
|  |     virtual void setup(void); | ||||||
|  |     // execution | ||||||
|  |     virtual void execute(void); | ||||||
|  | private: | ||||||
|  |     void momD1(ScalarField &s, FFT &fft); | ||||||
|  |     void momD2(ScalarField &s, FFT &fft); | ||||||
|  | private: | ||||||
|  |     std::string                freeMomPropName_, GFSrcName_; | ||||||
|  |     std::vector<std::string>   phaseName_; | ||||||
|  |     ScalarField                *freeMomProp_, *GFSrc_; | ||||||
|  |     std::vector<ScalarField *> phase_; | ||||||
|  |     EmField                    *A; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | MODULE_REGISTER_NS(ChargedProp, TChargedProp, MScalar); | ||||||
|  |  | ||||||
|  | END_MODULE_NAMESPACE | ||||||
|  |  | ||||||
|  | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | #endif // Hadrons_MScalar_ChargedProp_hpp_ | ||||||
							
								
								
									
| 79 | extras/Hadrons/Modules/MScalar/FreeProp.cc | Normal file |
| @@ -0,0 +1,79 @@ | |||||||
|  | #include <Grid/Hadrons/Modules/MScalar/FreeProp.hpp> | ||||||
|  | #include <Grid/Hadrons/Modules/MScalar/Scalar.hpp> | ||||||
|  |  | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace Hadrons; | ||||||
|  | using namespace MScalar; | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  | *                        TFreeProp implementation                             * | ||||||
|  | ******************************************************************************/ | ||||||
|  | // constructor ///////////////////////////////////////////////////////////////// | ||||||
|  | TFreeProp::TFreeProp(const std::string name) | ||||||
|  | : Module<FreePropPar>(name) | ||||||
|  | {} | ||||||
|  |  | ||||||
|  | // dependencies/products /////////////////////////////////////////////////////// | ||||||
|  | std::vector<std::string> TFreeProp::getInput(void) | ||||||
|  | { | ||||||
|  |     std::vector<std::string> in = {par().source}; | ||||||
|  |      | ||||||
|  |     return in; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | std::vector<std::string> TFreeProp::getOutput(void) | ||||||
|  | { | ||||||
|  |     std::vector<std::string> out = {getName()}; | ||||||
|  |      | ||||||
|  |     return out; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // setup /////////////////////////////////////////////////////////////////////// | ||||||
|  | void TFreeProp::setup(void) | ||||||
|  | { | ||||||
|  |     freeMomPropName_ = FREEMOMPROP(par().mass); | ||||||
|  |      | ||||||
|  |     if (!env().hasRegisteredObject(freeMomPropName_)) | ||||||
|  |     { | ||||||
|  |         env().registerLattice<ScalarField>(freeMomPropName_); | ||||||
|  |     } | ||||||
|  |     env().registerLattice<ScalarField>(getName()); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // execution /////////////////////////////////////////////////////////////////// | ||||||
|  | void TFreeProp::execute(void) | ||||||
|  | { | ||||||
|  |     ScalarField &prop   = *env().createLattice<ScalarField>(getName()); | ||||||
|  |     ScalarField &source = *env().getObject<ScalarField>(par().source); | ||||||
|  |     ScalarField *freeMomProp; | ||||||
|  |  | ||||||
|  |     if (!env().hasCreatedObject(freeMomPropName_)) | ||||||
|  |     { | ||||||
|  |         LOG(Message) << "Caching momentum space free scalar propagator" | ||||||
|  |                      << " (mass= " << par().mass << ")..." << std::endl; | ||||||
|  |         freeMomProp = env().createLattice<ScalarField>(freeMomPropName_); | ||||||
|  |         SIMPL::MomentumSpacePropagator(*freeMomProp, par().mass); | ||||||
|  |     } | ||||||
|  |     else | ||||||
|  |     { | ||||||
|  |         freeMomProp = env().getObject<ScalarField>(freeMomPropName_); | ||||||
|  |     } | ||||||
|  |     LOG(Message) << "Computing free scalar propagator..." << std::endl; | ||||||
|  |     SIMPL::FreePropagator(source, prop, *freeMomProp); | ||||||
|  |      | ||||||
|  |     if (!par().output.empty()) | ||||||
|  |     { | ||||||
|  |         TextWriter            writer(par().output + "." + | ||||||
|  |                                      std::to_string(env().getTrajectory())); | ||||||
|  |         std::vector<TComplex> buf; | ||||||
|  |         std::vector<Complex>  result; | ||||||
|  |          | ||||||
|  |         sliceSum(prop, buf, Tp); | ||||||
|  |         result.resize(buf.size()); | ||||||
|  |         for (unsigned int t = 0; t < buf.size(); ++t) | ||||||
|  |         { | ||||||
|  |             result[t] = TensorRemove(buf[t]); | ||||||
|  |         } | ||||||
|  |         write(writer, "prop", result); | ||||||
|  |     } | ||||||
|  | } | ||||||
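For orientation, the propagator that MomentumSpacePropagator is expected to cache is the standard free lattice boson propagator; the exact form lives in the scalar implementation class, so take this as a sketch rather than a quotation:

\[ \tilde G(k) = \frac{1}{\hat k^2 + m^2}, \qquad \hat k_\mu = 2 \sin\frac{k_\mu}{2}, \quad k_\mu = \frac{2\pi n_\mu}{L_\mu}, \]

so that FreePropagator reduces to a forward FFT of the source, multiplication by \(\tilde G\), and a backward FFT.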
							
								
								
									
| 50 | extras/Hadrons/Modules/MScalar/FreeProp.hpp | Normal file |
| @@ -0,0 +1,50 @@ | |||||||
|  | #ifndef Hadrons_MScalar_FreeProp_hpp_ | ||||||
|  | #define Hadrons_MScalar_FreeProp_hpp_ | ||||||
|  |  | ||||||
|  | #include <Grid/Hadrons/Global.hpp> | ||||||
|  | #include <Grid/Hadrons/Module.hpp> | ||||||
|  | #include <Grid/Hadrons/ModuleFactory.hpp> | ||||||
|  |  | ||||||
|  | BEGIN_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  |  *                               FreeProp                                     * | ||||||
|  |  ******************************************************************************/ | ||||||
|  | BEGIN_MODULE_NAMESPACE(MScalar) | ||||||
|  |  | ||||||
|  | class FreePropPar: Serializable | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(FreePropPar, | ||||||
|  |                                     std::string, source, | ||||||
|  |                                     double,      mass, | ||||||
|  |                                     std::string, output); | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | class TFreeProp: public Module<FreePropPar> | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     SCALAR_TYPE_ALIASES(SIMPL,); | ||||||
|  | public: | ||||||
|  |     // constructor | ||||||
|  |     TFreeProp(const std::string name); | ||||||
|  |     // destructor | ||||||
|  |     virtual ~TFreeProp(void) = default; | ||||||
|  |     // dependency relation | ||||||
|  |     virtual std::vector<std::string> getInput(void); | ||||||
|  |     virtual std::vector<std::string> getOutput(void); | ||||||
|  |     // setup | ||||||
|  |     virtual void setup(void); | ||||||
|  |     // execution | ||||||
|  |     virtual void execute(void); | ||||||
|  | private: | ||||||
|  |     std::string freeMomPropName_; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | MODULE_REGISTER_NS(FreeProp, TFreeProp, MScalar); | ||||||
|  |  | ||||||
|  | END_MODULE_NAMESPACE | ||||||
|  |  | ||||||
|  | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | #endif // Hadrons_MScalar_FreeProp_hpp_ | ||||||
							
								
								
									
| 6 | extras/Hadrons/Modules/MScalar/Scalar.hpp | Normal file |
| @@ -0,0 +1,6 @@ | |||||||
|  | #ifndef Hadrons_Scalar_hpp_ | ||||||
|  | #define Hadrons_Scalar_hpp_ | ||||||
|  |  | ||||||
|  | #define FREEMOMPROP(m) "_scalar_mom_prop_" + std::to_string(m) | ||||||
|  |  | ||||||
|  | #endif // Hadrons_Scalar_hpp_ | ||||||
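FREEMOMPROP builds the environment key under which the momentum-space propagator is cached, so every scalar module constructed with the same mass shares a single lattice (compare the setup() functions of FreeProp and ChargedProp above). A small illustration of the expansion, using an arbitrary mass value:

// FREEMOMPROP(0.25) expands to "_scalar_mom_prop_" + std::to_string(0.25),
// i.e. the key "_scalar_mom_prop_0.250000" (std::to_string on a double
// keeps six decimal places).
std::string key = FREEMOMPROP(0.25);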
							
								
								
									
| 114 | extras/Hadrons/Modules/MSink/Point.hpp | Normal file |
| @@ -0,0 +1,114 @@ | |||||||
|  | #ifndef Hadrons_MSink_Point_hpp_ | ||||||
|  | #define Hadrons_MSink_Point_hpp_ | ||||||
|  |  | ||||||
|  | #include <Grid/Hadrons/Global.hpp> | ||||||
|  | #include <Grid/Hadrons/Module.hpp> | ||||||
|  | #include <Grid/Hadrons/ModuleFactory.hpp> | ||||||
|  |  | ||||||
|  | BEGIN_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  |  *                                   Point                                    * | ||||||
|  |  ******************************************************************************/ | ||||||
|  | BEGIN_MODULE_NAMESPACE(MSink) | ||||||
|  |  | ||||||
|  | class PointPar: Serializable | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     GRID_SERIALIZABLE_CLASS_MEMBERS(PointPar, | ||||||
|  |                                     std::string, mom); | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | template <typename FImpl> | ||||||
|  | class TPoint: public Module<PointPar> | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     FERM_TYPE_ALIASES(FImpl,); | ||||||
|  |     SINK_TYPE_ALIASES(); | ||||||
|  | public: | ||||||
|  |     // constructor | ||||||
|  |     TPoint(const std::string name); | ||||||
|  |     // destructor | ||||||
|  |     virtual ~TPoint(void) = default; | ||||||
|  |     // dependency relation | ||||||
|  |     virtual std::vector<std::string> getInput(void); | ||||||
|  |     virtual std::vector<std::string> getOutput(void); | ||||||
|  |     // setup | ||||||
|  |     virtual void setup(void); | ||||||
|  |     // execution | ||||||
|  |     virtual void execute(void); | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSink); | ||||||
|  | MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSink); | ||||||
|  |  | ||||||
|  | /****************************************************************************** | ||||||
|  |  *                          TPoint implementation                             * | ||||||
|  |  ******************************************************************************/ | ||||||
|  | // constructor ///////////////////////////////////////////////////////////////// | ||||||
|  | template <typename FImpl> | ||||||
|  | TPoint<FImpl>::TPoint(const std::string name) | ||||||
|  | : Module<PointPar>(name) | ||||||
|  | {} | ||||||
|  |  | ||||||
|  | // dependencies/products /////////////////////////////////////////////////////// | ||||||
|  | template <typename FImpl> | ||||||
|  | std::vector<std::string> TPoint<FImpl>::getInput(void) | ||||||
|  | { | ||||||
|  |     std::vector<std::string> in; | ||||||
|  |      | ||||||
|  |     return in; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | template <typename FImpl> | ||||||
|  | std::vector<std::string> TPoint<FImpl>::getOutput(void) | ||||||
|  | { | ||||||
|  |     std::vector<std::string> out = {getName()}; | ||||||
|  |      | ||||||
|  |     return out; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // setup /////////////////////////////////////////////////////////////////////// | ||||||
|  | template <typename FImpl> | ||||||
|  | void TPoint<FImpl>::setup(void) | ||||||
|  | { | ||||||
|  |     unsigned int size; | ||||||
|  |      | ||||||
|  |     size = env().template lattice4dSize<LatticeComplex>(); | ||||||
|  |     env().registerObject(getName(), size); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // execution /////////////////////////////////////////////////////////////////// | ||||||
|  | template <typename FImpl> | ||||||
|  | void TPoint<FImpl>::execute(void) | ||||||
|  | { | ||||||
|  |     std::vector<Real> p = strToVec<Real>(par().mom); | ||||||
|  |     LatticeComplex    ph(env().getGrid()), coor(env().getGrid()); | ||||||
|  |     Complex           i(0.0,1.0); | ||||||
|  |      | ||||||
|  |     LOG(Message) << "Setting up point sink function for momentum [" | ||||||
|  |                  << par().mom << "]" << std::endl; | ||||||
|  |     ph = zero; | ||||||
|  |     for(unsigned int mu = 0; mu < env().getNd(); mu++) | ||||||
|  |     { | ||||||
|  |         LatticeCoordinate(coor, mu); | ||||||
|  |         ph = ph + (p[mu]/env().getGrid()->_fdimensions[mu])*coor; | ||||||
|  |     } | ||||||
|  |     ph = exp((Real)(2*M_PI)*i*ph); | ||||||
|  |     auto sink = [ph](const PropagatorField &field) | ||||||
|  |     { | ||||||
|  |         SlicedPropagator res; | ||||||
|  |         PropagatorField  tmp = ph*field; | ||||||
|  |          | ||||||
|  |         sliceSum(tmp, res, Tp); | ||||||
|  |          | ||||||
|  |         return res; | ||||||
|  |     }; | ||||||
|  |     env().setObject(getName(), new SinkFn(sink)); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | END_MODULE_NAMESPACE | ||||||
|  |  | ||||||
|  | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
|  | #endif // Hadrons_MSink_Point_hpp_ | ||||||
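In formulas, the sink function stored by TPoint::execute() is the momentum projection

\[ S(t) = \sum_{\vec x} e^{\,2\pi i \sum_\mu p_\mu x_\mu / L_\mu}\, \psi(x)\Big|_{x_T = t}, \]

where ph carries the phase factor built over all directions exactly as coded (a nonzero temporal component would only multiply each timeslice by a constant phase), and sliceSum performs the per-timeslice sum along Tp.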
| @@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_RBPrecCG_hpp_ | #ifndef Hadrons_MSolver_RBPrecCG_hpp_ | ||||||
| #define Hadrons_RBPrecCG_hpp_ | #define Hadrons_MSolver_RBPrecCG_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -53,7 +53,7 @@ template <typename FImpl> | |||||||
| class TRBPrecCG: public Module<RBPrecCGPar> | class TRBPrecCG: public Module<RBPrecCGPar> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl,); |     FGS_TYPE_ALIASES(FImpl,); | ||||||
| public: | public: | ||||||
|     // constructor |     // constructor | ||||||
|     TRBPrecCG(const std::string name); |     TRBPrecCG(const std::string name); | ||||||
| @@ -129,4 +129,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_RBPrecCG_hpp_ | #endif // Hadrons_MSolver_RBPrecCG_hpp_ | ||||||
|   | |||||||
| @@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_Point_hpp_ | #ifndef Hadrons_MSource_Point_hpp_ | ||||||
| #define Hadrons_Point_hpp_ | #define Hadrons_MSource_Point_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -63,7 +63,7 @@ template <typename FImpl> | |||||||
| class TPoint: public Module<PointPar> | class TPoint: public Module<PointPar> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl,); |     FERM_TYPE_ALIASES(FImpl,); | ||||||
| public: | public: | ||||||
|     // constructor |     // constructor | ||||||
|     TPoint(const std::string name); |     TPoint(const std::string name); | ||||||
| @@ -78,7 +78,8 @@ public: | |||||||
|     virtual void execute(void); |     virtual void execute(void); | ||||||
| }; | }; | ||||||
|  |  | ||||||
| MODULE_REGISTER_NS(Point, TPoint<FIMPL>, MSource); | MODULE_REGISTER_NS(Point,       TPoint<FIMPL>,        MSource); | ||||||
|  | MODULE_REGISTER_NS(ScalarPoint, TPoint<ScalarImplCR>, MSource); | ||||||
|  |  | ||||||
| /****************************************************************************** | /****************************************************************************** | ||||||
|  *                       TPoint template implementation                       * |  *                       TPoint template implementation                       * | ||||||
| @@ -132,4 +133,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_Point_hpp_ | #endif // Hadrons_MSource_Point_hpp_ | ||||||
|   | |||||||
| @@ -28,8 +28,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_SeqGamma_hpp_ | #ifndef Hadrons_MSource_SeqGamma_hpp_ | ||||||
| #define Hadrons_SeqGamma_hpp_ | #define Hadrons_MSource_SeqGamma_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -72,7 +72,7 @@ template <typename FImpl> | |||||||
| class TSeqGamma: public Module<SeqGammaPar> | class TSeqGamma: public Module<SeqGammaPar> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl,); |     FGS_TYPE_ALIASES(FImpl,); | ||||||
| public: | public: | ||||||
|     // constructor |     // constructor | ||||||
|     TSeqGamma(const std::string name); |     TSeqGamma(const std::string name); | ||||||
| @@ -161,4 +161,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_SeqGamma_hpp_ | #endif // Hadrons_MSource_SeqGamma_hpp_ | ||||||
|   | |||||||
| @@ -26,8 +26,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_WallSource_hpp_ | #ifndef Hadrons_MSource_WallSource_hpp_ | ||||||
| #define Hadrons_WallSource_hpp_ | #define Hadrons_MSource_WallSource_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -64,7 +64,7 @@ template <typename FImpl> | |||||||
| class TWall: public Module<WallPar> | class TWall: public Module<WallPar> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl,); |     FERM_TYPE_ALIASES(FImpl,); | ||||||
| public: | public: | ||||||
|     // constructor |     // constructor | ||||||
|     TWall(const std::string name); |     TWall(const std::string name); | ||||||
| @@ -144,4 +144,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_WallSource_hpp_ | #endif // Hadrons_MSource_WallSource_hpp_ | ||||||
|   | |||||||
| @@ -27,8 +27,8 @@ See the full license in the file "LICENSE" in the top level distribution directo | |||||||
| *************************************************************************************/ | *************************************************************************************/ | ||||||
| /*  END LEGAL */ | /*  END LEGAL */ | ||||||
|  |  | ||||||
| #ifndef Hadrons_Z2_hpp_ | #ifndef Hadrons_MSource_Z2_hpp_ | ||||||
| #define Hadrons_Z2_hpp_ | #define Hadrons_MSource_Z2_hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -67,7 +67,7 @@ template <typename FImpl> | |||||||
| class TZ2: public Module<Z2Par> | class TZ2: public Module<Z2Par> | ||||||
| { | { | ||||||
| public: | public: | ||||||
|     TYPE_ALIASES(FImpl,); |     FERM_TYPE_ALIASES(FImpl,); | ||||||
| public: | public: | ||||||
|     // constructor |     // constructor | ||||||
|     TZ2(const std::string name); |     TZ2(const std::string name); | ||||||
| @@ -82,7 +82,8 @@ public: | |||||||
|     virtual void execute(void); |     virtual void execute(void); | ||||||
| }; | }; | ||||||
|  |  | ||||||
| MODULE_REGISTER_NS(Z2, TZ2<FIMPL>, MSource); | MODULE_REGISTER_NS(Z2,       TZ2<FIMPL>,        MSource); | ||||||
|  | MODULE_REGISTER_NS(ScalarZ2, TZ2<ScalarImplCR>, MSource); | ||||||
|  |  | ||||||
| /****************************************************************************** | /****************************************************************************** | ||||||
|  *                       TZ2 template implementation                          * |  *                       TZ2 template implementation                          * | ||||||
| @@ -148,4 +149,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons_Z2_hpp_ | #endif // Hadrons_MSource_Z2_hpp_ | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| #ifndef Hadrons____FILEBASENAME____hpp_ | #ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_ | ||||||
| #define Hadrons____FILEBASENAME____hpp_ | #define Hadrons____NAMESPACE_______FILEBASENAME____hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -41,4 +41,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons____FILEBASENAME____hpp_ | #endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_ | ||||||
|   | |||||||
| @@ -1,5 +1,5 @@ | |||||||
| #ifndef Hadrons____FILEBASENAME____hpp_ | #ifndef Hadrons____NAMESPACE_______FILEBASENAME____hpp_ | ||||||
| #define Hadrons____FILEBASENAME____hpp_ | #define Hadrons____NAMESPACE_______FILEBASENAME____hpp_ | ||||||
|  |  | ||||||
| #include <Grid/Hadrons/Global.hpp> | #include <Grid/Hadrons/Global.hpp> | ||||||
| #include <Grid/Hadrons/Module.hpp> | #include <Grid/Hadrons/Module.hpp> | ||||||
| @@ -82,4 +82,4 @@ END_MODULE_NAMESPACE | |||||||
|  |  | ||||||
| END_HADRONS_NAMESPACE | END_HADRONS_NAMESPACE | ||||||
|  |  | ||||||
| #endif // Hadrons____FILEBASENAME____hpp_ | #endif // Hadrons____NAMESPACE_______FILEBASENAME____hpp_ | ||||||
|   | |||||||
| @@ -4,7 +4,10 @@ modules_cc =\ | |||||||
|   Modules/MContraction/WeakNeutral4ptDisc.cc \ |   Modules/MContraction/WeakNeutral4ptDisc.cc \ | ||||||
|   Modules/MGauge/Load.cc \ |   Modules/MGauge/Load.cc \ | ||||||
|   Modules/MGauge/Random.cc \ |   Modules/MGauge/Random.cc \ | ||||||
|   Modules/MGauge/Unit.cc |   Modules/MGauge/StochEm.cc \ | ||||||
|  |   Modules/MGauge/Unit.cc \ | ||||||
|  |   Modules/MScalar/ChargedProp.cc \ | ||||||
|  |   Modules/MScalar/FreeProp.cc | ||||||
|  |  | ||||||
| modules_hpp =\ | modules_hpp =\ | ||||||
|   Modules/MAction/DWF.hpp \ |   Modules/MAction/DWF.hpp \ | ||||||
| @@ -17,14 +20,19 @@ modules_hpp =\ | |||||||
|   Modules/MContraction/WeakHamiltonianEye.hpp \ |   Modules/MContraction/WeakHamiltonianEye.hpp \ | ||||||
|   Modules/MContraction/WeakHamiltonianNonEye.hpp \ |   Modules/MContraction/WeakHamiltonianNonEye.hpp \ | ||||||
|   Modules/MContraction/WeakNeutral4ptDisc.hpp \ |   Modules/MContraction/WeakNeutral4ptDisc.hpp \ | ||||||
|  |   Modules/MFermion/GaugeProp.hpp \ | ||||||
|   Modules/MGauge/Load.hpp \ |   Modules/MGauge/Load.hpp \ | ||||||
|   Modules/MGauge/Random.hpp \ |   Modules/MGauge/Random.hpp \ | ||||||
|  |   Modules/MGauge/StochEm.hpp \ | ||||||
|   Modules/MGauge/Unit.hpp \ |   Modules/MGauge/Unit.hpp \ | ||||||
|   Modules/MLoop/NoiseLoop.hpp \ |   Modules/MLoop/NoiseLoop.hpp \ | ||||||
|  |   Modules/MScalar/ChargedProp.hpp \ | ||||||
|  |   Modules/MScalar/FreeProp.hpp \ | ||||||
|  |   Modules/MScalar/Scalar.hpp \ | ||||||
|  |   Modules/MSink/Point.hpp \ | ||||||
|   Modules/MSolver/RBPrecCG.hpp \ |   Modules/MSolver/RBPrecCG.hpp \ | ||||||
|   Modules/MSource/Point.hpp \ |   Modules/MSource/Point.hpp \ | ||||||
|   Modules/MSource/SeqGamma.hpp \ |   Modules/MSource/SeqGamma.hpp \ | ||||||
|   Modules/MSource/Wall.hpp \ |   Modules/MSource/Wall.hpp \ | ||||||
|   Modules/MSource/Z2.hpp \ |   Modules/MSource/Z2.hpp | ||||||
|   Modules/Quark.hpp |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
| 11 | extras/qed-fvol/Global.cc | Normal file |
| @@ -0,0 +1,11 @@ | |||||||
|  | #include <qed-fvol/Global.hpp> | ||||||
|  |  | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace QCD; | ||||||
|  | using namespace QedFVol; | ||||||
|  |  | ||||||
|  | QedFVolLogger QedFVol::QedFVolLogError(1,"Error"); | ||||||
|  | QedFVolLogger QedFVol::QedFVolLogWarning(1,"Warning"); | ||||||
|  | QedFVolLogger QedFVol::QedFVolLogMessage(1,"Message"); | ||||||
|  | QedFVolLogger QedFVol::QedFVolLogIterative(1,"Iterative"); | ||||||
|  | QedFVolLogger QedFVol::QedFVolLogDebug(1,"Debug"); | ||||||
							
								
								
									
| 42 | extras/qed-fvol/Global.hpp | Normal file |
| @@ -0,0 +1,42 @@ | |||||||
|  | #ifndef QedFVol_Global_hpp_ | ||||||
|  | #define QedFVol_Global_hpp_ | ||||||
|  |  | ||||||
|  | #include <Grid/Grid.h> | ||||||
|  |  | ||||||
|  | #define BEGIN_QEDFVOL_NAMESPACE \ | ||||||
|  | namespace Grid {\ | ||||||
|  | using namespace QCD;\ | ||||||
|  | namespace QedFVol {\ | ||||||
|  | using Grid::operator<<; | ||||||
|  | #define END_QEDFVOL_NAMESPACE }} | ||||||
|  |  | ||||||
|  | /* the 'using Grid::operator<<;' statement prevents a very nasty compilation | ||||||
|  |  * error with GCC (clang compiles fine without it). | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | BEGIN_QEDFVOL_NAMESPACE | ||||||
|  |  | ||||||
|  | class QedFVolLogger: public Logger | ||||||
|  | { | ||||||
|  | public: | ||||||
|  |     QedFVolLogger(int on, std::string nm): Logger("QedFVol", on, nm, | ||||||
|  |                                                   GridLogColours, "BLACK"){}; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | #define LOG(channel) std::cout << QedFVolLog##channel | ||||||
|  | #define QEDFVOL_ERROR(msg)\ | ||||||
|  | LOG(Error) << msg << " (" << __FUNCTION__ << " at " << __FILE__ << ":"\ | ||||||
|  |            << __LINE__ << ")" << std::endl;\ | ||||||
|  | abort(); | ||||||
|  |  | ||||||
|  | #define DEBUG_VAR(var) LOG(Debug) << #var << "= " << (var) << std::endl; | ||||||
|  |  | ||||||
|  | extern QedFVolLogger QedFVolLogError; | ||||||
|  | extern QedFVolLogger QedFVolLogWarning; | ||||||
|  | extern QedFVolLogger QedFVolLogMessage; | ||||||
|  | extern QedFVolLogger QedFVolLogIterative; | ||||||
|  | extern QedFVolLogger QedFVolLogDebug; | ||||||
|  |  | ||||||
|  | END_QEDFVOL_NAMESPACE | ||||||
|  |  | ||||||
|  | #endif // QedFVol_Global_hpp_ | ||||||
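A minimal sketch of how these channels are meant to be used from qed-fvol code; the macro definitions are the ones above, while the surrounding program and the variable name are assumptions:

// Inside the QedFVol namespace, after initialising Grid:
LOG(Message) << "computing Wilson loops" << std::endl;
// DEBUG_VAR prints the variable's name and value on the Debug channel:
int nIter = 42;   // hypothetical variable
DEBUG_VAR(nIter); // emits: nIter= 42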
							
								
								
									
| 9 | extras/qed-fvol/Makefile.am | Normal file |
| @@ -0,0 +1,9 @@ | |||||||
|  | AM_CXXFLAGS += -I$(top_srcdir)/extras | ||||||
|  |  | ||||||
|  | bin_PROGRAMS = qed-fvol | ||||||
|  |  | ||||||
|  | qed_fvol_SOURCES =   \ | ||||||
|  |     qed-fvol.cc      \ | ||||||
|  |     Global.cc | ||||||
|  |  | ||||||
|  | qed_fvol_LDADD   = -lGrid | ||||||
							
								
								
									
| 265 | extras/qed-fvol/WilsonLoops.h | Normal file |
| @@ -0,0 +1,265 @@ | |||||||
|  | #ifndef QEDFVOL_WILSONLOOPS_H | ||||||
|  | #define QEDFVOL_WILSONLOOPS_H | ||||||
|  |  | ||||||
|  | #include <Global.hpp> | ||||||
|  |  | ||||||
|  | BEGIN_QEDFVOL_NAMESPACE | ||||||
|  |  | ||||||
|  | template <class Gimpl> class NewWilsonLoops : public Gimpl { | ||||||
|  | public: | ||||||
|  |   INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|  |  | ||||||
|  |   typedef typename Gimpl::GaugeLinkField GaugeMat; | ||||||
|  |   typedef typename Gimpl::GaugeField GaugeLorentz; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // directed plaquette oriented in mu,nu plane | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static void dirPlaquette(GaugeMat &plaq, const std::vector<GaugeMat> &U, | ||||||
|  |                            const int mu, const int nu) { | ||||||
|  |     // Annoyingly, one must use either scope resolution to find the dependent | ||||||
|  |     // base class, or this->; there is no "this" in a static method. This | ||||||
|  |     // forces explicit Gimpl scope resolution throughout this file, and | ||||||
|  |     // rather defeats the purpose of deriving from Gimpl. | ||||||
|  |     plaq = Gimpl::CovShiftBackward( | ||||||
|  |         U[mu], mu, Gimpl::CovShiftBackward( | ||||||
|  |                        U[nu], nu, Gimpl::CovShiftForward(U[mu], mu, U[nu]))); | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // trace of directed plaquette oriented in mu,nu plane | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static void traceDirPlaquette(LatticeComplex &plaq, | ||||||
|  |                                 const std::vector<GaugeMat> &U, const int mu, | ||||||
|  |                                 const int nu) { | ||||||
|  |     GaugeMat sp(U[0]._grid); | ||||||
|  |     dirPlaquette(sp, U, mu, nu); | ||||||
|  |     plaq = trace(sp); | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // sum over all planes of plaquette | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static void sitePlaquette(LatticeComplex &Plaq, | ||||||
|  |                             const std::vector<GaugeMat> &U) { | ||||||
|  |     LatticeComplex sitePlaq(U[0]._grid); | ||||||
|  |     Plaq = zero; | ||||||
|  |     for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) { | ||||||
|  |       for (int nu = 0; nu < mu; nu++) { | ||||||
|  |         traceDirPlaquette(sitePlaq, U, mu, nu); | ||||||
|  |         Plaq = Plaq + sitePlaq; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // sum over all x,y,z,t and over all planes of plaquette | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static Real sumPlaquette(const GaugeLorentz &Umu) { | ||||||
|  |     std::vector<GaugeMat> U(4, Umu._grid); | ||||||
|  |  | ||||||
|  |     for (int mu = 0; mu < Umu._grid->_ndimension; mu++) { | ||||||
|  |       U[mu] = PeekIndex<LorentzIndex>(Umu, mu); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     LatticeComplex Plaq(Umu._grid); | ||||||
|  |  | ||||||
|  |     sitePlaquette(Plaq, U); | ||||||
|  |  | ||||||
|  |     TComplex Tp = sum(Plaq); | ||||||
|  |     Complex p = TensorRemove(Tp); | ||||||
|  |     return p.real(); | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // average over all x,y,z,t and over all planes of plaquette | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static Real avgPlaquette(const GaugeLorentz &Umu) { | ||||||
|  |     int ndim = Umu._grid->_ndimension; | ||||||
|  |     Real sumplaq = sumPlaquette(Umu); | ||||||
|  |     Real vol = Umu._grid->gSites(); | ||||||
|  |     Real faces = (1.0 * ndim * (ndim - 1)) / 2.0; | ||||||
|  |     return sumplaq / vol / faces / Nc; // Nc dependent... FIXME | ||||||
|  |   } | ||||||
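|  |   // Editorial sketch of the normalisation above: with V = gSites() and | ||||||
|  |   // faces = Nd(Nd-1)/2 planes, avgPlaquette returns | ||||||
|  |   //   sum_{x, mu<nu} Re tr P_{mu nu}(x) / (V * faces * Nc), | ||||||
|  |   // which equals 1 on a unit (cold) gauge configuration. | ||||||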
|  |  | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // Wilson loop of size (R1, R2), oriented in mu,nu plane | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static void wilsonLoop(GaugeMat &wl, const std::vector<GaugeMat> &U, | ||||||
|  |                            const int Rmu, const int Rnu, | ||||||
|  |                            const int mu, const int nu) { | ||||||
|  |     wl = U[nu]; | ||||||
|  |  | ||||||
|  |     for(int i = 0; i < Rnu-1; i++){ | ||||||
|  |       wl = Gimpl::CovShiftForward(U[nu], nu, wl); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     for(int i = 0; i < Rmu; i++){ | ||||||
|  |       wl = Gimpl::CovShiftForward(U[mu], mu, wl); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     for(int i = 0; i < Rnu; i++){ | ||||||
|  |       wl = Gimpl::CovShiftBackward(U[nu], nu, wl); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     for(int i = 0; i < Rmu; i++){ | ||||||
|  |       wl = Gimpl::CovShiftBackward(U[mu], mu, wl); | ||||||
|  |     } | ||||||
|  |   } | ||||||
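The four loops append the legs of a closed Rmu x Rnu rectangle: Rnu links along nu (the initial wl = U[nu] plus Rnu-1 further shifts), Rmu along mu, Rnu back along nu, and Rmu back along mu. For Rmu = 2, Rnu = 1, for example, the path is 1 + 2 + 1 + 2 = 6 links and returns to the starting site, as any gauge-invariant loop must.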
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // trace of Wilson Loop oriented in mu,nu plane | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static void traceWilsonLoop(LatticeComplex &wl, | ||||||
|  |                                 const std::vector<GaugeMat> &U, | ||||||
|  |                                 const int Rmu, const int Rnu, | ||||||
|  |                                 const int mu, const int nu) { | ||||||
|  |     GaugeMat sp(U[0]._grid); | ||||||
|  |     wilsonLoop(sp, U, Rmu, Rnu, mu, nu); | ||||||
|  |     wl = trace(sp); | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // sum over all planes of Wilson loop | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static void siteWilsonLoop(LatticeComplex &Wl, | ||||||
|  |                             const std::vector<GaugeMat> &U, | ||||||
|  |                             const int R1, const int R2) { | ||||||
|  |     LatticeComplex siteWl(U[0]._grid); | ||||||
|  |     Wl = zero; | ||||||
|  |     for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) { | ||||||
|  |       for (int nu = 0; nu < mu; nu++) { | ||||||
|  |         traceWilsonLoop(siteWl, U, R1, R2, mu, nu); | ||||||
|  |         Wl = Wl + siteWl; | ||||||
|  |         traceWilsonLoop(siteWl, U, R2, R1, mu, nu); | ||||||
|  |         Wl = Wl + siteWl; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
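Note that each mu < nu plane is counted twice here, once with sides (R1, R2) and once with (R2, R1); avgWilsonLoop below accordingly normalises by ndim*(ndim-1) rather than the ndim*(ndim-1)/2 used for the plaquette.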
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // sum over planes of Wilson loop with length R1 | ||||||
|  |   // in the time direction | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static void siteTimelikeWilsonLoop(LatticeComplex &Wl, | ||||||
|  |                             const std::vector<GaugeMat> &U, | ||||||
|  |                             const int R1, const int R2) { | ||||||
|  |     LatticeComplex siteWl(U[0]._grid); | ||||||
|  |  | ||||||
|  |     int ndim = U[0]._grid->_ndimension; | ||||||
|  |  | ||||||
|  |     Wl = zero; | ||||||
|  |     for (int nu = 0; nu < ndim - 1; nu++) { | ||||||
|  |       traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu); | ||||||
|  |       Wl = Wl + siteWl; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // sum Wilson loop over all planes orthogonal to the time direction | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static void siteSpatialWilsonLoop(LatticeComplex &Wl, | ||||||
|  |                             const std::vector<GaugeMat> &U, | ||||||
|  |                             const int R1, const int R2) { | ||||||
|  |     LatticeComplex siteWl(U[0]._grid); | ||||||
|  |  | ||||||
|  |     Wl = zero; | ||||||
|  |     for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) { | ||||||
|  |       for (int nu = 0; nu < mu; nu++) { | ||||||
|  |         traceWilsonLoop(siteWl, U, R1, R2, mu, nu); | ||||||
|  |         Wl = Wl + siteWl; | ||||||
|  |         traceWilsonLoop(siteWl, U, R2, R1, mu, nu); | ||||||
|  |         Wl = Wl + siteWl; | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // sum over all x,y,z,t and over all planes of Wilson loop | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static Real sumWilsonLoop(const GaugeLorentz &Umu, | ||||||
|  |                             const int R1, const int R2) { | ||||||
|  |     std::vector<GaugeMat> U(4, Umu._grid); | ||||||
|  |  | ||||||
|  |     for (int mu = 0; mu < Umu._grid->_ndimension; mu++) { | ||||||
|  |       U[mu] = PeekIndex<LorentzIndex>(Umu, mu); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     LatticeComplex Wl(Umu._grid); | ||||||
|  |  | ||||||
|  |     siteWilsonLoop(Wl, U, R1, R2); | ||||||
|  |  | ||||||
|  |     TComplex Tp = sum(Wl); | ||||||
|  |     Complex p = TensorRemove(Tp); | ||||||
|  |     return p.real(); | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // sum over all x,y,z,t and over all planes of timelike Wilson loop | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu, | ||||||
|  |                             const int R1, const int R2) { | ||||||
|  |     std::vector<GaugeMat> U(4, Umu._grid); | ||||||
|  |  | ||||||
|  |     for (int mu = 0; mu < Umu._grid->_ndimension; mu++) { | ||||||
|  |       U[mu] = PeekIndex<LorentzIndex>(Umu, mu); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     LatticeComplex Wl(Umu._grid); | ||||||
|  |  | ||||||
|  |     siteTimelikeWilsonLoop(Wl, U, R1, R2); | ||||||
|  |  | ||||||
|  |     TComplex Tp = sum(Wl); | ||||||
|  |     Complex p = TensorRemove(Tp); | ||||||
|  |     return p.real(); | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // sum over all x,y,z,t and over all planes of spatial Wilson loop | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu, | ||||||
|  |                             const int R1, const int R2) { | ||||||
|  |     std::vector<GaugeMat> U(4, Umu._grid); | ||||||
|  |  | ||||||
|  |     for (int mu = 0; mu < Umu._grid->_ndimension; mu++) { | ||||||
|  |       U[mu] = PeekIndex<LorentzIndex>(Umu, mu); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     LatticeComplex Wl(Umu._grid); | ||||||
|  |  | ||||||
|  |     siteSpatialWilsonLoop(Wl, U, R1, R2); | ||||||
|  |  | ||||||
|  |     TComplex Tp = sum(Wl); | ||||||
|  |     Complex p = TensorRemove(Tp); | ||||||
|  |     return p.real(); | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // average over all x,y,z,t and over all planes of Wilson loop | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static Real avgWilsonLoop(const GaugeLorentz &Umu, | ||||||
|  |                             const int R1, const int R2) { | ||||||
|  |     int ndim = Umu._grid->_ndimension; | ||||||
|  |     Real sumWl = sumWilsonLoop(Umu, R1, R2); | ||||||
|  |     Real vol = Umu._grid->gSites(); | ||||||
|  |     Real faces = 1.0 * ndim * (ndim - 1); | ||||||
|  |     return sumWl / vol / faces / Nc; // Nc dependent... FIXME | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // average over all x,y,z,t and over all planes of timelike Wilson loop | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu, | ||||||
|  |                             const int R1, const int R2) { | ||||||
|  |     int ndim = Umu._grid->_ndimension; | ||||||
|  |     Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2); | ||||||
|  |     Real vol = Umu._grid->gSites(); | ||||||
|  |     Real faces = 1.0 * (ndim - 1); | ||||||
|  |     return sumWl / vol / faces / Nc; // Nc dependent... FIXME | ||||||
|  |   } | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   // average over all x,y,z,t and over all planes of spatial Wilson loop | ||||||
|  |   ////////////////////////////////////////////////// | ||||||
|  |   static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu, | ||||||
|  |                             const int R1, const int R2) { | ||||||
|  |     int ndim = Umu._grid->_ndimension; | ||||||
|  |     Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2); | ||||||
|  |     Real vol = Umu._grid->gSites(); | ||||||
|  |     Real faces = 1.0 * (ndim - 1) * (ndim - 2); | ||||||
|  |     return sumWl / vol / faces / Nc; // Nc dependent... FIXME | ||||||
|  |   } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | END_QEDFVOL_NAMESPACE | ||||||
|  |  | ||||||
|  | #endif // QEDFVOL_WILSONLOOPS_H | ||||||
extras/qed-fvol/qed-fvol.cc (new normal file, 88 lines)
							| @@ -0,0 +1,88 @@ | |||||||
|  | #include <Global.hpp> | ||||||
|  | #include <WilsonLoops.h> | ||||||
|  |  | ||||||
|  | using namespace Grid; | ||||||
|  | using namespace QCD; | ||||||
|  | using namespace QedFVol; | ||||||
|  |  | ||||||
|  | typedef PeriodicGaugeImpl<QedGimplR>    QedPeriodicGimplR; | ||||||
|  | typedef PhotonR::GaugeField             EmField; | ||||||
|  | typedef PhotonR::GaugeLinkField         EmComp; | ||||||
|  |  | ||||||
|  | const int NCONFIGS = 10; | ||||||
|  | const int NWILSON = 10; | ||||||
|  |  | ||||||
|  | int main(int argc, char *argv[]) | ||||||
|  | { | ||||||
|  |     // parse command line | ||||||
|  |     std::string parameterFileName; | ||||||
|  |      | ||||||
|  |     if (argc < 2) | ||||||
|  |     { | ||||||
|  |         std::cerr << "usage: " << argv[0] << " <parameter file> [Grid options]"; | ||||||
|  |         std::cerr << std::endl; | ||||||
|  |         std::exit(EXIT_FAILURE); | ||||||
|  |     } | ||||||
|  |     parameterFileName = argv[1]; | ||||||
|  |      | ||||||
|  |     // initialization | ||||||
|  |     Grid_init(&argc, &argv); | ||||||
|  |     QedFVolLogError.Active(GridLogError.isActive()); | ||||||
|  |     QedFVolLogWarning.Active(GridLogWarning.isActive()); | ||||||
|  |     QedFVolLogMessage.Active(GridLogMessage.isActive()); | ||||||
|  |     QedFVolLogIterative.Active(GridLogIterative.isActive()); | ||||||
|  |     QedFVolLogDebug.Active(GridLogDebug.isActive()); | ||||||
|  |     LOG(Message) << "Grid initialized" << std::endl; | ||||||
|  |      | ||||||
|  |     // QED stuff | ||||||
|  |     std::vector<int> latt_size   = GridDefaultLatt(); | ||||||
|  |     std::vector<int> simd_layout = GridDefaultSimd(4, vComplex::Nsimd()); | ||||||
|  |     std::vector<int> mpi_layout  = GridDefaultMpi(); | ||||||
|  |     GridCartesian    grid(latt_size,simd_layout,mpi_layout); | ||||||
|  |     GridParallelRNG  pRNG(&grid); | ||||||
|  |     PhotonR          photon(PhotonR::Gauge::feynman, | ||||||
|  |                             PhotonR::ZmScheme::qedL); | ||||||
|  |     EmField          a(&grid); | ||||||
|  |     EmField          expA(&grid); | ||||||
|  |  | ||||||
|  |     Complex imag_unit(0, 1); | ||||||
|  |  | ||||||
|  |     Real wlA; | ||||||
|  |     std::vector<Real> logWlAvg(NWILSON, 0.0), logWlTime(NWILSON, 0.0), logWlSpace(NWILSON, 0.0); | ||||||
|  |  | ||||||
|  |     pRNG.SeedRandomDevice(); | ||||||
|  |  | ||||||
|  |     LOG(Message) << "Wilson loop calculation beginning" << std::endl; | ||||||
|  |     for(int ic = 0; ic < NCONFIGS; ic++){ | ||||||
|  |         LOG(Message) << "Configuration " << ic <<std::endl; | ||||||
|  |         photon.StochasticField(a, pRNG); | ||||||
|  |  | ||||||
|  |         // Exponentiate photon field | ||||||
|  |         expA = exp(imag_unit*a); | ||||||
|  |  | ||||||
|  |         // Calculate Wilson loops; the factor of 3 below cancels the | ||||||
|  |         // 1/Nc (Nc = 3) normalisation applied inside avgWilsonLoop | ||||||
|  |         for(int iw=1; iw<=NWILSON; iw++){ | ||||||
|  |             wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, iw, iw) * 3; | ||||||
|  |             logWlAvg[iw-1] -= 2*log(wlA); | ||||||
|  |             wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, iw, iw) * 3; | ||||||
|  |             logWlTime[iw-1] -= 2*log(wlA); | ||||||
|  |             wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, iw, iw) * 3; | ||||||
|  |             logWlSpace[iw-1] -= 2*log(wlA); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     LOG(Message) << "Wilson loop calculation completed" << std::endl; | ||||||
|  |      | ||||||
|  |     // Print Wilson loop results | ||||||
|  |     for(int iw=1; iw<=NWILSON; iw++){ | ||||||
|  |         LOG(Message) << iw << 'x' << iw << " Wilson loop" << std::endl; | ||||||
|  |         LOG(Message) << "-2log(W) average: " << logWlAvg[iw-1]/NCONFIGS << std::endl; | ||||||
|  |         LOG(Message) << "-2log(W) timelike: " << logWlTime[iw-1]/NCONFIGS << std::endl; | ||||||
|  |         LOG(Message) << "-2log(W) spatial: " << logWlSpace[iw-1]/NCONFIGS << std::endl; | ||||||
|  |     } | ||||||
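The printed quantity is the configuration average of -2 log W(R, R), accumulated per configuration in the loop above, rather than -2 log of the ensemble-averaged loop.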
|  |  | ||||||
|  |     // epilogue | ||||||
|  |     LOG(Message) << "Grid is finalizing now" << std::endl; | ||||||
|  |     Grid_finalize(); | ||||||
|  |      | ||||||
|  |     return EXIT_SUCCESS; | ||||||
|  | } | ||||||
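A hypothetical invocation, assuming Grid's standard geometry flags (note the parameter file is demanded by the argument check but not actually read by this version of the program):

    ./qed-fvol params.xml --grid 16.16.16.16 --mpi 1.1.1.1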
| @@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3 | |||||||
|   extra_sources+=communicator/Communicator_base.cc |   extra_sources+=communicator/Communicator_base.cc | ||||||
| endif | endif | ||||||
|  |  | ||||||
| if BUILD_COMMS_MPI3L | if BUILD_COMMS_MPIT | ||||||
|   extra_sources+=communicator/Communicator_mpi3_leader.cc |   extra_sources+=communicator/Communicator_mpit.cc | ||||||
|   extra_sources+=communicator/Communicator_base.cc |   extra_sources+=communicator/Communicator_base.cc | ||||||
| endif | endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -199,7 +199,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) | |||||||
|  |  | ||||||
|   Linop.HermOp(X, AD); |   Linop.HermOp(X, AD); | ||||||
|   tmp = B - AD;   |   tmp = B - AD;   | ||||||
|  |   //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl; | ||||||
|   ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); |   ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp); | ||||||
|  |   //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl; | ||||||
|  |   //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl; | ||||||
|  |   //std::cout << GridLogMessage << " m_C " << m_C<<std::endl; | ||||||
|  |   //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl; | ||||||
|   D=Q; |   D=Q; | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl; |   std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl; | ||||||
| @@ -221,12 +226,14 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X) | |||||||
|     MatrixTimer.Start(); |     MatrixTimer.Start(); | ||||||
|     Linop.HermOp(D, Z);       |     Linop.HermOp(D, Z);       | ||||||
|     MatrixTimer.Stop(); |     MatrixTimer.Stop(); | ||||||
|  |     //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl; | ||||||
|  |  | ||||||
|     //4. M  = [D^dag Z]^{-1} |     //4. M  = [D^dag Z]^{-1} | ||||||
|     sliceInnerTimer.Start(); |     sliceInnerTimer.Start(); | ||||||
|     sliceInnerProductMatrix(m_DZ,D,Z,Orthog); |     sliceInnerProductMatrix(m_DZ,D,Z,Orthog); | ||||||
|     sliceInnerTimer.Stop(); |     sliceInnerTimer.Stop(); | ||||||
|     m_M       = m_DZ.inverse(); |     m_M       = m_DZ.inverse(); | ||||||
|  |     //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl; | ||||||
|      |      | ||||||
|     //5. X  = X + D MC |     //5. X  = X + D MC | ||||||
|     m_tmp     = m_M * m_C; |     m_tmp     = m_M * m_C; | ||||||
|   | |||||||
| @@ -11,7 +11,7 @@ int PointerCache::victim; | |||||||
|  |  | ||||||
| void *PointerCache::Insert(void *ptr,size_t bytes) { | void *PointerCache::Insert(void *ptr,size_t bytes) { | ||||||
|  |  | ||||||
|   if (bytes < 4096 ) return NULL; |   if (bytes < 4096 ) return ptr; | ||||||
|  |  | ||||||
| #ifdef GRID_OMP | #ifdef GRID_OMP | ||||||
|   assert(omp_in_parallel()==0); |   assert(omp_in_parallel()==0); | ||||||
|   | |||||||
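With this change, Insert() hands small (< 4096-byte) blocks straight back to the caller as the pointer to free, where returning NULL had left them unfreed; the pointer cache only ever takes ownership of large allocations. The deallocate() path in the following allocator hunk relies on this to release small blocks immediately.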
| @@ -92,18 +92,34 @@ public: | |||||||
|     size_type bytes = __n*sizeof(_Tp); |     size_type bytes = __n*sizeof(_Tp); | ||||||
|  |  | ||||||
|     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); |     _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes); | ||||||
|  |     //    if ( ptr != NULL )  | ||||||
|  |     //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl; | ||||||
|  |  | ||||||
|  |     ////////////////// | ||||||
|  |     // Hack: 2MB alignment; could be made a configure option, but | ||||||
|  |     // probably doesn't need configurability | ||||||
|  |     ////////////////// | ||||||
|  | //define GRID_ALLOC_ALIGN (128) | ||||||
|  | #define GRID_ALLOC_ALIGN (2*1024*1024) | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
|     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128); |     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN); | ||||||
| #else | #else | ||||||
|     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes); |     if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes); | ||||||
| #endif | #endif | ||||||
|  |     //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl; | ||||||
|  |     // First touch optimise in threaded loop | ||||||
|  |     uint8_t *cp = (uint8_t *)ptr; | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | #pragma omp parallel for | ||||||
|  | #endif | ||||||
|  |     for(size_type n=0;n<bytes;n+=4096){ | ||||||
|  |       cp[n]=0; | ||||||
|  |     } | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void deallocate(pointer __p, size_type __n) {  |   void deallocate(pointer __p, size_type __n) {  | ||||||
|     size_type bytes = __n * sizeof(_Tp); |     size_type bytes = __n * sizeof(_Tp); | ||||||
|  |  | ||||||
|     pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); |     pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes); | ||||||
|  |  | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
| @@ -182,10 +198,17 @@ public: | |||||||
|   pointer allocate(size_type __n, const void* _p= 0)  |   pointer allocate(size_type __n, const void* _p= 0)  | ||||||
|   { |   { | ||||||
| #ifdef HAVE_MM_MALLOC_H | #ifdef HAVE_MM_MALLOC_H | ||||||
|     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128); |     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN); | ||||||
| #else | #else | ||||||
|     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp)); |     _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp)); | ||||||
| #endif | #endif | ||||||
|  |     size_type bytes = __n*sizeof(_Tp); | ||||||
|  |     uint8_t *cp = (uint8_t *)ptr; | ||||||
|  |     // One touch per 4k page; static OMP schedule so later loops with the same schedule touch the same pages | ||||||
|  | #pragma omp parallel for schedule(static) | ||||||
|  |     for(size_type n=0;n<bytes;n+=4096){ | ||||||
|  |       cp[n]=0; | ||||||
|  |     } | ||||||
|     return ptr; |     return ptr; | ||||||
|   } |   } | ||||||
|   void deallocate(pointer __p, size_type) {  |   void deallocate(pointer __p, size_type) {  | ||||||
|   | |||||||
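The allocator hunks above combine 2MB alignment with first-touch page placement. A minimal, self-contained sketch of the same idea (assumptions: POSIX posix_memalign and OpenMP; the function name is illustrative, not Grid's):

    #include <cstdint>
    #include <cstdlib>

    // Touch one byte per 4 KiB page inside a static OpenMP schedule so each
    // page is physically committed on the NUMA node of the thread that will
    // later process the same index range.
    void *numa_first_touch_alloc(std::size_t bytes) {
      void *ptr = nullptr;
      if (posix_memalign(&ptr, 2 * 1024 * 1024, bytes) != 0) return nullptr;
      uint8_t *cp = static_cast<uint8_t *>(ptr);
    #pragma omp parallel for schedule(static)
      for (std::size_t n = 0; n < bytes; n += 4096) {
        cp[n] = 0; // first touch: page lands on the touching thread's node
      }
      return ptr;
    }

Under Linux's default first-touch NUMA policy, later loops that use the same static schedule then read mostly node-local memory.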
| @@ -185,17 +185,18 @@ public: | |||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|     void show_decomposition(){ |     void show_decomposition(){ | ||||||
|       std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl; |       std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl; | ||||||
|       std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl; |       std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl; | ||||||
|       std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl; |       std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl; | ||||||
|       std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl; |       std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl; | ||||||
|       std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl; |       std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl; | ||||||
|       std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl; |       std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl; | ||||||
|       std::cout << GridLogMessage << "iSites             : " << _isites << std::endl; |       std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl; | ||||||
|       std::cout << GridLogMessage << "oSites             : " << _osites << std::endl; |       std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl; | ||||||
|       std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;         |       std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl; | ||||||
|       std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl; |       std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;         | ||||||
|       std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;              |       std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl; | ||||||
|  |       std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;              | ||||||
|     }  |     }  | ||||||
|  |  | ||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -62,77 +62,81 @@ public: | |||||||
|       return shift; |       return shift; | ||||||
|     } |     } | ||||||
|     GridCartesian(const std::vector<int> &dimensions, |     GridCartesian(const std::vector<int> &dimensions, | ||||||
| 		  const std::vector<int> &simd_layout, |                   const std::vector<int> &simd_layout, | ||||||
| 		  const std::vector<int> &processor_grid |                   const std::vector<int> &processor_grid) : GridBase(processor_grid) | ||||||
| 		  ) : GridBase(processor_grid) |  | ||||||
|     { |     { | ||||||
|         /////////////////////// |       /////////////////////// | ||||||
|         // Grid information |       // Grid information | ||||||
|         /////////////////////// |       /////////////////////// | ||||||
|         _ndimension = dimensions.size(); |       _ndimension = dimensions.size(); | ||||||
|  |  | ||||||
|         _fdimensions.resize(_ndimension); |       _fdimensions.resize(_ndimension); | ||||||
|         _gdimensions.resize(_ndimension); |       _gdimensions.resize(_ndimension); | ||||||
|         _ldimensions.resize(_ndimension); |       _ldimensions.resize(_ndimension); | ||||||
|         _rdimensions.resize(_ndimension); |       _rdimensions.resize(_ndimension); | ||||||
|         _simd_layout.resize(_ndimension); |       _simd_layout.resize(_ndimension); | ||||||
| 	_lstart.resize(_ndimension); |       _lstart.resize(_ndimension); | ||||||
| 	_lend.resize(_ndimension); |       _lend.resize(_ndimension); | ||||||
|  |  | ||||||
|         _ostride.resize(_ndimension); |       _ostride.resize(_ndimension); | ||||||
|         _istride.resize(_ndimension); |       _istride.resize(_ndimension); | ||||||
|  |  | ||||||
|         _fsites = _gsites = _osites = _isites = 1; |       _fsites = _gsites = _osites = _isites = 1; | ||||||
|  |  | ||||||
|         for(int d=0;d<_ndimension;d++){ |       for (int d = 0; d < _ndimension; d++) | ||||||
| 	  _fdimensions[d] = dimensions[d]; // Global dimensions |       { | ||||||
| 	  _gdimensions[d] = _fdimensions[d]; // Global dimensions |         _fdimensions[d] = dimensions[d];   // Global dimensions | ||||||
| 	  _simd_layout[d] = simd_layout[d]; |         _gdimensions[d] = _fdimensions[d]; // Global dimensions | ||||||
| 	  _fsites = _fsites * _fdimensions[d]; |         _simd_layout[d] = simd_layout[d]; | ||||||
| 	  _gsites = _gsites * _gdimensions[d]; |         _fsites = _fsites * _fdimensions[d]; | ||||||
|  |         _gsites = _gsites * _gdimensions[d]; | ||||||
|  |  | ||||||
| 	  //FIXME check for exact division |         // Use a reduced simd grid | ||||||
|  |         _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions | ||||||
|  |         assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); | ||||||
|  |  | ||||||
| 	  // Use a reduced simd grid |         _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition | ||||||
| 	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions |         assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); | ||||||
| 	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition |  | ||||||
| 	  _lstart[d]     = _processor_coor[d]*_ldimensions[d]; |  | ||||||
| 	  _lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1; |  | ||||||
| 	  _osites  *= _rdimensions[d]; |  | ||||||
| 	  _isites  *= _simd_layout[d]; |  | ||||||
|  |  | ||||||
| 	  // Addressing support |         _lstart[d] = _processor_coor[d] * _ldimensions[d]; | ||||||
| 	  if ( d==0 ) { |         _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; | ||||||
| 	    _ostride[d] = 1; |         _osites *= _rdimensions[d]; | ||||||
| 	    _istride[d] = 1; |         _isites *= _simd_layout[d]; | ||||||
| 	  } else { |  | ||||||
| 	    _ostride[d] = _ostride[d-1]*_rdimensions[d-1]; |         // Addressing support | ||||||
| 	    _istride[d] = _istride[d-1]*_simd_layout[d-1]; |         if (d == 0) | ||||||
| 	  } |         { | ||||||
|  |           _ostride[d] = 1; | ||||||
|  |           _istride[d] = 1; | ||||||
|         } |         } | ||||||
|          |         else | ||||||
|         /////////////////////// |         { | ||||||
|         // subplane information |           _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; | ||||||
|         /////////////////////// |           _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; | ||||||
|         _slice_block.resize(_ndimension); |  | ||||||
|         _slice_stride.resize(_ndimension); |  | ||||||
|         _slice_nblock.resize(_ndimension); |  | ||||||
|              |  | ||||||
|         int block =1; |  | ||||||
|         int nblock=1; |  | ||||||
|         for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d]; |  | ||||||
|              |  | ||||||
|         for(int d=0;d<_ndimension;d++){ |  | ||||||
|             nblock/=_rdimensions[d]; |  | ||||||
|             _slice_block[d] =block; |  | ||||||
|             _slice_stride[d]=_ostride[d]*_rdimensions[d]; |  | ||||||
|             _slice_nblock[d]=nblock; |  | ||||||
|             block = block*_rdimensions[d]; |  | ||||||
|         } |         } | ||||||
|  |       } | ||||||
|  |  | ||||||
|  |       /////////////////////// | ||||||
|  |       // subplane information | ||||||
|  |       /////////////////////// | ||||||
|  |       _slice_block.resize(_ndimension); | ||||||
|  |       _slice_stride.resize(_ndimension); | ||||||
|  |       _slice_nblock.resize(_ndimension); | ||||||
|  |  | ||||||
|  |       int block = 1; | ||||||
|  |       int nblock = 1; | ||||||
|  |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |         nblock *= _rdimensions[d]; | ||||||
|  |  | ||||||
|  |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |       { | ||||||
|  |         nblock /= _rdimensions[d]; | ||||||
|  |         _slice_block[d] = block; | ||||||
|  |         _slice_stride[d] = _ostride[d] * _rdimensions[d]; | ||||||
|  |         _slice_nblock[d] = nblock; | ||||||
|  |         block = block * _rdimensions[d]; | ||||||
|  |       } | ||||||
|     }; |     }; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|   | |||||||
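As a worked example of the decomposition arithmetic above (toy numbers, not from the source):

    global  _gdimensions = {16,16,16,16}
    procs   _processors  = { 2, 2, 2, 2}  ->  _ldimensions = {8,8,8,8}
    simd    _simd_layout = { 1, 2, 2, 2}  ->  _rdimensions = {8,4,4,4}

    _ostride = {1, 8, 32, 128}   (cumulative products of _rdimensions)
    _istride = {1, 1,  2,   4}   (cumulative products of _simd_layout)
    _osites  = 8*4*4*4 = 512     _isites = 1*2*2*2 = 8

The new asserts retire the old "FIXME check for exact division": both divisions (global by processors, local by SIMD layout) must now be exact.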
| @@ -131,20 +131,20 @@ public: | |||||||
|       Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0); |       Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0); | ||||||
|     } |     } | ||||||
|     void Init(const std::vector<int> &dimensions, |     void Init(const std::vector<int> &dimensions, | ||||||
| 	      const std::vector<int> &simd_layout, |               const std::vector<int> &simd_layout, | ||||||
| 	      const std::vector<int> &processor_grid, |               const std::vector<int> &processor_grid, | ||||||
| 	      const std::vector<int> &checker_dim_mask, |               const std::vector<int> &checker_dim_mask, | ||||||
| 	      int checker_dim) |               int checker_dim) | ||||||
|     { |     { | ||||||
|     /////////////////////// |       /////////////////////// | ||||||
|     // Grid information |       // Grid information | ||||||
|     /////////////////////// |       /////////////////////// | ||||||
|       _checker_dim = checker_dim; |       _checker_dim = checker_dim; | ||||||
|       assert(checker_dim_mask[checker_dim]==1); |       assert(checker_dim_mask[checker_dim] == 1); | ||||||
|       _ndimension = dimensions.size(); |       _ndimension = dimensions.size(); | ||||||
|       assert(checker_dim_mask.size()==_ndimension); |       assert(checker_dim_mask.size() == _ndimension); | ||||||
|       assert(processor_grid.size()==_ndimension); |       assert(processor_grid.size() == _ndimension); | ||||||
|       assert(simd_layout.size()==_ndimension); |       assert(simd_layout.size() == _ndimension); | ||||||
|  |  | ||||||
|       _fdimensions.resize(_ndimension); |       _fdimensions.resize(_ndimension); | ||||||
|       _gdimensions.resize(_ndimension); |       _gdimensions.resize(_ndimension); | ||||||
| @@ -159,47 +159,55 @@ public: | |||||||
|  |  | ||||||
|       _fsites = _gsites = _osites = _isites = 1; |       _fsites = _gsites = _osites = _isites = 1; | ||||||
|  |  | ||||||
|       _checker_dim_mask=checker_dim_mask; |       _checker_dim_mask = checker_dim_mask; | ||||||
|  |  | ||||||
|       for(int d=0;d<_ndimension;d++){ |       for (int d = 0; d < _ndimension; d++) | ||||||
| 	_fdimensions[d] = dimensions[d]; |       { | ||||||
| 	_gdimensions[d] = _fdimensions[d]; |         _fdimensions[d] = dimensions[d]; | ||||||
| 	_fsites = _fsites * _fdimensions[d]; |         _gdimensions[d] = _fdimensions[d]; | ||||||
| 	_gsites = _gsites * _gdimensions[d]; |         _fsites = _fsites * _fdimensions[d]; | ||||||
|  |         _gsites = _gsites * _gdimensions[d]; | ||||||
|  |  | ||||||
| 	if (d==_checker_dim) { |         if (d == _checker_dim) | ||||||
| 	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard |         { | ||||||
| 	} |           assert((_gdimensions[d] & 0x1) == 0); | ||||||
| 	_ldimensions[d] = _gdimensions[d]/_processors[d]; |           _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard | ||||||
| 	_lstart[d]     = _processor_coor[d]*_ldimensions[d]; |         } | ||||||
| 	_lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1; |         _ldimensions[d] = _gdimensions[d] / _processors[d]; | ||||||
|  |         assert(_ldimensions[d] * _processors[d] == _gdimensions[d]); | ||||||
|  |         _lstart[d] = _processor_coor[d] * _ldimensions[d]; | ||||||
|  |         _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1; | ||||||
|  |  | ||||||
| 	// Use a reduced simd grid |         // Use a reduced simd grid | ||||||
| 	_simd_layout[d] = simd_layout[d]; |         _simd_layout[d] = simd_layout[d]; | ||||||
| 	_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; |         _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // exact division enforced by the assert below | ||||||
| 	assert(_rdimensions[d]>0); |         assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]); | ||||||
|  |         assert(_rdimensions[d] > 0); | ||||||
|  |  | ||||||
| 	// all elements of a simd vector must have same checkerboard. |         // all elements of a simd vector must have same checkerboard. | ||||||
| 	// If Ls vectorised, this must still be the case; e.g. dwf rb5d |         // If Ls vectorised, this must still be the case; e.g. dwf rb5d | ||||||
| 	if ( _simd_layout[d]>1 ) { |         if (_simd_layout[d] > 1) | ||||||
| 	  if ( checker_dim_mask[d] ) {  |         { | ||||||
| 	    assert( (_rdimensions[d]&0x1) == 0 ); |           if (checker_dim_mask[d]) | ||||||
| 	  } |           { | ||||||
| 	} |             assert((_rdimensions[d] & 0x1) == 0); | ||||||
|  |           } | ||||||
| 	_osites *= _rdimensions[d]; |         } | ||||||
| 	_isites *= _simd_layout[d]; |  | ||||||
|          |  | ||||||
| 	// Addressing support |  | ||||||
| 	if ( d==0 ) { |  | ||||||
| 	  _ostride[d] = 1; |  | ||||||
| 	  _istride[d] = 1; |  | ||||||
| 	} else { |  | ||||||
| 	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1]; |  | ||||||
| 	  _istride[d] = _istride[d-1]*_simd_layout[d-1]; |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
|  |         _osites *= _rdimensions[d]; | ||||||
|  |         _isites *= _simd_layout[d]; | ||||||
|  |  | ||||||
|  |         // Addressing support | ||||||
|  |         if (d == 0) | ||||||
|  |         { | ||||||
|  |           _ostride[d] = 1; | ||||||
|  |           _istride[d] = 1; | ||||||
|  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |           _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1]; | ||||||
|  |           _istride[d] = _istride[d - 1] * _simd_layout[d - 1]; | ||||||
|  |         } | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       //////////////////////////////////////////////////////////////////////////////////////////// |       //////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -209,58 +217,69 @@ public: | |||||||
|       _slice_stride.resize(_ndimension); |       _slice_stride.resize(_ndimension); | ||||||
|       _slice_nblock.resize(_ndimension); |       _slice_nblock.resize(_ndimension); | ||||||
|  |  | ||||||
|       int block =1; |       int block = 1; | ||||||
|       int nblock=1; |       int nblock = 1; | ||||||
|       for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d]; |       for (int d = 0; d < _ndimension; d++) | ||||||
|  |         nblock *= _rdimensions[d]; | ||||||
|  |  | ||||||
|       for(int d=0;d<_ndimension;d++){ |       for (int d = 0; d < _ndimension; d++) | ||||||
| 	nblock/=_rdimensions[d]; |       { | ||||||
| 	_slice_block[d] =block; |         nblock /= _rdimensions[d]; | ||||||
| 	_slice_stride[d]=_ostride[d]*_rdimensions[d]; |         _slice_block[d] = block; | ||||||
| 	_slice_nblock[d]=nblock; |         _slice_stride[d] = _ostride[d] * _rdimensions[d]; | ||||||
| 	block = block*_rdimensions[d]; |         _slice_nblock[d] = nblock; | ||||||
|  |         block = block * _rdimensions[d]; | ||||||
|       } |       } | ||||||
|  |  | ||||||
|       //////////////////////////////////////////////// |       //////////////////////////////////////////////// | ||||||
|       // Create a checkerboard lookup table |       // Create a checkerboard lookup table | ||||||
|       //////////////////////////////////////////////// |       //////////////////////////////////////////////// | ||||||
|       int rvol = 1; |       int rvol = 1; | ||||||
|       for(int d=0;d<_ndimension;d++){ |       for (int d = 0; d < _ndimension; d++) | ||||||
| 	rvol=rvol * _rdimensions[d]; |       { | ||||||
|  |         rvol = rvol * _rdimensions[d]; | ||||||
|       } |       } | ||||||
|       _checker_board.resize(rvol); |       _checker_board.resize(rvol); | ||||||
|       for(int osite=0;osite<_osites;osite++){ |       for (int osite = 0; osite < _osites; osite++) | ||||||
| 	_checker_board[osite] = CheckerBoardFromOindex (osite); |       { | ||||||
|  |         _checker_board[osite] = CheckerBoardFromOindex(osite); | ||||||
|       } |       } | ||||||
|        |  | ||||||
|     }; |     }; | ||||||
| protected: |  | ||||||
|  |   protected: | ||||||
|     virtual int oIndex(std::vector<int> &coor) |     virtual int oIndex(std::vector<int> &coor) | ||||||
|     { |     { | ||||||
|       int idx=0; |       int idx = 0; | ||||||
|       for(int d=0;d<_ndimension;d++) { |       for (int d = 0; d < _ndimension; d++) | ||||||
| 	if( d==_checker_dim ) { |       { | ||||||
| 	  idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]); |         if (d == _checker_dim) | ||||||
| 	} else { |         { | ||||||
| 	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]); |           idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]); | ||||||
| 	} |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |           idx += _ostride[d] * (coor[d] % _rdimensions[d]); | ||||||
|  |         } | ||||||
|       } |       } | ||||||
|       return idx; |       return idx; | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     virtual int iIndex(std::vector<int> &lcoor) |     virtual int iIndex(std::vector<int> &lcoor) | ||||||
|     { |     { | ||||||
|         int idx=0; |       int idx = 0; | ||||||
|         for(int d=0;d<_ndimension;d++) { |       for (int d = 0; d < _ndimension; d++) | ||||||
| 	  if( d==_checker_dim ) { |       { | ||||||
| 	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d])); |         if (d == _checker_dim) | ||||||
| 	  } else {  |         { | ||||||
| 	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]); |           idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d])); | ||||||
| 	  } |         } | ||||||
| 	} |         else | ||||||
|         return idx; |         { | ||||||
|  |           idx += _istride[d] * (lcoor[d] / _rdimensions[d]); | ||||||
|  |         } | ||||||
|  |       } | ||||||
|  |       return idx; | ||||||
|     } |     } | ||||||
| }; | }; | ||||||
|  |  | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|   | |||||||
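In the checkerboarded indexing above, the coor[d]/2 in oIndex folds the two parities of the halved dimension onto a single reduced site, and the lcoor[d]/(2*_rdimensions[d]) in iIndex applies the same halving at the SIMD-lane level; the newly added assert((_gdimensions[d] & 0x1) == 0) guarantees the dimension is even before it is halved.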
| @@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| #include <Grid/GridCore.h> | #include <Grid/GridCore.h> | ||||||
|  | #include <fcntl.h> | ||||||
|  | #include <unistd.h> | ||||||
|  | #include <limits.h> | ||||||
|  | #include <sys/mman.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| @@ -34,7 +38,10 @@ namespace Grid { | |||||||
| /////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////// | ||||||
| void *              CartesianCommunicator::ShmCommBuf; | void *              CartesianCommunicator::ShmCommBuf; | ||||||
| uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;  | uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024;  | ||||||
| CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; | CartesianCommunicator::CommunicatorPolicy_t   | ||||||
|  | CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent; | ||||||
|  | int CartesianCommunicator::nCommThreads = -1; | ||||||
|  | int CartesianCommunicator::Hugepages = 0; | ||||||
|  |  | ||||||
| ///////////////////////////////// | ///////////////////////////////// | ||||||
| // Alloc, free shmem region | // Alloc, free shmem region | ||||||
| @@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | |||||||
|   GlobalSumVector((double *)c,2*N); |   GlobalSumVector((double *)c,2*N); | ||||||
| } | } | ||||||
|  |  | ||||||
| #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L) | #if !defined( GRID_COMMS_MPI3)  | ||||||
|  |  | ||||||
| int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();}; | int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();}; | ||||||
| int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();}; | int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();}; | ||||||
|  | #endif | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | #if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT) | ||||||
| 						       void *xmit, | double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | ||||||
| 						       int xmit_to_rank, | 						     int xmit_to_rank, | ||||||
| 						       void *recv, | 						     void *recv, | ||||||
| 						       int recv_from_rank, | 						     int recv_from_rank, | ||||||
| 						       int bytes) | 						     int bytes, int dir) | ||||||
| { | { | ||||||
|  |   std::vector<CommsRequest_t> list; | ||||||
|  |   // Discard the "dir" | ||||||
|  |   SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||||
|  |   SendToRecvFromComplete(list); | ||||||
|  |   return 2.0*bytes; | ||||||
|  | } | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 							 void *xmit, | ||||||
|  | 							 int xmit_to_rank, | ||||||
|  | 							 void *recv, | ||||||
|  | 							 int recv_from_rank, | ||||||
|  | 							 int bytes, int dir) | ||||||
|  | { | ||||||
|  |   // Discard the "dir" | ||||||
|   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); |   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||||
|   return 2.0*bytes; |   return 2.0*bytes; | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
| { | { | ||||||
|   SendToRecvFromComplete(waitall); |   SendToRecvFromComplete(waitall); | ||||||
| } | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #if !defined( GRID_COMMS_MPI3)  | ||||||
|  |  | ||||||
| void CartesianCommunicator::StencilBarrier(void){}; | void CartesianCommunicator::StencilBarrier(void){}; | ||||||
|  |  | ||||||
| commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; | commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; | ||||||
| @@ -121,8 +146,22 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { | |||||||
|   return NULL; |   return NULL; | ||||||
| } | } | ||||||
| void CartesianCommunicator::ShmInitGeneric(void){ | void CartesianCommunicator::ShmInitGeneric(void){ | ||||||
|  | #if 1 | ||||||
|  |  | ||||||
|  |   int mmap_flag = MAP_SHARED | MAP_ANONYMOUS; | ||||||
|  | #ifdef MAP_HUGETLB | ||||||
|  |   if ( Hugepages ) mmap_flag |= MAP_HUGETLB; | ||||||
|  | #endif | ||||||
|  |   ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);  | ||||||
|  |   if (ShmCommBuf == (void *)MAP_FAILED) { | ||||||
|  |     perror("mmap failed "); | ||||||
|  |     exit(EXIT_FAILURE);   | ||||||
|  |   } | ||||||
|  | #else  | ||||||
|   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); |   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); | ||||||
|   ShmCommBuf=(void *)&ShmBufStorageVector[0]; |   ShmCommBuf=(void *)&ShmBufStorageVector[0]; | ||||||
|  | #endif | ||||||
|  |   bzero(ShmCommBuf,MAX_MPI_SHM_BYTES); | ||||||
| } | } | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
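One practical caveat on the MAP_HUGETLB path above: an anonymous huge-page mmap fails (typically with ENOMEM) unless explicit huge pages have been reserved on the system, e.g. via vm.nr_hugepages, which is why the call is guarded by perror/exit and only enabled by the Hugepages flag.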
| @@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifdef GRID_COMMS_MPI3 | #ifdef GRID_COMMS_MPI3 | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_COMMS_MPI3L | #ifdef GRID_COMMS_MPIT | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_COMMS_SHMEM | #ifdef GRID_COMMS_SHMEM | ||||||
| @@ -50,12 +50,24 @@ namespace Grid { | |||||||
| class CartesianCommunicator { | class CartesianCommunicator { | ||||||
|   public:     |   public:     | ||||||
|  |  | ||||||
|   // 65536 ranks per node adequate for now |  | ||||||
|  |   //////////////////////////////////////////// | ||||||
|  |   // Isend/Irecv/Wait, or Sendrecv blocking | ||||||
|  |   //////////////////////////////////////////// | ||||||
|  |   enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; | ||||||
|  |   static CommunicatorPolicy_t CommunicatorPolicy; | ||||||
|  |   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////// | ||||||
|  |   // Up to 65536 ranks per node adequate for now | ||||||
|   // 128MB shared memory for comms enough for 48^4 local vol comms |   // 128MB shared memory for comms enough for 48^4 local vol comms | ||||||
|   // Give external control (command line override?) of this |   // Give external control (command line override?) of this | ||||||
|  |   /////////////////////////////////////////// | ||||||
|   static const int      MAXLOG2RANKSPERNODE = 16;             |   static const int MAXLOG2RANKSPERNODE = 16;             | ||||||
|   static uint64_t MAX_MPI_SHM_BYTES; |   static uint64_t  MAX_MPI_SHM_BYTES; | ||||||
|  |   static int       nCommThreads; | ||||||
|  |   // use explicit huge pages | ||||||
|  |   static int       Hugepages; | ||||||
|  |  | ||||||
|   // Communicator should know nothing of the physics grid, only processor grid. |   // Communicator should know nothing of the physics grid, only processor grid. | ||||||
|   int              _Nprocessors;     // How many in all |   int              _Nprocessors;     // How many in all | ||||||
| @@ -64,14 +76,18 @@ class CartesianCommunicator { | |||||||
|   std::vector<int> _processor_coor;  // linear processor coordinate |   std::vector<int> _processor_coor;  // linear processor coordinate | ||||||
|   unsigned long _ndimension; |   unsigned long _ndimension; | ||||||
|  |  | ||||||
| #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L) | #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT) | ||||||
|   static MPI_Comm communicator_world; |   static MPI_Comm communicator_world; | ||||||
|          MPI_Comm communicator; |  | ||||||
|  |   MPI_Comm              communicator; | ||||||
|  |   std::vector<MPI_Comm> communicator_halo; | ||||||
|  |  | ||||||
|   typedef MPI_Request CommsRequest_t; |   typedef MPI_Request CommsRequest_t; | ||||||
| #else  | #else  | ||||||
|   typedef int CommsRequest_t; |   typedef int CommsRequest_t; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////// | ||||||
|   // Helper functionality for SHM Windows common to all other impls |   // Helper functionality for SHM Windows common to all other impls | ||||||
|   //////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////// | ||||||
| @@ -117,10 +133,6 @@ class CartesianCommunicator { | |||||||
|   ///////////////////////////////// |   ///////////////////////////////// | ||||||
|   static void * ShmCommBuf; |   static void * ShmCommBuf; | ||||||
|  |  | ||||||
|   // Isend/Irecv/Wait, or Sendrecv blocking |  | ||||||
|   enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential }; |  | ||||||
|   static CommunicatorPolicy_t CommunicatorPolicy; |  | ||||||
|   static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; } |  | ||||||
|    |    | ||||||
|   size_t heap_top; |   size_t heap_top; | ||||||
|   size_t heap_bytes; |   size_t heap_bytes; | ||||||
| @@ -211,14 +223,21 @@ class CartesianCommunicator { | |||||||
|    |    | ||||||
|   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); |   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||||
|  |  | ||||||
|   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, |   double StencilSendToRecvFrom(void *xmit, | ||||||
| 				  void *xmit, | 			       int xmit_to_rank, | ||||||
| 				  int xmit_to_rank, | 			       void *recv, | ||||||
| 				  void *recv, | 			       int recv_from_rank, | ||||||
| 				  int recv_from_rank, | 			       int bytes,int dir); | ||||||
| 				  int bytes); |  | ||||||
|  |  | ||||||
|   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); |   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 				    void *xmit, | ||||||
|  | 				    int xmit_to_rank, | ||||||
|  | 				    void *recv, | ||||||
|  | 				    int recv_from_rank, | ||||||
|  | 				    int bytes,int dir); | ||||||
|  |    | ||||||
|  |    | ||||||
|  |   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i); | ||||||
|   void StencilBarrier(void); |   void StencilBarrier(void); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -37,11 +37,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <sys/ipc.h> | #include <sys/ipc.h> | ||||||
| #include <sys/shm.h> | #include <sys/shm.h> | ||||||
| #include <sys/mman.h> | #include <sys/mman.h> | ||||||
| //#include <zlib.h> | #include <zlib.h> | ||||||
| #ifndef SHM_HUGETLB | #ifdef HAVE_NUMAIF_H | ||||||
| #define SHM_HUGETLB 04000 | #include <numaif.h> | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| /////////////////////////////////////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -211,9 +212,34 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      } |       if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      } | ||||||
|       ftruncate(fd, size); |       ftruncate(fd, size); | ||||||
|        |        | ||||||
|       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); |       int mmap_flag = MAP_SHARED; | ||||||
|  | #ifdef MAP_HUGETLB | ||||||
|  |       if (Hugepages) mmap_flag |= MAP_HUGETLB; | ||||||
|  | #endif | ||||||
|  |       void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); | ||||||
|  |  | ||||||
|       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    } |       if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    } | ||||||
|       assert(((uint64_t)ptr&0x3F)==0); |       assert(((uint64_t)ptr&0x3F)==0); | ||||||
|  |  | ||||||
|  | // Experiments: try to force numa domain on the shm segment if we have numaif.h | ||||||
|  | #if 0 | ||||||
|  | //#ifdef HAVE_NUMAIF_H | ||||||
|  | 	int status; | ||||||
|  | 	int flags=MPOL_MF_MOVE; | ||||||
|  | #ifdef KNL | ||||||
|  | 	int nodes=1; // numa domain == MCDRAM | ||||||
|  | 	// Find out if in SNC2,SNC4 mode ? | ||||||
|  | #else | ||||||
|  | 	int nodes=r; // numa domain == MPI ID | ||||||
|  | #endif | ||||||
|  | 	unsigned long count=1; | ||||||
|  | 	for(uint64_t page=0;page<size;page+=4096){ | ||||||
|  | 	  void *pages = (void *) ( page + (uint64_t)ptr ); | ||||||
|  | 	  uint64_t *cow_it = (uint64_t *)pages;	*cow_it = 1; | ||||||
|  | 	  ierr= move_pages(0,count, &pages,&nodes,&status,flags); | ||||||
|  | 	  if (ierr && (page==0)) perror("numa relocate command failed"); | ||||||
|  | 	} | ||||||
|  | #endif | ||||||
|       ShmCommBufs[r] =ptr; |       ShmCommBufs[r] =ptr; | ||||||
|        |        | ||||||
|     } |     } | ||||||
| @@ -244,7 +270,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|     for(int r=0;r<ShmSize;r++){ |     for(int r=0;r<ShmSize;r++){ | ||||||
|       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; |       size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES; | ||||||
|       key_t key   = 0x4545 + r; |       key_t key   = 0x4545 + r; | ||||||
|       if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { |       int flags = IPC_CREAT | SHM_R | SHM_W; | ||||||
|  | #ifdef SHM_HUGETLB | ||||||
|  |       flags|=SHM_HUGETLB; | ||||||
|  | #endif | ||||||
|  |       if ((shmids[r]= shmget(key,size, flags)) < 0) { | ||||||
| 	int errsv = errno; | 	int errsv = errno; | ||||||
| 	printf("Errno %d\n",errsv); | 	printf("Errno %d\n",errsv); | ||||||
| 	perror("shmget"); | 	perror("shmget"); | ||||||
| @@ -375,8 +405,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
| {  | {  | ||||||
|   int ierr; |   int ierr; | ||||||
|   communicator=communicator_world; |   communicator=communicator_world; | ||||||
|  |  | ||||||
|   _ndimension = processors.size(); |   _ndimension = processors.size(); | ||||||
|  |  | ||||||
|  |   communicator_halo.resize (2*_ndimension); | ||||||
|  |   for(int i=0;i<_ndimension*2;i++){ | ||||||
|  |     MPI_Comm_dup(communicator,&communicator_halo[i]); | ||||||
|  |   } | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
|   // Assert power of two shm_size. |   // Assert power of two shm_size. | ||||||
|   //////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////// | ||||||
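Duplicating one communicator per (dimension, direction) pair is what lets the MPIT target drive each halo direction independently. A self-contained sketch of the pattern (the function name is illustrative; assumes MPI is already initialised):

    #include <mpi.h>
    #include <vector>

    // MPI_Comm_dup is collective, so every rank must execute the loop.
    std::vector<MPI_Comm> make_halo_comms(MPI_Comm base, int ndimension) {
      std::vector<MPI_Comm> halo(2 * ndimension);
      for (int i = 0; i < 2 * ndimension; i++) {
        MPI_Comm_dup(base, &halo[i]);
      }
      return halo;
    }

Each duplicated communicator carries its own message-matching context, so transfers in different directions cannot mis-match even when they use identical ranks and tags.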
| @@ -599,13 +635,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, | ||||||
| 						       void *xmit, | 						     int dest, | ||||||
| 						       int dest, | 						     void *recv, | ||||||
| 						       void *recv, | 						     int from, | ||||||
| 						       int from, | 						     int bytes,int dir) | ||||||
| 						       int bytes) |  | ||||||
| { | { | ||||||
|  |   std::vector<CommsRequest_t> list; | ||||||
|  |   double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir); | ||||||
|  |   StencilSendToRecvFromComplete(list,dir); | ||||||
|  |   return offbytes; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 							 void *xmit, | ||||||
|  | 							 int dest, | ||||||
|  | 							 void *recv, | ||||||
|  | 							 int from, | ||||||
|  | 							 int bytes,int dir) | ||||||
|  | { | ||||||
|  |   assert(dir < communicator_halo.size()); | ||||||
|  |  | ||||||
|   MPI_Request xrq; |   MPI_Request xrq; | ||||||
|   MPI_Request rrq; |   MPI_Request rrq; | ||||||
|  |  | ||||||
| @@ -624,26 +674,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques | |||||||
|   gfrom = MPI_UNDEFINED; |   gfrom = MPI_UNDEFINED; | ||||||
| #endif | #endif | ||||||
|   if ( gfrom ==MPI_UNDEFINED) { |   if ( gfrom ==MPI_UNDEFINED) { | ||||||
|     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); |     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq); | ||||||
|     assert(ierr==0); |     assert(ierr==0); | ||||||
|     list.push_back(rrq); |     list.push_back(rrq); | ||||||
|     off_node_bytes+=bytes; |     off_node_bytes+=bytes; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( gdest == MPI_UNDEFINED ) { |   if ( gdest == MPI_UNDEFINED ) { | ||||||
|     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); |     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq); | ||||||
|     assert(ierr==0); |     assert(ierr==0); | ||||||
|     list.push_back(xrq); |     list.push_back(xrq); | ||||||
|     off_node_bytes+=bytes; |     off_node_bytes+=bytes; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {  |   if ( CommunicatorPolicy == CommunicatorPolicySequential ) {  | ||||||
|     this->StencilSendToRecvFromComplete(list); |     this->StencilSendToRecvFromComplete(list,dir); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   return off_node_bytes; |   return off_node_bytes; | ||||||
| } | } | ||||||
| void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
| { | { | ||||||
|   SendToRecvFromComplete(waitall); |   SendToRecvFromComplete(waitall); | ||||||
| } | } | ||||||
|   | |||||||
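The refactored StencilSendToRecvFrom above is simply the blocking composition of the Begin/Complete pair: it owns the request list, posts both transfers on the direction's communicator, and waits. The same exchange, sketched as a free function over raw MPI (hypothetical name, not Grid's CommsRequest_t machinery):

    #include <mpi.h>

    // Receive is tagged with the sender's rank ('from'); the send is tagged
    // with our own rank ('myrank') so it matches the peer's receive.
    double stencil_send_recv(MPI_Comm comm_dir, void *xmit, int dest,
                             void *recv, int from, int bytes, int myrank) {
      MPI_Request req[2];
      MPI_Irecv(recv, bytes, MPI_CHAR, from, from,   comm_dir, &req[0]);
      MPI_Isend(xmit, bytes, MPI_CHAR, dest, myrank, comm_dir, &req[1]);
      MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
      return 2.0 * bytes;              // bytes moved, for bandwidth accounting
    }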
							
								
								
									
286  lib/communicator/Communicator_mpit.cc  (new file)
							| @@ -0,0 +1,286 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/communicator/Communicator_mpit.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include <Grid/GridCore.h> | ||||||
|  | #include <Grid/GridQCDcore.h> | ||||||
|  | #include <Grid/qcd/action/ActionCore.h> | ||||||
|  | #include <mpi.h> | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of the Cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | MPI_Comm CartesianCommunicator::communicator_world; | ||||||
|  |  | ||||||
|  | // Should error check all MPI calls. | ||||||
|  | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|  |   int flag; | ||||||
|  |   int provided; | ||||||
|  |   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||||
|  |   if ( !flag ) { | ||||||
|  |     MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided); | ||||||
|  |     if ( provided != MPI_THREAD_MULTIPLE ) { | ||||||
|  |       QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|  |   ShmInitGeneric(); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
|  | { | ||||||
|  |   _ndimension = processors.size(); | ||||||
|  |   std::vector<int> periodic(_ndimension,1); | ||||||
|  |  | ||||||
|  |   _Nprocessors=1; | ||||||
|  |   _processors = processors; | ||||||
|  |   _processor_coor.resize(_ndimension); | ||||||
|  |    | ||||||
|  |   MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); | ||||||
|  |   MPI_Comm_rank(communicator,&_processor); | ||||||
|  |   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); | ||||||
|  |  | ||||||
|  |   for(int i=0;i<_ndimension;i++){ | ||||||
|  |     _Nprocessors*=_processors[i]; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   communicator_halo.resize (2*_ndimension); | ||||||
|  |   for(int i=0;i<_ndimension*2;i++){ | ||||||
|  |     MPI_Comm_dup(communicator,&communicator_halo[i]); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   int Size;  | ||||||
|  |   MPI_Comm_size(communicator,&Size); | ||||||
|  |    | ||||||
|  |   assert(Size==_Nprocessors); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(uint64_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalXOR(uint32_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalXOR(uint64_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(float &f){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(double &d) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   int rank; | ||||||
|  |   int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank); | ||||||
|  |   assert(ierr==0); | ||||||
|  |   return rank; | ||||||
|  | } | ||||||
|  | void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   coor.resize(_ndimension); | ||||||
|  |   int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||||
|  | 					   int dest, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int from, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   std::vector<CommsRequest_t> reqs(0); | ||||||
|  |   SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); | ||||||
|  |   SendToRecvFromComplete(reqs); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int sender, | ||||||
|  | 					   int receiver, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   MPI_Status stat; | ||||||
|  |   assert(sender != receiver); | ||||||
|  |   int tag = sender; | ||||||
|  |   if ( _processor == sender ) { | ||||||
|  |     MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); | ||||||
|  |   } | ||||||
|  |   if ( _processor == receiver ) {  | ||||||
|  |     MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 						void *xmit, | ||||||
|  | 						int dest, | ||||||
|  | 						void *recv, | ||||||
|  | 						int from, | ||||||
|  | 						int bytes) | ||||||
|  | { | ||||||
|  |   int myrank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  | ||||||
|  |     MPI_Request xrq; | ||||||
|  |     MPI_Request rrq; | ||||||
|  |  | ||||||
|  |     ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |     ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |      | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(xrq); | ||||||
|  |     list.push_back(rrq); | ||||||
|  |   } else {  | ||||||
|  |     // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||||
|  |     ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, | ||||||
|  | 		      recv,bytes,MPI_CHAR,from, from, | ||||||
|  | 		      communicator,MPI_STATUS_IGNORE); | ||||||
|  |     assert(ierr==0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
|  | { | ||||||
|  |   if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {  | ||||||
|  |     int nreq=list.size(); | ||||||
|  |     std::vector<MPI_Status> status(nreq); | ||||||
|  |     int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|  |     assert(ierr==0); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Barrier(void) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Barrier(communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Bcast(data, | ||||||
|  | 		     bytes, | ||||||
|  | 		     MPI_BYTE, | ||||||
|  | 		     root, | ||||||
|  | 		     communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|  |   // Should only be used before Grid Init has completed. | ||||||
|  |   // Check for this? | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|  | int CartesianCommunicator::RankWorld(void){  | ||||||
|  |   int r;  | ||||||
|  |   MPI_Comm_rank(communicator_world,&r); | ||||||
|  |   return r; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr= MPI_Bcast(data, | ||||||
|  | 		      bytes, | ||||||
|  | 		      MPI_BYTE, | ||||||
|  | 		      root, | ||||||
|  | 		      communicator_world); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 							 void *xmit, | ||||||
|  | 							 int xmit_to_rank, | ||||||
|  | 							 void *recv, | ||||||
|  | 							 int recv_from_rank, | ||||||
|  | 							 int bytes,int dir) | ||||||
|  | { | ||||||
|  |   int myrank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   assert(dir < communicator_halo.size()); | ||||||
|  |    | ||||||
|  |   //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl; | ||||||
|  |   // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||||
|  |   MPI_Request req[2]; | ||||||
|  |   MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]); | ||||||
|  |   MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]); | ||||||
|  |  | ||||||
|  |   list.push_back(req[0]); | ||||||
|  |   list.push_back(req[1]); | ||||||
|  |   return 2.0*bytes; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) | ||||||
|  | {  | ||||||
|  |   int nreq=waitall.size(); | ||||||
|  |   MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE); | ||||||
|  | } | ||||||
|  | double CartesianCommunicator::StencilSendToRecvFrom(void *xmit, | ||||||
|  | 						    int xmit_to_rank, | ||||||
|  | 						    void *recv, | ||||||
|  | 						    int recv_from_rank, | ||||||
|  | 						    int bytes,int dir) | ||||||
|  | { | ||||||
|  |   int myrank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   assert(dir < communicator_halo.size()); | ||||||
|  |    | ||||||
|  |   //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl; | ||||||
|  |   // Give the CPU to MPI immediately; can use threads to overlap optionally | ||||||
|  |   MPI_Request req[2]; | ||||||
|  |   MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]); | ||||||
|  |   MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]); | ||||||
|  |   MPI_Waitall(2, req, MPI_STATUSES_IGNORE); | ||||||
|  |   return 2.0*bytes; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
| @@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/cshift/Cshift_mpi.h> | #include <Grid/cshift/Cshift_mpi.h> | ||||||
| #endif  | #endif  | ||||||
|  |  | ||||||
| #ifdef GRID_COMMS_MPI3L | #ifdef GRID_COMMS_MPIT | ||||||
| #include <Grid/cshift/Cshift_mpi.h> | #include <Grid/cshift/Cshift_mpi.h> | ||||||
| #endif  | #endif  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice | |||||||
|   } |   } | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | /* | ||||||
| inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) | inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog) | ||||||
| { | { | ||||||
|   int NN    = BlockSolverGrid->_ndimension; |   int NN    = BlockSolverGrid->_ndimension; | ||||||
| @@ -387,6 +388,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or | |||||||
|   } |   } | ||||||
|   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);  |   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);  | ||||||
| } | } | ||||||
|  | */ | ||||||
|  |  | ||||||
| template<class vobj> | template<class vobj> | ||||||
| static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)  | static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)  | ||||||
| @@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice | |||||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; |   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||||
|  |  | ||||||
|   GridBase *FullGrid  = X._grid; |   GridBase *FullGrid  = X._grid; | ||||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); |   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||||
|  |  | ||||||
|   Lattice<vobj> Xslice(SliceGrid); |   //  Lattice<vobj> Xslice(SliceGrid); | ||||||
|   Lattice<vobj> Rslice(SliceGrid); |   //  Lattice<vobj> Rslice(SliceGrid); | ||||||
|  |  | ||||||
|   assert( FullGrid->_simd_layout[Orthog]==1); |   assert( FullGrid->_simd_layout[Orthog]==1); | ||||||
|   int nh =  FullGrid->_ndimension; |   int nh =  FullGrid->_ndimension; | ||||||
|   int nl = SliceGrid->_ndimension; |   //  int nl = SliceGrid->_ndimension; | ||||||
|  |   int nl = nh-1; | ||||||
|  |  | ||||||
|   //FIXME package in a convenient iterator |   //FIXME package in a convenient iterator | ||||||
|   //Should loop over a plane orthogonal to direction "Orthog" |   //Should loop over a plane orthogonal to direction "Orthog" | ||||||
| @@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice< | |||||||
|   int Nblock = X._grid->GlobalDimensions()[Orthog]; |   int Nblock = X._grid->GlobalDimensions()[Orthog]; | ||||||
|  |  | ||||||
|   GridBase *FullGrid  = X._grid; |   GridBase *FullGrid  = X._grid; | ||||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); |   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||||
|  |   //  Lattice<vobj> Xslice(SliceGrid); | ||||||
|   Lattice<vobj> Xslice(SliceGrid); |   //  Lattice<vobj> Rslice(SliceGrid); | ||||||
|   Lattice<vobj> Rslice(SliceGrid); |  | ||||||
|  |  | ||||||
|   assert( FullGrid->_simd_layout[Orthog]==1); |   assert( FullGrid->_simd_layout[Orthog]==1); | ||||||
|   int nh =  FullGrid->_ndimension; |   int nh =  FullGrid->_ndimension; | ||||||
|   int nl = SliceGrid->_ndimension; |   //  int nl = SliceGrid->_ndimension; | ||||||
|  |   int nl=1; | ||||||
|  |  | ||||||
|   //FIXME package in a convenient iterator |   //FIXME package in a convenient iterator | ||||||
|   //Should loop over a plane orthogonal to direction "Orthog" |   //Should loop over a plane orthogonal to direction "Orthog" | ||||||
| @@ -498,18 +501,19 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> | |||||||
|   typedef typename vobj::vector_type vector_type; |   typedef typename vobj::vector_type vector_type; | ||||||
|    |    | ||||||
|   GridBase *FullGrid  = lhs._grid; |   GridBase *FullGrid  = lhs._grid; | ||||||
|   GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); |   //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog); | ||||||
|    |    | ||||||
|   int Nblock = FullGrid->GlobalDimensions()[Orthog]; |   int Nblock = FullGrid->GlobalDimensions()[Orthog]; | ||||||
|    |    | ||||||
|   Lattice<vobj> Lslice(SliceGrid); |   //  Lattice<vobj> Lslice(SliceGrid); | ||||||
|   Lattice<vobj> Rslice(SliceGrid); |   //  Lattice<vobj> Rslice(SliceGrid); | ||||||
|    |    | ||||||
|   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); |   mat = Eigen::MatrixXcd::Zero(Nblock,Nblock); | ||||||
|  |  | ||||||
|   assert( FullGrid->_simd_layout[Orthog]==1); |   assert( FullGrid->_simd_layout[Orthog]==1); | ||||||
|   int nh =  FullGrid->_ndimension; |   int nh =  FullGrid->_ndimension; | ||||||
|   int nl = SliceGrid->_ndimension; |   //  int nl = SliceGrid->_ndimension; | ||||||
|  |   int nl = nh-1; | ||||||
|  |  | ||||||
|   //FIXME package in a convenient iterator |   //FIXME package in a convenient iterator | ||||||
|   //Should loop over a plane orthogonal to direction "Orthog" |   //Should loop over a plane orthogonal to direction "Orthog" | ||||||
| @@ -540,7 +544,8 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> | |||||||
|       for(int i=0;i<Nblock;i++){ |       for(int i=0;i<Nblock;i++){ | ||||||
|       for(int j=0;j<Nblock;j++){ |       for(int j=0;j<Nblock;j++){ | ||||||
| 	auto tmp = innerProduct(Left[i],Right[j]); | 	auto tmp = innerProduct(Left[i],Right[j]); | ||||||
| 	vector_typeD rtmp = TensorRemove(tmp); | 	//	vector_typeD rtmp = TensorRemove(tmp); | ||||||
|  | 	auto rtmp = TensorRemove(tmp); | ||||||
| 	mat_thread(i,j) += Reduce(rtmp); | 	mat_thread(i,j) += Reduce(rtmp); | ||||||
|       }} |       }} | ||||||
|     }} |     }} | ||||||
| @@ -549,6 +554,14 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> | |||||||
|       mat += mat_thread; |       mat += mat_thread; | ||||||
|     }   |     }   | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |   for(int i=0;i<Nblock;i++){ | ||||||
|  |   for(int j=0;j<Nblock;j++){ | ||||||
|  |     ComplexD sum = mat(i,j); | ||||||
|  |     FullGrid->GlobalSum(sum); | ||||||
|  |     mat(i,j)=sum; | ||||||
|  |   }} | ||||||
|  |  | ||||||
|   return; |   return; | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
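The final hunk above adds an explicit global reduction of the Nblock x Nblock inner-product matrix, because the rewritten loops only accumulate rank-local contributions. The diff sums entry by entry through GlobalSum; an equivalent flat-buffer reduction, sketched with a hypothetical helper outside Grid's API, would be:

    #include <Eigen/Dense>
    #include <mpi.h>

    // Sum every element of a complex matrix across all ranks in one call.
    // std::complex<double> is layout-compatible with a pair of doubles.
    void globalSumMatrix(Eigen::MatrixXcd &mat, MPI_Comm comm) {
      MPI_Allreduce(MPI_IN_PLACE, reinterpret_cast<double *>(mat.data()),
                    2 * (int)mat.size(), MPI_DOUBLE, MPI_SUM, comm);
    }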
| @@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) { | |||||||
| //////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////// | ||||||
| void Grid_quiesce_nodes(void) { | void Grid_quiesce_nodes(void) { | ||||||
|   int me = 0; |   int me = 0; | ||||||
| #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L) | #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) | ||||||
|   MPI_Comm_rank(MPI_COMM_WORLD, &me); |   MPI_Comm_rank(MPI_COMM_WORLD, &me); | ||||||
| #endif | #endif | ||||||
| #ifdef GRID_COMMS_SHMEM | #ifdef GRID_COMMS_SHMEM | ||||||
|   | |||||||
| @@ -29,7 +29,7 @@ | |||||||
| #ifndef GRID_BINARY_IO_H | #ifndef GRID_BINARY_IO_H | ||||||
| #define GRID_BINARY_IO_H | #define GRID_BINARY_IO_H | ||||||
|  |  | ||||||
| #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)  | #if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)  | ||||||
| #define USE_MPI_IO | #define USE_MPI_IO | ||||||
| #else | #else | ||||||
| #undef  USE_MPI_IO | #undef  USE_MPI_IO | ||||||
| @@ -99,34 +99,38 @@ class BinaryIO { | |||||||
|     NerscChecksum(grid,scalardata,nersc_csum); |     NerscChecksum(grid,scalardata,nersc_csum); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum) |   template <class fobj> | ||||||
|  |   static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum) | ||||||
|   { |   { | ||||||
|     const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); |     const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t); | ||||||
|  |  | ||||||
|  |     uint64_t lsites = grid->lSites(); | ||||||
|     uint64_t lsites              =grid->lSites(); |     if (fbuf.size() == 1) | ||||||
|     if (fbuf.size()==1) { |     { | ||||||
|       lsites=1; |       lsites = 1; | ||||||
|     } |     } | ||||||
|  |  | ||||||
| #pragma omp parallel |     #pragma omp parallel | ||||||
|     { |     { | ||||||
|       uint32_t nersc_csum_thr=0; |       uint32_t nersc_csum_thr = 0; | ||||||
|  |  | ||||||
| #pragma omp for |       #pragma omp for | ||||||
|       for(uint64_t local_site=0;local_site<lsites;local_site++){ |       for (uint64_t local_site = 0; local_site < lsites; local_site++) | ||||||
| 	uint32_t * site_buf = (uint32_t *)&fbuf[local_site]; |       { | ||||||
| 	for(uint64_t j=0;j<size32;j++){ |         uint32_t *site_buf = (uint32_t *)&fbuf[local_site]; | ||||||
| 	  nersc_csum_thr=nersc_csum_thr+site_buf[j]; |         for (uint64_t j = 0; j < size32; j++) | ||||||
| 	} |         { | ||||||
|  |           nersc_csum_thr = nersc_csum_thr + site_buf[j]; | ||||||
|  |         } | ||||||
|       } |       } | ||||||
|  |  | ||||||
| #pragma omp critical |       #pragma omp critical | ||||||
|       { |       { | ||||||
| 	nersc_csum  += nersc_csum_thr; |         nersc_csum += nersc_csum_thr; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) |   template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb) | ||||||
|   { |   { | ||||||
|     const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); |     const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t); | ||||||
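The reformatted NerscChecksum above is a plain mod-2^32 sum over the buffer viewed as 32-bit words, accumulated per OpenMP thread and merged in a critical section. Distilled into a standalone sketch (hypothetical function name):

    #include <cstdint>
    #include <vector>

    uint32_t nersc_checksum(const std::vector<uint32_t> &words) {
      uint32_t csum = 0;
    #pragma omp parallel
      {
        uint32_t csum_thr = 0;                       // per-thread partial sum
    #pragma omp for
        for (long j = 0; j < (long)words.size(); j++) csum_thr += words[j];
    #pragma omp critical
        csum += csum_thr;                            // serialised merge
      }
      return csum; // rank-local: the caller still needs a GlobalSum across ranks
    }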
| @@ -363,17 +367,21 @@ class BinaryIO { | |||||||
| 	assert(0); | 	assert(0); | ||||||
| #endif | #endif | ||||||
|       } else { |       } else { | ||||||
| 	std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : " |         std::cout << GridLogMessage << "C++ read I/O " << file << " : " | ||||||
| 		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl; |                   << iodata.size() * sizeof(fobj) << " bytes" << std::endl; | ||||||
| 	std::ifstream fin; |         std::ifstream fin; | ||||||
| 	fin.open(file,std::ios::binary|std::ios::in); |         fin.open(file, std::ios::binary | std::ios::in); | ||||||
| 	if ( control & BINARYIO_MASTER_APPEND )  { |         if (control & BINARYIO_MASTER_APPEND) | ||||||
| 	  fin.seekg(-sizeof(fobj),fin.end); |         { | ||||||
| 	} else {  |           fin.seekg(-sizeof(fobj), fin.end); | ||||||
| 	  fin.seekg(offset+myrank*lsites*sizeof(fobj)); |         } | ||||||
| 	} |         else | ||||||
| 	fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0); |         { | ||||||
| 	fin.close(); |           fin.seekg(offset + myrank * lsites * sizeof(fobj)); | ||||||
|  |         } | ||||||
|  |         fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj)); | ||||||
|  |         assert(fin.fail() == 0); | ||||||
|  |         fin.close(); | ||||||
|       } |       } | ||||||
|       timer.Stop(); |       timer.Stop(); | ||||||
|  |  | ||||||
| @@ -405,30 +413,78 @@ class BinaryIO { | |||||||
|       timer.Start(); |       timer.Start(); | ||||||
|       if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { |       if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) { | ||||||
| #ifdef USE_MPI_IO | #ifdef USE_MPI_IO | ||||||
| 	std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl; |         std::cout << GridLogMessage << "MPI write I/O " << file << std::endl; | ||||||
| 	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0); |         ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh); | ||||||
| 	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0); |         std::cout << GridLogMessage << "Checking for errors" << std::endl; | ||||||
| 	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0); |         if (ierr != MPI_SUCCESS) | ||||||
| 	MPI_File_close(&fh); |         { | ||||||
| 	MPI_Type_free(&fileArray); |           char error_string[BUFSIZ]; | ||||||
| 	MPI_Type_free(&localArray); |           int length_of_error_string, error_class; | ||||||
|  |  | ||||||
|  |           MPI_Error_class(ierr, &error_class); | ||||||
|  |           MPI_Error_string(error_class, error_string, &length_of_error_string); | ||||||
|  |           fprintf(stderr, "%3d: %s\n", myrank, error_string); | ||||||
|  |           MPI_Error_string(ierr, error_string, &length_of_error_string); | ||||||
|  |           fprintf(stderr, "%3d: %s\n", myrank, error_string); | ||||||
|  |           MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl; | ||||||
|  |         ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); | ||||||
|  |         assert(ierr == 0); | ||||||
|  |  | ||||||
|  |         std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl; | ||||||
|  |         ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); | ||||||
|  |         assert(ierr == 0); | ||||||
|  |  | ||||||
|  |         MPI_File_close(&fh); | ||||||
|  |         MPI_Type_free(&fileArray); | ||||||
|  |         MPI_Type_free(&localArray); | ||||||
| #else  | #else  | ||||||
| 	assert(0); | 	assert(0); | ||||||
| #endif | #endif | ||||||
|       } else {  |       } else {  | ||||||
| 	std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in); |          | ||||||
| 	std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : " | 	std::ofstream fout;  | ||||||
| 		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl; |   fout.exceptions ( std::fstream::failbit | std::fstream::badbit ); | ||||||
| 	if ( control & BINARYIO_MASTER_APPEND )  { |   try { | ||||||
|  |     fout.open(file,std::ios::binary|std::ios::out|std::ios::in); | ||||||
|  |   } catch (const std::fstream::failure& exc) { | ||||||
|  |     std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl; | ||||||
|  |     std::cout << GridLogError << "Exception description: " << exc.what() << std::endl; | ||||||
|  |     std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl; | ||||||
|  |     #ifdef USE_MPI_IO | ||||||
|  |     MPI_Abort(MPI_COMM_WORLD,1); | ||||||
|  |     #else | ||||||
|  |     exit(1); | ||||||
|  |     #endif | ||||||
|  |   } | ||||||
|  | 	std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : " | ||||||
|  | 		        << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl; | ||||||
|  | 	 | ||||||
|  |   if ( control & BINARYIO_MASTER_APPEND )  { | ||||||
| 	  fout.seekp(0,fout.end); | 	  fout.seekp(0,fout.end); | ||||||
| 	} else { | 	} else { | ||||||
| 	  fout.seekp(offset+myrank*lsites*sizeof(fobj)); | 	  fout.seekp(offset+myrank*lsites*sizeof(fobj)); | ||||||
| 	} | 	} | ||||||
| 	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0); |    | ||||||
|  |   try { | ||||||
|  |   	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0); | ||||||
|  |   } | ||||||
|  |   catch (const std::fstream::failure& exc) { | ||||||
|  |     std::cout << "Exception in writing file " << file << std::endl; | ||||||
|  |     std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl; | ||||||
|  |     #ifdef USE_MPI_IO | ||||||
|  |     MPI_Abort(MPI_COMM_WORLD,1); | ||||||
|  |     #else | ||||||
|  |     exit(1); | ||||||
|  |     #endif | ||||||
|  |   } | ||||||
|  |  | ||||||
| 	fout.close(); | 	fout.close(); | ||||||
|       } |   } | ||||||
|       timer.Stop(); |   timer.Stop(); | ||||||
|     } |   } | ||||||
|  |  | ||||||
|     std::cout<<GridLogMessage<<"IOobject: "; |     std::cout<<GridLogMessage<<"IOobject: "; | ||||||
|     if ( control & BINARYIO_READ) std::cout << " read  "; |     if ( control & BINARYIO_READ) std::cout << " read  "; | ||||||
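The expanded MPI_File_open handling above decodes a failing return code twice, once for the broad error class and once for the specific message, before aborting every rank. As a compact sketch with a hypothetical helper name:

    #include <cstdio>
    #include <mpi.h>

    void report_mpi_error(int ierr, int myrank) {
      char error_string[MPI_MAX_ERROR_STRING];
      int length, error_class;
      MPI_Error_class(ierr, &error_class);                 // broad category
      MPI_Error_string(error_class, error_string, &length);
      fprintf(stderr, "%3d: %s\n", myrank, error_string);
      MPI_Error_string(ierr, error_string, &length);       // specific message
      fprintf(stderr, "%3d: %s\n", myrank, error_string);
      MPI_Abort(MPI_COMM_WORLD, 1);
    }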
| @@ -442,11 +498,14 @@ class BinaryIO { | |||||||
|     ////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////// | ||||||
|     // Safety check |     // Safety check | ||||||
|     ////////////////////////////////////////////////////////////////////////////// |     ////////////////////////////////////////////////////////////////////////////// | ||||||
|     grid->Barrier(); |     // if the data size is 1 we do not want to sum over the MPI ranks | ||||||
|     grid->GlobalSum(nersc_csum); |     if (iodata.size() != 1){ | ||||||
|     grid->GlobalXOR(scidac_csuma); |       grid->Barrier(); | ||||||
|     grid->GlobalXOR(scidac_csumb); |       grid->GlobalSum(nersc_csum); | ||||||
|     grid->Barrier(); |       grid->GlobalXOR(scidac_csuma); | ||||||
|  |       grid->GlobalXOR(scidac_csumb); | ||||||
|  |       grid->Barrier(); | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   ///////////////////////////////////////////////////////////////////////////// |   ///////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -546,9 +605,9 @@ class BinaryIO { | |||||||
|     int gsites = grid->gSites(); |     int gsites = grid->gSites(); | ||||||
|     int lsites = grid->lSites(); |     int lsites = grid->lSites(); | ||||||
|  |  | ||||||
|     uint32_t nersc_csum_tmp; |     uint32_t nersc_csum_tmp   = 0; | ||||||
|     uint32_t scidac_csuma_tmp; |     uint32_t scidac_csuma_tmp = 0; | ||||||
|     uint32_t scidac_csumb_tmp; |     uint32_t scidac_csumb_tmp = 0; | ||||||
|  |  | ||||||
|     GridStopWatch timer; |     GridStopWatch timer; | ||||||
|  |  | ||||||
|   | |||||||
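Earlier in the same IOobject hunk, the C++ write path now arms the stream's exception mask so a failed open or write throws immediately, and the handler aborts the whole job rather than letting one rank drift into the collective checksum with bad data. A simplified standalone version of that pattern (the real code also opens with std::ios::in to update in place):

    #include <fstream>
    #include <iostream>
    #include <string>
    #include <mpi.h>

    void write_or_abort(const std::string &file, const char *buf, size_t bytes) {
      std::ofstream fout;
      fout.exceptions(std::fstream::failbit | std::fstream::badbit); // throw on error
      try {
        fout.open(file, std::ios::binary | std::ios::out);
        fout.write(buf, bytes);
      } catch (const std::fstream::failure &exc) {
        std::cerr << "Error writing " << file << ": " << exc.what() << std::endl;
        MPI_Abort(MPI_COMM_WORLD, 1);  // never strand the other ranks
      }
    }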
| @@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan | |||||||
|   { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS}, |   { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES          ,  "CPUCYCLES.........." , INSTRUCTIONS}, | ||||||
|   { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   }, |   { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS        ,  "INSTRUCTIONS......." , CPUCYCLES   }, | ||||||
|     // 4 |     // 4 | ||||||
| #ifdef AVX512 | #ifdef KNL | ||||||
|     { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    }, |     { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES    }, | ||||||
|     { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  }, |     { PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS  }, | ||||||
|     { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    }, |     { PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS    }, | ||||||
|   | |||||||
| @@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|   for(int i=0; i < Ls; i++){ |   for(int i=0; i < Ls; i++){ | ||||||
|     as[i] = 1.0; |     as[i] = 1.0; | ||||||
|     omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code |     omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code | ||||||
|     //    assert(fabs(omega[i])>0.0); |     assert(omega[i]!=Coeff_t(0.0)); | ||||||
|     bs[i] = 0.5*(bpc/omega[i] + bmc); |     bs[i] = 0.5*(bpc/omega[i] + bmc); | ||||||
|     cs[i] = 0.5*(bpc/omega[i] - bmc); |     cs[i] = 0.5*(bpc/omega[i] - bmc); | ||||||
|   } |   } | ||||||
| @@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|    |    | ||||||
|   for(int i=0;i<Ls;i++){ |   for(int i=0;i<Ls;i++){ | ||||||
|     bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);      |     bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);      | ||||||
|     //    assert(fabs(bee[i])>0.0); |     assert(bee[i]!=Coeff_t(0.0)); | ||||||
|     cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); |     cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5)); | ||||||
|     beo[i]=as[i]*bs[i]; |     beo[i]=as[i]*bs[i]; | ||||||
|     ceo[i]=-as[i]*cs[i]; |     ceo[i]=-as[i]*cs[i]; | ||||||
| @@ -456,10 +456,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|      |      | ||||||
|     if ( i < Ls-1 ) { |     if ( i < Ls-1 ) { | ||||||
|  |  | ||||||
|  |       assert(bee[i]!=Coeff_t(0.0)); | ||||||
|  |       assert(bee[0]!=Coeff_t(0.0)); | ||||||
|  |        | ||||||
|       lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column |       lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column | ||||||
|        |        | ||||||
|       leem[i]=mass*cee[Ls-1]/bee[0]; |       leem[i]=mass*cee[Ls-1]/bee[0]; | ||||||
|       for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1]; |       for(int j=0;j<i;j++) { | ||||||
|  | 	assert(bee[j+1]!=Coeff_t(0.0)); | ||||||
|  | 	leem[i]*= aee[j]/bee[j+1]; | ||||||
|  |       } | ||||||
|        |        | ||||||
|       uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row |       uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row | ||||||
|        |        | ||||||
| @@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co | |||||||
|   {  |   {  | ||||||
|     Coeff_t delta_d=mass*cee[Ls-1]; |     Coeff_t delta_d=mass*cee[Ls-1]; | ||||||
|     for(int j=0;j<Ls-1;j++) { |     for(int j=0;j<Ls-1;j++) { | ||||||
|       //      assert(fabs(bee[j])>0.0); |       assert(bee[j] != Coeff_t(0.0)); | ||||||
|       delta_d *= cee[j]/bee[j]; |       delta_d *= cee[j]/bee[j]; | ||||||
|     } |     } | ||||||
|     dee[Ls-1] += delta_d; |     dee[Ls-1] += delta_d; | ||||||
|   | |||||||
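The asserts above replace the commented-out fabs() checks because Coeff_t is complex for some Cayley variants, where fabs() on the coefficient is not meaningful; comparing against Coeff_t(0.0) compiles and behaves correctly for both real and complex coefficient types. An illustrative guard in the same spirit (assumed template, not Grid code):

    #include <cassert>
    #include <complex>

    // Works unchanged for Coeff_t = double or std::complex<double>.
    template <class Coeff_t>
    Coeff_t safe_ratio(const Coeff_t &num, const Coeff_t &den) {
      assert(den != Coeff_t(0.0));  // guards every division in the LDU setup
      return num / den;
    }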
| @@ -237,4 +237,11 @@ typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermion | |||||||
|  |  | ||||||
|   }} |   }} | ||||||
|  |  | ||||||
|  | //////////////////// | ||||||
|  | // Scalar QED actions | ||||||
|  | // TODO: this needs to move to another header after rename to Fermion.h | ||||||
|  | //////////////////// | ||||||
|  | #include <Grid/qcd/action/scalar/Scalar.h> | ||||||
|  | #include <Grid/qcd/action/gauge/Photon.h> | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr | |||||||
| { | { | ||||||
|   Compressor compressor; |   Compressor compressor; | ||||||
|   int LLs = in._grid->_rdimensions[0]; |   int LLs = in._grid->_rdimensions[0]; | ||||||
|   st.HaloExchange(in,compressor); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |   DhopTotalTime -= usecond(); | ||||||
|  |   DhopCommTime -= usecond(); | ||||||
|  |   st.HaloExchange(in,compressor); | ||||||
|  |   DhopCommTime += usecond(); | ||||||
|  |    | ||||||
|  |   DhopComputeTime -= usecond(); | ||||||
|   // Dhop takes the 4d grid from U, and makes a 5d index for fermion |   // Dhop takes the 4d grid from U, and makes a 5d index for fermion | ||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { |     parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) { | ||||||
| @@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr | |||||||
| 	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); | 	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |   DhopComputeTime += usecond(); | ||||||
|  |   DhopTotalTime   += usecond(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) | void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls+=1; | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |   conformable(in._grid,out._grid); // drops the cb check | ||||||
|  |  | ||||||
| @@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls+=1; | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |   conformable(in._grid,out._grid); // drops the cb check | ||||||
|  |  | ||||||
| @@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls+=2; | ||||||
|   conformable(in._grid,FermionGrid()); // verifies full grid |   conformable(in._grid,FermionGrid()); // verifies full grid | ||||||
|   conformable(in._grid,out._grid); |   conformable(in._grid,out._grid); | ||||||
|  |  | ||||||
| @@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField | |||||||
|   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); |   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | template<class Impl> | ||||||
|  | void ImprovedStaggeredFermion5D<Impl>::Report(void)  | ||||||
|  | { | ||||||
|  |   std::vector<int> latt = GridDefaultLatt();           | ||||||
|  |   RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu]; | ||||||
|  |   RealD NP = _FourDimGrid->_Nprocessors; | ||||||
|  |   RealD NN = _FourDimGrid->NodeCount(); | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : "  | ||||||
|  | 	    << DhopCalls   << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : "  | ||||||
|  | 	    << DhopTotalTime   / DhopCalls << " us" << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : "  | ||||||
|  | 	    << DhopCommTime    / DhopCalls << " us" << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : "  | ||||||
|  | 	    << DhopComputeTime / DhopCalls << " us" << std::endl; | ||||||
|  |  | ||||||
|  |   // Average the compute time | ||||||
|  |   _FourDimGrid->GlobalSum(DhopComputeTime); | ||||||
|  |   DhopComputeTime/=NP; | ||||||
|  |  | ||||||
|  |   RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl; | ||||||
|  |    | ||||||
|  |   RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl; | ||||||
|  |   std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl; | ||||||
|  |  | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report(); | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report(); | ||||||
|  |   std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report(); | ||||||
|  | } | ||||||
|  | template<class Impl> | ||||||
|  | void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)  | ||||||
|  | { | ||||||
|  |   DhopCalls       = 0; | ||||||
|  |   DhopTotalTime    = 0; | ||||||
|  |   DhopCommTime    = 0; | ||||||
|  |   DhopComputeTime = 0; | ||||||
|  |   Stencil.ZeroCounters(); | ||||||
|  |   StencilEven.ZeroCounters(); | ||||||
|  |   StencilOdd.ZeroCounters(); | ||||||
|  | } | ||||||
|  |  | ||||||
| ///////////////////////////////////////////////////////////////////////// | ///////////////////////////////////////////////////////////////////////// | ||||||
| // Implement the general interface. Here we use SAME mass on all slices | // Implement the general interface. Here we use SAME mass on all slices | ||||||
|   | |||||||
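The rate printed by the new Report() above follows directly from the 1154 flop-per-site figure in the code: multiply by the 4d volume times Ls and the call count, divide by accumulated microseconds (a flop per microsecond is a Mflop/s), and halve because red-black calls each touch half the sites. As a one-line restatement:

    // Restates the formula used in Report(); hypothetical free function.
    double staggered_mflops(double volume, double calls, double time_us) {
      return 1154.0 * volume * calls / time_us / 2.0;  // flop/us == Mflop/s
    }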
| @@ -55,6 +55,16 @@ namespace QCD { | |||||||
|       FermionField _tmp; |       FermionField _tmp; | ||||||
|       FermionField &tmp(void) { return _tmp; } |       FermionField &tmp(void) { return _tmp; } | ||||||
|  |  | ||||||
|  |       //////////////////////////////////////// | ||||||
|  |       // Performance monitoring | ||||||
|  |       //////////////////////////////////////// | ||||||
|  |       void Report(void); | ||||||
|  |       void ZeroCounters(void); | ||||||
|  |       double DhopTotalTime; | ||||||
|  |       double DhopCalls; | ||||||
|  |       double DhopCommTime; | ||||||
|  |       double DhopComputeTime; | ||||||
|  |  | ||||||
|       /////////////////////////////////////////////////////////////// |       /////////////////////////////////////////////////////////////// | ||||||
|       // Implement the abstract base |       // Implement the abstract base | ||||||
|       /////////////////////////////////////////////////////////////// |       /////////////////////////////////////////////////////////////// | ||||||
|   | |||||||
| @@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom | |||||||
| template<class vobj,class cobj> | template<class vobj,class cobj> | ||||||
| class WilsonStencil : public CartesianStencil<vobj,cobj> { | class WilsonStencil : public CartesianStencil<vobj,cobj> { | ||||||
| public: | public: | ||||||
|  |   double timer0; | ||||||
|  |   double timer1; | ||||||
|  |   double timer2; | ||||||
|  |   double timer3; | ||||||
|  |   double timer4; | ||||||
|  |   double timer5; | ||||||
|  |   double timer6; | ||||||
|  |   uint64_t callsi; | ||||||
|  |   void ZeroCountersi(void) | ||||||
|  |   { | ||||||
|  |     timer0=0; | ||||||
|  |     timer1=0; | ||||||
|  |     timer2=0; | ||||||
|  |     timer3=0; | ||||||
|  |     timer4=0; | ||||||
|  |     timer5=0; | ||||||
|  |     timer6=0; | ||||||
|  |     callsi=0; | ||||||
|  |   } | ||||||
|  |   void Reporti(int calls) | ||||||
|  |   { | ||||||
|  |     if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl; | ||||||
|  |     if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl; | ||||||
|  |     if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl; | ||||||
|  |     if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl; | ||||||
|  |     if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl; | ||||||
|  |   } | ||||||
|   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; |   typedef CartesianCommunicator::CommsRequest_t CommsRequest_t; | ||||||
|  |  | ||||||
|   std::vector<int> same_node; |   std::vector<int> same_node; | ||||||
| @@ -252,6 +278,7 @@ public: | |||||||
|     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , |     : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) , | ||||||
|     same_node(npoints) |     same_node(npoints) | ||||||
|   {  |   {  | ||||||
|  |     ZeroCountersi(); | ||||||
|     surface_list.resize(0); |     surface_list.resize(0); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
| @@ -261,7 +288,6 @@ public: | |||||||
|     // Here we know the distance is 1 for WilsonStencil |     // Here we know the distance is 1 for WilsonStencil | ||||||
|     for(int point=0;point<this->_npoints;point++){ |     for(int point=0;point<this->_npoints;point++){ | ||||||
|       same_node[point] = this->SameNode(point); |       same_node[point] = this->SameNode(point); | ||||||
|       //      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl; |  | ||||||
|     } |     } | ||||||
|      |      | ||||||
|     for(int site = 0 ;site< vol4;site++){ |     for(int site = 0 ;site< vol4;site++){ | ||||||
| @@ -282,17 +308,28 @@ public: | |||||||
|   { |   { | ||||||
|     std::vector<std::vector<CommsRequest_t> > reqs; |     std::vector<std::vector<CommsRequest_t> > reqs; | ||||||
|     this->HaloExchangeOptGather(source,compress); |     this->HaloExchangeOptGather(source,compress); | ||||||
|     this->CommunicateBegin(reqs); |     double t1=usecond(); | ||||||
|     this->CommunicateComplete(reqs); |     // Asynchronous MPI calls multidirectional, Isend etc... | ||||||
|  |     //    this->CommunicateBegin(reqs); | ||||||
|  |     //    this->CommunicateComplete(reqs); | ||||||
|  |     // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways. | ||||||
|  |     this->Communicate(); | ||||||
|  |     double t2=usecond(); timer1 += t2-t1; | ||||||
|     this->CommsMerge(compress); |     this->CommsMerge(compress); | ||||||
|  |     double t3=usecond(); timer2 += t3-t2; | ||||||
|     this->CommsMergeSHM(compress); |     this->CommsMergeSHM(compress); | ||||||
|  |     double t4=usecond(); timer3 += t4-t3; | ||||||
|   } |   } | ||||||
|    |    | ||||||
|   template <class compressor> |   template <class compressor> | ||||||
|   void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)  |   void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)  | ||||||
|   { |   { | ||||||
|     this->Prepare(); |     this->Prepare(); | ||||||
|  |     double t0=usecond(); | ||||||
|     this->HaloGatherOpt(source,compress); |     this->HaloGatherOpt(source,compress); | ||||||
|  |     double t1=usecond(); | ||||||
|  |     timer0 += t1-t0; | ||||||
|  |     callsi++; | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   template <class compressor> |   template <class compressor> | ||||||
| @@ -304,7 +341,9 @@ public: | |||||||
|     typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor; |     typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor; | ||||||
|     typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; |     typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; | ||||||
|  |  | ||||||
|  |     this->mpi3synctime_g-=usecond(); | ||||||
|     this->_grid->StencilBarrier(); |     this->_grid->StencilBarrier(); | ||||||
|  |     this->mpi3synctime_g+=usecond(); | ||||||
|  |  | ||||||
|     assert(source._grid==this->_grid); |     assert(source._grid==this->_grid); | ||||||
|     this->halogtime-=usecond(); |     this->halogtime-=usecond(); | ||||||
| @@ -323,7 +362,6 @@ public: | |||||||
|     int dag = compress.dag; |     int dag = compress.dag; | ||||||
|     int face_idx=0; |     int face_idx=0; | ||||||
|     if ( dag ) {  |     if ( dag ) {  | ||||||
|       //	std::cout << " Optimised Dagger compress " <<std::endl; |  | ||||||
|       assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); |       assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx)); | ||||||
|       assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); |       assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx)); | ||||||
|       assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); |       assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); | ||||||
|   | |||||||
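The timer0..timer6 counters added to WilsonStencil above all use one accumulation idiom: subtract a microsecond wall-clock stamp entering a phase, add one leaving it, and let Reporti() divide by the call count. Grid supplies usecond(); a sketch of what such a stamp looks like, assuming a gettimeofday backing:

    #include <sys/time.h>

    inline double usecond(void) {
      struct timeval tv;
      gettimeofday(&tv, nullptr);
      return 1.0e6 * tv.tv_sec + tv.tv_usec;
    }

    // Idiom from the stencil:  timer1 -= usecond();  /* phase */  timer1 += usecond();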
| @@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu, | |||||||
|   int vol4; |   int vol4; | ||||||
|   vol4=FourDimGrid.oSites(); |   vol4=FourDimGrid.oSites(); | ||||||
|   Stencil.BuildSurfaceList(LLs,vol4); |   Stencil.BuildSurfaceList(LLs,vol4); | ||||||
|  |  | ||||||
|   vol4=FourDimRedBlackGrid.oSites(); |   vol4=FourDimRedBlackGrid.oSites(); | ||||||
|   StencilEven.BuildSurfaceList(LLs,vol4); |   StencilEven.BuildSurfaceList(LLs,vol4); | ||||||
|    StencilOdd.BuildSurfaceList(LLs,vol4); |    StencilOdd.BuildSurfaceList(LLs,vol4); | ||||||
|  |  | ||||||
|   std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() |    //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size() | ||||||
|                        <<" " << StencilEven.surface_list.size()<<std::endl; |    //                       <<" " << StencilEven.surface_list.size()<<std::endl; | ||||||
|  |  | ||||||
| } | } | ||||||
|       |       | ||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::Report(void) | void WilsonFermion5D<Impl>::Report(void) | ||||||
| { | { | ||||||
|     std::vector<int> latt = GridDefaultLatt();           |   RealD NP     = _FourDimGrid->_Nprocessors; | ||||||
-    RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-    RealD NP = _FourDimGrid->_Nprocessors;
-    RealD NN = _FourDimGrid->NodeCount();
+  RealD NN     = _FourDimGrid->NodeCount();
+  RealD volume = Ls;  
+  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
+  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
 
   if ( DhopCalls > 0 ) {
     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
@@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
     std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
     std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
   }
+  if ( DhopCalls > 0){
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
+  }
 }
 
 template<class Impl>
@@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
   Stencil.ZeroCounters();
   StencilEven.ZeroCounters();
   StencilOdd.ZeroCounters();
+  Stencil.ZeroCountersi();
+  StencilEven.ZeroCountersi();
+  StencilOdd.ZeroCountersi();
 }
 
 
@@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 {
 #ifdef GRID_OMP
   //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
 
   Compressor compressor(dag);
 
@@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 
   DhopFaceTime-=usecond();
   st.HaloExchangeOptGather(in,compressor);
-  DhopFaceTime+=usecond();
-  std::vector<std::vector<CommsRequest_t> > reqs;
-
-  // Rely on async comms; start comms before merge of local data
-  DhopCommTime-=usecond();
-  st.CommunicateBegin(reqs);
-
-  DhopFaceTime-=usecond();
-  st.CommsMergeSHM(compressor);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
   DhopFaceTime+=usecond();
 
-  // Perhaps use omp task and region
-#pragma omp parallel 
+  double ctime=0;
+  double ptime=0;
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
   { 
+    int tid = omp_get_thread_num();
     int nthreads = omp_get_num_threads();
-    int me = omp_get_thread_num();
-    int myoff, mywork;
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    assert(nthreads > ncomms);
+    if (tid >= ncomms) {
+      double start = usecond();
+      nthreads -= ncomms;
+      int ttid = tid - ncomms;
+      int n = U._grid->oSites();
+      int chunk = n / nthreads;
+      int rem = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+        myblock = ttid * chunk + ttid;
+        myn = chunk+1;
+      } else {
+        myblock = ttid*chunk + rem;
+        myn = chunk;
+      }
 
-    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
-    int sF = LLs * myoff;
-
-    if ( me == 0 ) {
-      st.CommunicateComplete(reqs);
-      DhopCommTime+=usecond();
-    } else { 
-      // Interior links in stencil
-      if ( me==1 ) DhopComputeTime-=usecond();
-      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      else      	    Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      if ( me==1 ) DhopComputeTime+=usecond();
+      // do the compute
+      if (dag == DaggerYes) {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+          int sF = LLs * sU;
+          Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+        }
+      } else {
+        for (int ss = myblock; ss < myblock+myn; ++ss) {
+          int sU = ss;
+          int sF = LLs * sU;
+          Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+        }
+      }
+      ptime = usecond() - start;
+    }
+    {
+      double start = usecond();
+      st.CommunicateThreaded();
+      ctime = usecond() - start;
     }
   }
+  DhopCommTime += ctime;
+  DhopComputeTime+=ptime;
+
+  // First to enter, last to leave timing
+  st.CollateThreads();
 
   DhopFaceTime-=usecond();
   st.CommsMerge(compressor);
   DhopFaceTime+=usecond();
 
-  // Load imbalance alert. Should use dynamic schedule OMP for loop
-  // Perhaps create a list of only those sites with face work, and 
-  // load balance process the list.
   DhopComputeTime2-=usecond();
   if (dag == DaggerYes) {
     int sz=st.surface_list.size();
@@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 #else 
   assert(0);
 #endif
 
 }
 
-
-
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
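For reference, the compute branch above block-distributes the U._grid->oSites() sites over the nthreads - ncomms worker threads, giving each of the first n % nthreads workers one extra site. A standalone sketch of the same arithmetic (the helper name blockSplit is illustrative, not part of the patch):

    #include <cassert>

    // Distribute n items over nthreads workers; worker ttid gets the half-open
    // range [myblock, myblock+myn). The first (n % nthreads) workers take one
    // extra item, so the ranges tile [0, n) exactly with no overlap.
    void blockSplit(int n, int nthreads, int ttid, int &myblock, int &myn) {
      assert(nthreads > 0 && ttid >= 0 && ttid < nthreads);
      int chunk = n / nthreads;
      int rem   = n % nthreads;
      if (ttid < rem) { myblock = ttid * (chunk + 1); myn = chunk + 1; }
      else            { myblock = ttid * chunk + rem; myn = chunk;     }
    }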
							
								
								
									
lib/qcd/action/gauge/Photon.h (new file, 286 lines)
@@ -0,0 +1,286 @@
+/*************************************************************************************
+ 
+ Grid physics library, www.github.com/paboyle/Grid
+ 
+ Source file: ./lib/qcd/action/gauge/Photon.h
+ 
+ Copyright (C) 2015
+ 
+ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+ 
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ 
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+ 
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ 
+ See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+/*  END LEGAL */
+#ifndef QCD_PHOTON_ACTION_H
+#define QCD_PHOTON_ACTION_H
+
+namespace Grid{
+namespace QCD{
+  template <class S>
+  class QedGimpl
+  {
+  public:
+    typedef S Simd;
+    
+    template <typename vtype>
+    using iImplGaugeLink  = iScalar<iScalar<iScalar<vtype>>>;
+    template <typename vtype>
+    using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
+    
+    typedef iImplGaugeLink<Simd>  SiteLink;
+    typedef iImplGaugeField<Simd> SiteField;
+    typedef SiteField             SiteComplex;
+    
+    typedef Lattice<SiteLink>  LinkField;
+    typedef Lattice<SiteField> Field;
+    typedef Field              ComplexField;
+  };
+  
+  typedef QedGimpl<vComplex> QedGimplR;
+  
+  template<class Gimpl>
+  class Photon
+  {
+  public:
+    INHERIT_GIMPL_TYPES(Gimpl);
+    GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
+    GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2);
+  public:
+    Photon(Gauge gauge, ZmScheme zmScheme);
+    virtual ~Photon(void) = default;
+    void FreePropagator(const GaugeField &in, GaugeField &out);
+    void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
+    void StochasticWeight(GaugeLinkField &weight);
+    void StochasticField(GaugeField &out, GridParallelRNG &rng);
+    void StochasticField(GaugeField &out, GridParallelRNG &rng,
+                         const GaugeLinkField &weight);
+  private:
+    void invKHatSquared(GaugeLinkField &out);
+    void zmSub(GaugeLinkField &out);
+  private:
+    Gauge    gauge_;
+    ZmScheme zmScheme_;
+  };
+
+  typedef Photon<QedGimplR>  PhotonR;
+  
+  template<class Gimpl>
+  Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
+  : gauge_(gauge), zmScheme_(zmScheme)
+  {}
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
+  {
+    FFT theFFT(in._grid);
+    
+    GaugeField in_k(in._grid);
+    GaugeField prop_k(in._grid);
+    
+    theFFT.FFT_all_dim(in_k,in,FFT::forward);
+    MomentumSpacePropagator(prop_k,in_k);
+    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
+  {
+    GridBase           *grid = out._grid;
+    GaugeLinkField     kmu(grid), one(grid);
+    const unsigned int nd    = grid->_ndimension;
+    std::vector<int>   &l    = grid->_fdimensions;
+    std::vector<int>   zm(nd,0);
+    TComplex           Tone = Complex(1.0,0.0);
+    TComplex           Tzero= Complex(0.0,0.0);
+    
+    one = Complex(1.0,0.0);
+    out = zero;
+    for(int mu = 0; mu < nd; mu++)
+    {
+      Real twoPiL = M_PI*2./l[mu];
+      
+      LatticeCoordinate(kmu,mu);
+      kmu = 2.*sin(.5*twoPiL*kmu);
+      out = out + kmu*kmu;
+    }
+    pokeSite(Tone, out, zm);
+    out = one/out;
+    pokeSite(Tzero, out, zm);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::zmSub(GaugeLinkField &out)
+  {
+    GridBase           *grid = out._grid;
+    const unsigned int nd    = grid->_ndimension;
+    
+    switch (zmScheme_)
+    {
+      case ZmScheme::qedTL:
+      {
+        std::vector<int> zm(nd,0);
+        TComplex         Tzero = Complex(0.0,0.0);
+        
+        pokeSite(Tzero, out, zm);
+        
+        break;
+      }
+      case ZmScheme::qedL:
+      {
+        LatticeInteger spNrm(grid), coor(grid);
+        GaugeLinkField z(grid);
+        
+        spNrm = zero;
+        for(int d = 0; d < grid->_ndimension - 1; d++)
+        {
+          LatticeCoordinate(coor,d);
+          spNrm = spNrm + coor*coor;
+        }
+        out = where(spNrm == Integer(0), 0.*out, out);
+        
+        break;
+      }
+      default:
+        break;
+    }
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
+                                               GaugeField &out)
+  {
+    GridBase           *grid = out._grid;
+    LatticeComplex     k2Inv(grid);
+    
+    invKHatSquared(k2Inv);
+    zmSub(k2Inv);
+    
+    out = in*k2Inv;
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
+  {
+    auto               *grid     = dynamic_cast<GridCartesian *>(weight._grid);
+    const unsigned int nd        = grid->_ndimension;
+    std::vector<int>   latt_size = grid->_fdimensions;
+    
+    Integer vol = 1;
+    for(int d = 0; d < nd; d++)
+    {
+      vol = vol * latt_size[d];
+    }
+    invKHatSquared(weight);
+    weight = sqrt(vol*real(weight));
+    zmSub(weight);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
+  {
+    auto           *grid = dynamic_cast<GridCartesian *>(out._grid);
+    GaugeLinkField weight(grid);
+    
+    StochasticWeight(weight);
+    StochasticField(out, rng, weight);
+  }
+  
+  template<class Gimpl>
+  void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
+                                      const GaugeLinkField &weight)
+  {
+    auto               *grid = dynamic_cast<GridCartesian *>(out._grid);
+    const unsigned int nd = grid->_ndimension;
+    GaugeLinkField     r(grid);
+    GaugeField         aTilde(grid);
+    FFT                fft(grid);
+    
+    for(int mu = 0; mu < nd; mu++)
+    {
+      gaussian(rng, r);
+      r = weight*r;
+      pokeLorentz(aTilde, r, mu);
+    }
+    fft.FFT_all_dim(out, aTilde, FFT::backward);
+    
+    out = real(out);
+  }
+//  template<class Gimpl>
+//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
+//                                                            const GaugeField &in)
+//  {
+//    
+//    FeynmanGaugeMomentumSpacePropagator_TL(out,in);
+//    
+//    GridBase *grid = out._grid;
+//    LatticeInteger     coor(grid);
+//    GaugeField zz(grid); zz=zero;
+//    
+//    // xyzt
+//    for(int d = 0; d < grid->_ndimension-1;d++){
+//      LatticeCoordinate(coor,d);
+//      out = where(coor==Integer(0),zz,out);
+//    }
+//  }
+//  
+//  template<class Gimpl>
+//  void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
+//                                                             const GaugeField &in)
+//  {
+//    
+//    // what type LatticeComplex
+//    GridBase *grid = out._grid;
+//    int nd = grid->_ndimension;
+//    
+//    typedef typename GaugeField::vector_type vector_type;
+//    typedef typename GaugeField::scalar_type ScalComplex;
+//    typedef Lattice<iSinglet<vector_type> > LatComplex;
+//    
+//    std::vector<int> latt_size   = grid->_fdimensions;
+//    
+//    LatComplex denom(grid); denom= zero;
+//    LatComplex   one(grid); one = ScalComplex(1.0,0.0);
+//    LatComplex   kmu(grid);
+//    
+//    ScalComplex ci(0.0,1.0);
+//    // momphase = n * 2pi / L
+//    for(int mu=0;mu<Nd;mu++) {
+//      
+//      LatticeCoordinate(kmu,mu);
+//      
+//      RealD TwoPiL =  M_PI * 2.0/ latt_size[mu];
+//      
+//      kmu = TwoPiL * kmu ;
+//      
+//      denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
+//    }
+//    std::vector<int> zero_mode(nd,0);
+//    TComplexD Tone = ComplexD(1.0,0.0);
+//    TComplexD Tzero= ComplexD(0.0,0.0);
+//    
+//    pokeSite(Tone,denom,zero_mode);
+//    
+//    denom= one/denom;
+//    
+//    pokeSite(Tzero,denom,zero_mode);
+//    
+//    out = zero;
+//    out = in*denom;
+//  };
+  
+}}
+#endif
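The propagator machinery above is the standard lattice one; restating the loop in invKHatSquared as a formula (nothing here beyond the code itself):

    \hat{k}^2 \;=\; \sum_{\mu=0}^{N_d-1} 4\sin^2\!\Big(\frac{\pi n_\mu}{L_\mu}\Big),
    \qquad D(k) \;=\; \frac{1}{\hat{k}^2}, \quad D(0) = 0,

after which zmSub applies the zero-mode scheme: qedTL removes only the global zero mode, while qedL removes every mode whose spatial momentum vanishes.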
@@ -31,6 +31,7 @@ directory
 
 #include <Grid/qcd/action/scalar/ScalarImpl.h>
 #include <Grid/qcd/action/scalar/ScalarAction.h>
+#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>
 
 namespace Grid {
 namespace QCD {
@@ -39,6 +40,10 @@ namespace QCD {
   typedef ScalarAction<ScalarImplF>                 ScalarActionF;
   typedef ScalarAction<ScalarImplD>                 ScalarActionD;
 
+  template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
+  template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;
+  
 }
 }
 
@@ -6,10 +6,10 @@
 
   Copyright (C) 2015
 
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+  Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
+  Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+  Author: neo <cossu@post.kek.jp>
+  Author: paboyle <paboyle@ph.ed.ac.uk>
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -36,49 +36,48 @@ directory
 namespace Grid {
   // FIXME drop the QCD namespace everywhere here
 
-  template <class Impl>
-  class ScalarAction : public QCD::Action<typename Impl::Field> {
-  public:
+template <class Impl>
+class ScalarAction : public QCD::Action<typename Impl::Field> {
+ public:
     INHERIT_FIELD_TYPES(Impl);
 
-  private:
+ private:
     RealD mass_square;
     RealD lambda;
 
-  public:
-    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
+ public:
+    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
 
-    virtual std::string LogParameters(){
+    virtual std::string LogParameters() {
       std::stringstream sstream;
       sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
      sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
      return sstream.str();
-      
     }
+    virtual std::string action_name() {return "ScalarAction";}
 
-    virtual std::string action_name(){return "ScalarAction";}
-    
-    virtual void refresh(const Field &U,
-			 GridParallelRNG &pRNG){};  // noop as no pseudoferms
+    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}  // noop as no pseudoferms
 
     virtual RealD S(const Field &p) {
       return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
-	(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
-	ScalarObs<Impl>::sumphider(p);
+    (lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
+    ScalarObs<Impl>::sumphider(p);
    };
 
    virtual void deriv(const Field &p,
-		       Field &force) {
+                       Field &force) {
      Field tmp(p._grid);
      Field p2(p._grid);
      ScalarObs<Impl>::phisquared(p2, p);
      tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
      for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
 
-      force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
-    };
-  };
+      force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
+    }
+};
 
-} // Grid
-
+}  // namespace Grid
 
 #endif // SCALAR_ACTION_H
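Written out, S and deriv above implement the usual lattice phi^4 action (assuming ScalarObs<Impl>::sumphider supplies the hopping term -sum_{x,mu} phi_x phi_{x+mu}, which is what the force expression differentiates):

    S[\phi] = \sum_x \Big[ \big(\tfrac{m^2}{2} + N_d\big)\phi_x^2
            + \frac{\lambda}{24}\,\phi_x^4
            - \sum_\mu \phi_x\,\phi_{x+\hat\mu} \Big],
    \qquad
    \frac{\partial S}{\partial \phi_x} = (m^2 + 2N_d)\,\phi_x
            + \frac{\lambda}{6}\,\phi_x^3
            - \sum_\mu \big(\phi_{x+\hat\mu} + \phi_{x-\hat\mu}\big).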
@@ -5,20 +5,22 @@
 namespace Grid {
   //namespace QCD {
 
-  template <class S>
-  class ScalarImplTypes {
-  public:
+template <class S>
+class ScalarImplTypes {
+ public:
     typedef S Simd;
 
     template <typename vtype>
     using iImplField = iScalar<iScalar<iScalar<vtype> > >;
 
     typedef iImplField<Simd> SiteField;
-    
-    template <typename vtype> using iImplScalar= iScalar<iScalar<iScalar<vtype   > > >;
-    typedef iImplScalar<Simd> ComplexField;
+    typedef SiteField        SitePropagator;
+    typedef SiteField        SiteComplex;
     
     typedef Lattice<SiteField> Field;
+    typedef Field              ComplexField;
+    typedef Field              FermionField;
+    typedef Field              PropagatorField;
     
     static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
       gaussian(pRNG, P);
@@ -26,11 +28,11 @@ namespace Grid {
 
     static inline Field projectForce(Field& P){return P;}
 
-    static inline void update_field(Field& P, Field& U, double ep){
+    static inline void update_field(Field& P, Field& U, double ep) {
       U += P*ep;
     }
 
-    static inline RealD FieldSquareNorm(Field& U){
+    static inline RealD FieldSquareNorm(Field& U) {
       return (- sum(trace(U*U))/2.0);
     }
 
@@ -46,46 +48,91 @@ namespace Grid {
       U = 1.0;
     }
     
+    static void MomentumSpacePropagator(Field &out, RealD m)
+    {
+      GridBase           *grid = out._grid;
+      Field              kmu(grid), one(grid);
+      const unsigned int nd    = grid->_ndimension;
+      std::vector<int>   &l    = grid->_fdimensions;
+      
+      one = Complex(1.0,0.0);
+      out = m*m;
+      for(int mu = 0; mu < nd; mu++)
+      {
+        Real twoPiL = M_PI*2./l[mu];
+        
+        LatticeCoordinate(kmu,mu);
+        kmu = 2.*sin(.5*twoPiL*kmu);
+        out = out + kmu*kmu;
+      }
+      out = one/out;
+    }
+    
+    static void FreePropagator(const Field &in, Field &out,
+                               const Field &momKernel)
+    {
+      FFT   fft((GridCartesian *)in._grid);
+      Field inFT(in._grid);
+      
+      fft.FFT_all_dim(inFT, in, FFT::forward);
+      inFT = inFT*momKernel;
+      fft.FFT_all_dim(out, inFT, FFT::backward);
+    }
+    
+    static void FreePropagator(const Field &in, Field &out, RealD m)
+    {
+      Field momKernel(in._grid);
+      
+      MomentumSpacePropagator(momKernel, m);
+      FreePropagator(in, out, momKernel);
+    }
+    
   };
 
   template <class S, unsigned int N>
-  class ScalarMatrixImplTypes {
+  class ScalarAdjMatrixImplTypes {
   public:
     typedef S Simd;
+    typedef QCD::SU<N> Group;
     
-    template <typename vtype> using iImplField = iScalar<iScalar<iMatrix<vtype, N> > >;
-
-    typedef iImplField<Simd> SiteField;
-    typedef Lattice<SiteField> Field;
-    
-    template <typename vtype> using iImplScalar= iScalar<iScalar<iScalar<vtype   > > >;
-    typedef iImplScalar<Simd> ComplexField;
-
-    
-    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
-      gaussian(pRNG, P);
+    template <typename vtype>
+    using iImplField   = iScalar<iScalar<iMatrix<vtype, N>>>;
+    template <typename vtype>
+    using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;
+
+    typedef iImplField<Simd>   SiteField;
+    typedef SiteField          SitePropagator;
+    typedef iImplComplex<Simd> SiteComplex;
+    
+    typedef Lattice<SiteField>   Field;
+    typedef Lattice<SiteComplex> ComplexField;
+    typedef Field                FermionField;
+    typedef Field                PropagatorField;
+
+    static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
     }
 
-    static inline Field projectForce(Field& P){return P;}
+    static inline Field projectForce(Field& P) {return P;}
 
-    static inline void update_field(Field& P, Field& U, double ep){
+    static inline void update_field(Field& P, Field& U, double ep) {
       U += P*ep;
     }
 
-    static inline RealD FieldSquareNorm(Field& U){
-      return (TensorRemove(- sum(trace(U*U))*0.5).real());
+    static inline RealD FieldSquareNorm(Field& U) {
+      return (TensorRemove(sum(trace(U*U))).real());
     }
 
     static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
     }
 
     static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
-      gaussian(pRNG, U);
+      Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
     }
 
     static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
-      U = 1.0;
+      U = zero;
     }
 
   };
@@ -96,6 +143,18 @@ namespace Grid {
   typedef ScalarImplTypes<vReal> ScalarImplR;
   typedef ScalarImplTypes<vRealF> ScalarImplF;
   typedef ScalarImplTypes<vRealD> ScalarImplD;
+  typedef ScalarImplTypes<vComplex> ScalarImplCR;
+  typedef ScalarImplTypes<vComplexF> ScalarImplCF;
+  typedef ScalarImplTypes<vComplexD> ScalarImplCD;
+    
+  // Hardcoding here the size of the matrices
+  typedef ScalarAdjMatrixImplTypes<vComplex,  QCD::Nc> ScalarAdjImplR;
+  typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
+  typedef ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc> ScalarAdjImplD;
+
+  template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex,   Colours >;
+  template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF,  Colours >;
+  template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD,  Colours >;
   
   //}
 }
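The new MomentumSpacePropagator/FreePropagator pair computes the free massive scalar propagator by Fourier transform; in formulas (a restatement of the code, with n_mu the integer lattice coordinate and L_mu the extent in direction mu):

    \tilde{G}(k) = \frac{1}{m^2 + \sum_\mu 4\sin^2(\pi n_\mu / L_\mu)},
    \qquad
    \text{out} = \mathcal{F}^{-1}\big[\,\tilde{G}\cdot\mathcal{F}[\text{in}]\,\big].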
@@ -6,10 +6,7 @@
 
   Copyright (C) 2015
 
-Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: neo <cossu@post.kek.jp>
-Author: paboyle <paboyle@ph.ed.ac.uk>
+  Author: Guido Cossu <guido,cossu@ed.ac.uk>
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -30,55 +27,122 @@ directory
   *************************************************************************************/
 /*  END LEGAL */
 
-#ifndef SCALAR_ACTION_H
-#define SCALAR_ACTION_H
+#ifndef SCALAR_INT_ACTION_H
+#define SCALAR_INT_ACTION_H
+
+
+// Note: this action can completely absorb the ScalarAction for real float fields
+// use the scalarObjs to generalise the structure
 
 namespace Grid {
   // FIXME drop the QCD namespace everywhere here
 
-  template <class Impl>
+  template <class Impl, int Ndim >
   class ScalarInteractionAction : public QCD::Action<typename Impl::Field> {
   public:
     INHERIT_FIELD_TYPES(Impl);
-    
   private:
     RealD mass_square;
     RealD lambda;
 
-  public:
-    ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l){};
-
-    virtual std::string LogParameters(){
+    typedef typename Field::vector_object vobj;
+    typedef CartesianStencil<vobj,vobj> Stencil;
+
+    SimpleCompressor<vobj> compressor;
+    int npoint = 2*Ndim;
+    std::vector<int> directions;//    = {0,1,2,3,0,1,2,3};  // forcing 4 dimensions
+    std::vector<int> displacements;//  = {1,1,1,1, -1,-1,-1,-1};
+
+
+  public:
+
+    ScalarInteractionAction(RealD ms, RealD l) : mass_square(ms), lambda(l), displacements(2*Ndim,0), directions(2*Ndim,0){
+      for (int mu = 0 ; mu < Ndim; mu++){
+        directions[mu]         = mu; directions[mu+Ndim]    = mu;
+        displacements[mu]      =  1; displacements[mu+Ndim] = -1;
+      }
+    }
+
+    virtual std::string LogParameters() {
       std::stringstream sstream;
       sstream << GridLogMessage << "[ScalarAction] lambda      : " << lambda      << std::endl;
       sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
       return sstream.str();
-      
     }
 
-    virtual std::string action_name(){return "ScalarAction";}
+    virtual std::string action_name() {return "ScalarAction";}
 
-    virtual void refresh(const Field &U,
-			 GridParallelRNG &pRNG){};  // noop as no pseudoferms
+    virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
 
     virtual RealD S(const Field &p) {
-      return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
-	(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
-	ScalarObs<Impl>::sumphider(p);
+      assert(p._grid->Nd() == Ndim);
+      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+      phiStencil.HaloExchange(p, compressor);
+      Field action(p._grid), pshift(p._grid), phisquared(p._grid);
+      phisquared = p*p;
+      action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
+      for (int mu = 0; mu < Ndim; mu++) {
+        //  pshift = Cshift(p, mu, +1);  // not efficient, implement with stencils
+        parallel_for (int i = 0; i < p._grid->oSites(); i++) {
+          int permute_type;
+          StencilEntry *SE;
+          vobj temp2;
+          const vobj *temp, *t_p;
+            
+          SE = phiStencil.GetEntry(permute_type, mu, i);
+          t_p  = &p._odata[i];
+          if ( SE->_is_local ) {
+            temp = &p._odata[SE->_offset];
+            if ( SE->_permute ) {
+              permute(temp2, *temp, permute_type);
+              action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
+            } else {
+              action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
+            }
+          } else {
+            action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
+          }
+        }
+        //  action -= pshift*p + p*pshift;
+      }
+      // NB the trace in the algebra is normalised to 1/2
+      // minus sign coming from the antihermitian fields
+      return -(TensorRemove(sum(trace(action)))).real();
     };
 
-    virtual void deriv(const Field &p,
-		       Field &force) {
-      Field tmp(p._grid);
-      Field p2(p._grid);
-      ScalarObs<Impl>::phisquared(p2, p);
-      tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
-      for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
-      
-      force=+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
-    };
+    virtual void deriv(const Field &p, Field &force) {
+      assert(p._grid->Nd() == Ndim);
+      force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
+      // move this outside
+      static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
+      phiStencil.HaloExchange(p, compressor);
+      
+      //for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
+      for (int point = 0; point < npoint; point++) {
+        parallel_for (int i = 0; i < p._grid->oSites(); i++) {
+          const vobj *temp;
+          vobj temp2;
+          int permute_type;
+          StencilEntry *SE;
+          SE = phiStencil.GetEntry(permute_type, point, i);
+          
+          if ( SE->_is_local ) {
+            temp = &p._odata[SE->_offset];
+            if ( SE->_permute ) {
+              permute(temp2, *temp, permute_type);
+              force._odata[i] -= temp2;
+            } else {
+              force._odata[i] -= *temp;
+            }
+          } else {
+            force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
+          }
+        }
+      }
+    }
   };
   
-} // Grid
+}  // namespace Grid
 
-#endif // SCALAR_ACTION_H
+#endif  // SCALAR_INT_ACTION_H
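The stencil loops in S above accumulate, per site, the local terms minus the forward hopping products; collecting them, the action evaluated is (as implemented, with the trace normalisation and antihermitian-field sign noted in the code comments):

    S[\phi] = -\,\mathrm{Tr}\sum_x \Big[ (2N_d + m^2)\,\phi_x^2
            - \frac{\lambda}{24}\,\phi_x^4
            - \sum_{\mu=0}^{N_d-1} \big(\phi_{x+\hat\mu}\phi_x + \phi_x\phi_{x+\hat\mu}\big) \Big],

while deriv assembles the force kernel (2N_d + m^2)\phi_x - (\lambda/12)\phi_x^3 - \sum_{\pm\mu}\phi_{x\pm\hat\mu} from the same stencil, one displacement per stencil point.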
@@ -207,6 +207,12 @@ using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator,
 typedef HMCWrapperTemplate<ScalarImplR, MinimumNorm2, ScalarFields>
     ScalarGenericHMCRunner;
 
+typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
+    ScalarAdjGenericHMCRunner;
+
+template <int Colours> 
+using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, MinimumNorm2, ScalarNxNMatrixFields<Colours> >;
+
 }  // namespace QCD
 }  // namespace Grid
 
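A minimal sketch of the new runner aliases in use; the colour count, the parameter values, and the surrounding wiring are illustrative assumptions, not code from this changeset:

    // Hypothetical instantiation for an SU(2)-adjoint scalar theory in 4d.
    constexpr int Ncolours = 2;
    constexpr int Ndims    = 4;
    typedef Grid::QCD::ScalarNxNAdjGenericHMCRunner<Ncolours> HMCWrapper;
    typedef Grid::QCD::ScalarAdjActionR<Ncolours, Ndims>      ScalarAction;

    HMCWrapper   TheHMC;
    ScalarAction Saction(0.11 /* mass_square */, 0.31 /* lambda */);
    // ... add Saction to an action set, configure TheHMC, then run.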
@@ -76,7 +76,7 @@ struct HMCparameters: Serializable {
 
   template < class ReaderClass > 
   void initialize(Reader<ReaderClass> &TheReader){
-  	std::cout << "Reading HMC\n";
+  	std::cout << GridLogMessage << "Reading HMC\n";
   	read(TheReader, "HMC", *this);
   }
 
@@ -165,7 +165,7 @@ class HMCResourceManager {
   // Grids
   //////////////////////////////////////////////////////////////
 
-  void AddGrid(std::string s, GridModule& M) {
+  void AddGrid(const std::string s, GridModule& M) {
     // Check for name clashes
     auto search = Grids.find(s);
     if (search != Grids.end()) {
@@ -174,14 +174,24 @@ class HMCResourceManager {
       exit(1);
     }
     Grids[s] = std::move(M);
+    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
+    std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
+    std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
+    Grids[s].show_full_decomposition();
+    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
   }
 
   // Add a named grid set, 4d shortcut
-  void AddFourDimGrid(std::string s) {
+  void AddFourDimGrid(const std::string s) {
     GridFourDimModule<vComplex> Mod;
     AddGrid(s, Mod);
   }
 
+  // Add a named grid set, 4d shortcut + tweak simd lanes
+  void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
+    GridFourDimModule<vComplex> Mod(simd_decomposition);
+    AddGrid(s, Mod);
+  }
+
 
   GridCartesian* GetCartesian(std::string s = "") {
@@ -253,6 +263,7 @@ class HMCResourceManager {
   template<class T, class... Types>
   void AddObservable(Types&&... Args){
     ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
+    ObservablesList.back()->print_parameters();
   }
 
   std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
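A usage sketch for the new overload; the resource names and the {1,1,2,2} layout are illustrative, and the layout must match the build's vComplex::Nsimd():

    HMCResourceManager<ImplementationPolicy> Resources;   // as held by the HMC runner
    Resources.AddFourDimGrid("gauge");                    // default SIMD decomposition
    Resources.AddFourDimGrid("gauge_simd", {1, 1, 2, 2}); // explicit 4-entry SIMD layout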
@@ -43,11 +43,12 @@ public:
   std::string, lattice,
   std::string, mpi);
 
-  std::vector<int> getLattice(){return strToVec<int>(lattice);}
-  std::vector<int> getMpi()    {return strToVec<int>(mpi);}
-
-  void check(){
-    if (getLattice().size() != getMpi().size()) {
+  std::vector<int> getLattice() const {return strToVec<int>(lattice);}
+  std::vector<int> getMpi()     const {return strToVec<int>(mpi);}
+
+  void check() const {
+    if (getLattice().size() != getMpi().size() ) {
       std::cout << GridLogError
                 << "Error in GridModuleParameters: lattice and mpi dimensions "
                    "do not match"
@@ -84,6 +85,8 @@ class GridModule {
 
   void set_full(GridCartesian* grid) { grid_.reset(grid); }
   void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
+  void show_full_decomposition(){ grid_->show_decomposition(); }
+  void show_rb_decomposition(){ rbgrid_->show_decomposition(); }
 
  protected:
   std::unique_ptr<GridCartesian> grid_;
@@ -95,31 +98,72 @@ class GridModule {
 ////////////////////////////////////
 // Classes for the user
 ////////////////////////////////////
 // Note: the space time grid should be out of the QCD namespace
-template< class vector_type>
-class GridFourDimModule : public GridModule {
- public:
-  GridFourDimModule() {
+template <class vector_type>
+class GridFourDimModule : public GridModule
+{
+public:
+  GridFourDimModule()
+  {
     using namespace QCD;
     set_full(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
+        GridDefaultLatt(), 
+        GridDefaultSimd(4, vector_type::Nsimd()),
         GridDefaultMpi()));
     set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
   }
 
-  GridFourDimModule(GridModuleParameters Params) {
+  GridFourDimModule(const std::vector<int> tweak_simd)
+  {
+    using namespace QCD;
+    if (tweak_simd.size() != 4)
+    {
+      std::cout << GridLogError
+                << "Error in GridFourDimModule: SIMD size different from 4" 
+                << std::endl;
+      exit(1);
+    }
+
+    // Checks that the product agrees with the expectation
+    int simd_sum = 1;
+    for (auto &n : tweak_simd)
+      simd_sum *= n;
+    std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << "  Sum: " << simd_sum << std::endl;
+
+    if (simd_sum == vector_type::Nsimd())
+    {
+      set_full(SpaceTimeGrid::makeFourDimGrid(
+          GridDefaultLatt(), 
+          tweak_simd, 
+          GridDefaultMpi()));
+      set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
+    }
+    else
+    {
+      std::cout << GridLogError 
+                << "Error in GridFourDimModule: SIMD lanes must sum to " 
+                << vector_type::Nsimd() 
+                << std::endl;
+    }
+  }
+
+  GridFourDimModule(const GridModuleParameters Params)
+  {
     using namespace QCD;
-    Params.check();
     std::vector<int> lattice_v = Params.getLattice();
     std::vector<int> mpi_v = Params.getMpi();
-    if (lattice_v.size() == 4) {
+    if (lattice_v.size() == 4)
+    {
       set_full(SpaceTimeGrid::makeFourDimGrid(
-          lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
+          lattice_v, 
+          GridDefaultSimd(4, vector_type::Nsimd()),
           mpi_v));
       set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
-    } else {
+    }
+    else
+    {
       std::cout << GridLogError
-          << "Error in GridFourDimModule: lattice dimension different from 4"
-          << std::endl;
+                << "Error in GridFourDimModule: lattice dimension different from 4"
+                << std::endl;
       exit(1);
     }
   }
 
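Note the invariant checked above: the four tweak_simd entries must multiply to vector_type::Nsimd() (despite the variable name simd_sum and the "must sum to" message, the loop forms a product). For example, assuming Nsimd() == 8, both {1,2,2,2} and {1,1,2,4} are valid layouts, since 1*2*2*2 == 1*1*2*4 == 8.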
@@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
   typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
   using ObsBase::ObsBase; // for constructors
 
-
-
   // acquire resource
   virtual void initialize(){
     this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
@@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
   PlaquetteMod(): ObsBase(NoParameters()){}
 };
 
-
 template < class Impl >
-class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
-  typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
+class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
+  typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
   using ObsBase::ObsBase; // for constructors
 
-
-
   // acquire resource
   virtual void initialize(){
-    this->ObservablePtr.reset(new TopologicalCharge<Impl>());
+    this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
   }
   public:
-  TopologicalChargeMod(): ObsBase(NoParameters()){}
+  TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
+  TopologicalChargeMod(): ObsBase(){}
 };
 
-
-
 }// QCD temporarily here
 
 
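A registration sketch for the parametrised module; the numeric values and the TheHMC wiring are illustrative assumptions:

    QCD::TopologyObsParameters TopParams;
    TopParams.interval               = 5;     // measure every 5 trajectories
    TopParams.do_smearing            = true;
    TopParams.Smearing.steps         = 200;
    TopParams.Smearing.step_size     = 0.01;
    TopParams.Smearing.meas_interval = 50;
    TopParams.Smearing.maxTau        = 2.0;   // Wilson-flow endpoint for smear_adaptive
    TheHMC.Resources.AddObservable<QCD::TopologicalChargeMod<QCD::PeriodicGimplR>>(TopParams);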
@@ -33,9 +33,45 @@ directory
 namespace Grid {
 namespace QCD {
 
+struct TopologySmearingParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
+    int, steps,
+    float, step_size,
+    int, meas_interval,
+    float, maxTau);
+
+    TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
+        steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
+
+    template < class ReaderClass >
+    TopologySmearingParameters(Reader<ReaderClass>& Reader){
+        read(Reader, "Smearing", *this);  
+    }  
+};
+
+
+
+struct TopologyObsParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
+      int, interval,
+      bool, do_smearing,
+      TopologySmearingParameters, Smearing);  
+
+    TopologyObsParameters(int interval = 1, bool smearing = false):
+        interval(interval), Smearing(smearing){}
+
+    template <class ReaderClass >
+      TopologyObsParameters(Reader<ReaderClass>& Reader){
+        read(Reader, "TopologyMeasurement", *this);
+  }
+};
+
+
 // this is only defined for a gauge theory
 template <class Impl>
 class TopologicalCharge : public HmcObservable<typename Impl::Field> {
+    TopologyObsParameters Pars;
+
  public:
     // here forces the Impl to be of gauge fields
     // if not the compiler will complain
@@ -44,20 +80,39 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
     // necessary for HmcObservable compatibility
     typedef typename Impl::Field Field;
 
+    TopologicalCharge(int interval = 1, bool do_smearing = false):
+        Pars(interval, do_smearing){}
+    
+    TopologicalCharge(TopologyObsParameters P):Pars(P){
+        std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
+    }
+
     void TrajectoryComplete(int traj,
                             Field &U,
                             GridSerialRNG &sRNG,
                             GridParallelRNG &pRNG) {
 
-    Real q = WilsonLoops<Impl>::TopologicalCharge(U);
-    
-    int def_prec = std::cout.precision();
+    if (traj%Pars.interval == 0){
+        // Smearing
+        Field Usmear = U;
+        int def_prec = std::cout.precision();
+        
+        if (Pars.do_smearing){
+            // using wilson flow by default here
+            WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
+            WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
+            Real T0   = WF.energyDensityPlaquette(Usmear);
+            std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+                      << "T0                : [ " << traj << " ] "<< T0 << std::endl;
+        }
 
-    std::cout << GridLogMessage
-        << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
-        << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
+        Real q    = WilsonLoops<Impl>::TopologicalCharge(Usmear);
+        std::cout << GridLogMessage
+            << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+            << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
 
-    std::cout.precision(def_prec);
+        std::cout.precision(def_prec);
+        }
     }
 
 };
| @@ -62,7 +62,10 @@ class Representations { | |||||||
|  |  | ||||||
| typedef Representations<FundamentalRepresentation> NoHirep; | typedef Representations<FundamentalRepresentation> NoHirep; | ||||||
| typedef Representations<EmptyRep<typename ScalarImplR::Field> > ScalarFields; | typedef Representations<EmptyRep<typename ScalarImplR::Field> > ScalarFields; | ||||||
|   //typedef Representations<EmptyRep<typename ScalarMatrixImplR::Field> > ScalarMatrixFields; | typedef Representations<EmptyRep<typename ScalarAdjImplR::Field> > ScalarMatrixFields; | ||||||
|  |  | ||||||
|  | template < int Colours>  | ||||||
|  | using ScalarNxNMatrixFields = Representations<EmptyRep<typename ScalarNxNAdjImplR<Colours>::Field> >; | ||||||
|  |  | ||||||
| // Helper classes to access the elements | // Helper classes to access the elements | ||||||
| // Strips the first N parameters from the tuple | // Strips the first N parameters from the tuple | ||||||
|   | |||||||
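The new alias moves the choice of colour count to the instantiation site; for example, a 3x3 scalar matrix theory would instantiate it as (illustrative only):

  typedef ScalarNxNMatrixFields<3> ScalarMatrixFields3;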
| @@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real | |||||||
|     if (maxTau - taus < epsilon){ |     if (maxTau - taus < epsilon){ | ||||||
|         epsilon = maxTau-taus; |         epsilon = maxTau-taus; | ||||||
|     } |     } | ||||||
|     std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; |     //std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl; | ||||||
|     GaugeField Z(U._grid); |     GaugeField Z(U._grid); | ||||||
|     GaugeField Zprime(U._grid); |     GaugeField Zprime(U._grid); | ||||||
|     GaugeField tmp(U._grid), Uprime(U._grid); |     GaugeField tmp(U._grid), Uprime(U._grid); | ||||||
| @@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real | |||||||
|     // adjust integration step |     // adjust integration step | ||||||
|      |      | ||||||
|     taus += epsilon; |     taus += epsilon; | ||||||
|     std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; |     //std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl; | ||||||
|      |      | ||||||
|     epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); |     epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.); | ||||||
|     std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; |     //std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl; | ||||||
|  |  | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const { | |||||||
|     out = in; |     out = in; | ||||||
|     for (unsigned int step = 1; step <= Nstep; step++) { |     for (unsigned int step = 1; step <= Nstep; step++) { | ||||||
|         auto start = std::chrono::high_resolution_clock::now(); |         auto start = std::chrono::high_resolution_clock::now(); | ||||||
|         std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl; |  | ||||||
|         evolve_step(out); |         evolve_step(out); | ||||||
|         auto end = std::chrono::high_resolution_clock::now(); |         auto end = std::chrono::high_resolution_clock::now(); | ||||||
|         std::chrono::duration<double> diff = end - start; |         std::chrono::duration<double> diff = end - start; | ||||||
| @@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re | |||||||
|     unsigned int step = 0; |     unsigned int step = 0; | ||||||
|     do{ |     do{ | ||||||
|         step++; |         step++; | ||||||
|         std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; |         //std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl; | ||||||
|         evolve_step_adaptive(out, maxTau); |         evolve_step_adaptive(out, maxTau); | ||||||
|         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " |         std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " | ||||||
|             << step << "  " |             << step << "  " | ||||||
|   | |||||||
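The silenced messages belong to the adaptive step controller: clip epsilon so the integration ends exactly at maxTau, accept the step, then rescale by the third-root rule appropriate to a third-order integrator with target distance 1e-4. A standalone sketch of one iteration, with an invented error estimate diff:

  #include <cmath>
  #include <cstdio>

  int main(void) {
    double taus = 0.0, maxTau = 1.0, epsilon = 0.1;
    double diff = 5e-4;                       // invented distance between the two integrators
    if (maxTau - taus < epsilon) epsilon = maxTau - taus;
    taus += epsilon;                          // accept the step
    epsilon = epsilon * 0.95 * std::pow(1e-4 / diff, 1. / 3.);
    std::printf("tau = %g, next epsilon = %g\n", taus, epsilon); // next epsilon ~ 0.056
    return 0;
  }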
| @@ -26,12 +26,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| //#include <Grid/Grid.h> | //#include <Grid/Grid.h> | ||||||
|  |  | ||||||
| using namespace Grid; | #ifndef GRID_QCD_GAUGE_FIX_H | ||||||
| using namespace Grid::QCD; | #define GRID_QCD_GAUGE_FIX_H | ||||||
|  | namespace Grid { | ||||||
|  | namespace QCD { | ||||||
|  |  | ||||||
| template <class Gimpl>  | template <class Gimpl>  | ||||||
| class FourierAcceleratedGaugeFixer  : public Gimpl { | class FourierAcceleratedGaugeFixer  : public Gimpl { | ||||||
|   public: |  public: | ||||||
|   INHERIT_GIMPL_TYPES(Gimpl); |   INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|  |  | ||||||
|   typedef typename Gimpl::GaugeLinkField GaugeMat; |   typedef typename Gimpl::GaugeLinkField GaugeMat; | ||||||
| @@ -186,3 +188,6 @@ class FourierAcceleratedGaugeFixer  : public Gimpl { | |||||||
|   }   |   }   | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | } | ||||||
|  | #endif | ||||||
|   | |||||||
| @@ -716,8 +716,7 @@ template<typename GaugeField,typename GaugeMat> | |||||||
|  |  | ||||||
|     for (int a = 0; a < AdjointDimension; a++) { |     for (int a = 0; a < AdjointDimension; a++) { | ||||||
|       generator(a, Ta); |       generator(a, Ta); | ||||||
|       auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep |       pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a); // 2.0 for the normalization of the trace in the fundamental rep | ||||||
|       pokeColour(h_out, tmp, a); |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName) | |||||||
|                       Hdf5Type<unsigned int>::type()); |                       Hdf5Type<unsigned int>::type()); | ||||||
| } | } | ||||||
|  |  | ||||||
| void Hdf5Reader::push(const std::string &s) | bool Hdf5Reader::push(const std::string &s) | ||||||
| { | { | ||||||
|   group_ = group_.openGroup(s); |   group_ = group_.openGroup(s); | ||||||
|   path_.push_back(s); |   path_.push_back(s); | ||||||
|  |    | ||||||
|  |   return true; | ||||||
| } | } | ||||||
|  |  | ||||||
| void Hdf5Reader::pop(void) | void Hdf5Reader::pop(void) | ||||||
|   | |||||||
| @@ -54,7 +54,7 @@ namespace Grid | |||||||
|   public: |   public: | ||||||
|     Hdf5Reader(const std::string &fileName); |     Hdf5Reader(const std::string &fileName); | ||||||
|     virtual ~Hdf5Reader(void) = default; |     virtual ~Hdf5Reader(void) = default; | ||||||
|     void push(const std::string &s); |     bool push(const std::string &s); | ||||||
|     void pop(void); |     void pop(void); | ||||||
|     template <typename U> |     template <typename U> | ||||||
|     void readDefault(const std::string &s, U &output); |     void readDefault(const std::string &s, U &output); | ||||||
|   | |||||||
| @@ -701,9 +701,28 @@ namespace Optimization { | |||||||
|   //Integer Reduce |   //Integer Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){ |   inline Integer Reduce<Integer, __m256i>::operator()(__m256i in){ | ||||||
|     // FIXME unimplemented |     __m128i ret; | ||||||
|     printf("Reduce : Missing integer implementation -> FIX\n"); | #if defined (AVX2) | ||||||
|     assert(0); |     // AVX2 horizontal adds within upper and lower halves of register; use | ||||||
|  |     // SSE to add upper and lower halves for result. | ||||||
|  |     __m256i v1, v2; | ||||||
|  |     __m128i u1, u2; | ||||||
|  |     v1  = _mm256_hadd_epi32(in, in); | ||||||
|  |     v2  = _mm256_hadd_epi32(v1, v1); | ||||||
|  |     u1  = _mm256_castsi256_si128(v2);      // lower half | ||||||
|  |     u2  = _mm256_extracti128_si256(v2, 1); // upper half | ||||||
|  |     ret = _mm_add_epi32(u1, u2); | ||||||
|  | #else | ||||||
|  |     // No AVX horizontal add; extract upper and lower halves of register & use | ||||||
|  |     // SSE intrinsics. | ||||||
|  |     __m128i u1, u2, u3; | ||||||
|  |     u1  = _mm256_extractf128_si256(in, 0); // lower half | ||||||
|  |     u2  = _mm256_extractf128_si256(in, 1); // upper half | ||||||
|  |     u3  = _mm_add_epi32(u1, u2); | ||||||
|  |     u1  = _mm_hadd_epi32(u3, u3); | ||||||
|  |     ret = _mm_hadd_epi32(u1, u1); | ||||||
|  | #endif | ||||||
|  |     return _mm_cvtsi128_si32(ret); | ||||||
|   } |   } | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
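A quick standalone cross-check of the hadd-based reduction against a scalar loop (hypothetical test harness; needs an AVX2-capable host, built with -mavx2):

  #include <immintrin.h>
  #include <cstdio>

  int main(void) {
    int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a));
    __m256i v1 = _mm256_hadd_epi32(in, in);       // pairwise sums within each 128-bit lane
    __m256i v2 = _mm256_hadd_epi32(v1, v1);       // lane totals replicated across each lane
    __m128i lo = _mm256_castsi256_si128(v2);      // lower lane
    __m128i hi = _mm256_extracti128_si256(v2, 1); // upper lane
    int r = _mm_cvtsi128_si32(_mm_add_epi32(lo, hi));
    int s = 0;
    for (int i = 0; i < 8; i++) s += a[i];        // scalar reference
    std::printf("intrinsic %d scalar %d\n", r, s); // both print 36
    return 0;
  }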
| @@ -543,6 +543,24 @@ namespace Optimization { | |||||||
|      u512d conv; conv.v = v1; |      u512d conv; conv.v = v1; | ||||||
|      return conv.f[0]; |      return conv.f[0]; | ||||||
|   } |   } | ||||||
|  |    | ||||||
|  |   //Integer Reduce | ||||||
|  |   template<> | ||||||
|  |   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){ | ||||||
|  |     // No full vector reduce, use AVX to add upper and lower halves of register | ||||||
|  |     // and perform AVX reduction. | ||||||
|  |     __m256i v1, v2, v3; | ||||||
|  |     __m128i u1, u2, ret; | ||||||
|  |     v1  = _mm512_castsi512_si256(in);       // lower half | ||||||
|  |     v2  = _mm512_extracti32x8_epi32(in, 1); // upper half | ||||||
|  |     v3  = _mm256_add_epi32(v1, v2); | ||||||
|  |     v1  = _mm256_hadd_epi32(v3, v3); | ||||||
|  |     v2  = _mm256_hadd_epi32(v1, v1); | ||||||
|  |     u1  = _mm256_castsi256_si128(v2);       // lower half | ||||||
|  |     u2  = _mm256_extracti128_si256(v2, 1);  // upper half | ||||||
|  |     ret = _mm_add_epi32(u1, u2); | ||||||
|  |     return _mm_cvtsi128_si32(ret); | ||||||
|  |   } | ||||||
| #else | #else | ||||||
|   //Complex float Reduce |   //Complex float Reduce | ||||||
|   template<> |   template<> | ||||||
| @@ -570,9 +588,7 @@ namespace Optimization { | |||||||
|   //Integer Reduce |   //Integer Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){ |   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){ | ||||||
|     // FIXME unimplemented |     return _mm512_reduce_add_epi32(in); | ||||||
|     printf("Reduce : Missing integer implementation -> FIX\n"); |  | ||||||
|     assert(0); |  | ||||||
|   } |   } | ||||||
| #endif | #endif | ||||||
|    |    | ||||||
|   | |||||||
| @@ -401,9 +401,7 @@ namespace Optimization { | |||||||
|   //Integer Reduce |   //Integer Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){ |   inline Integer Reduce<Integer, __m512i>::operator()(__m512i in){ | ||||||
|     // FIXME unimplemented |     return _mm512_reduce_add_epi32(in); | ||||||
|     printf("Reduce : Missing integer implementation -> FIX\n"); |  | ||||||
|     assert(0); |  | ||||||
|   } |   } | ||||||
|    |    | ||||||
|    |    | ||||||
|   | |||||||
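Where the compiler provides it, _mm512_reduce_add_epi32 expands to the same shuffle/add ladder; a scalar reference makes the contract easy to test (hypothetical harness, AVX-512 host assumed):

  #include <immintrin.h>
  #include <cstdio>

  int main(void) {
    int a[16], s = 0;
    for (int i = 0; i < 16; i++) { a[i] = i + 1; s += a[i]; }
    __m512i v = _mm512_loadu_si512(a);
    std::printf("intrinsic %d scalar %d\n", _mm512_reduce_add_epi32(v), s); // both print 136
    return 0;
  }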
| @@ -1,4 +1,4 @@ | |||||||
|     /************************************************************************************* | /************************************************************************************* | ||||||
|  |  | ||||||
|     Grid physics library, www.github.com/paboyle/Grid |     Grid physics library, www.github.com/paboyle/Grid | ||||||
|  |  | ||||||
| @@ -6,8 +6,9 @@ | |||||||
|  |  | ||||||
|     Copyright (C) 2015 |     Copyright (C) 2015 | ||||||
|  |  | ||||||
| Author: Peter Boyle <paboyle@ph.ed.ac.uk> |     Author: Nils Meyer <nils.meyer@ur.de> | ||||||
| Author: neo <cossu@post.kek.jp> |     Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |     Author: neo <cossu@post.kek.jp> | ||||||
|  |  | ||||||
|     This program is free software; you can redistribute it and/or modify |     This program is free software; you can redistribute it and/or modify | ||||||
|     it under the terms of the GNU General Public License as published by |     it under the terms of the GNU General Public License as published by | ||||||
| @@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp> | |||||||
|     See the full license in the file "LICENSE" in the top level distribution directory |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|     *************************************************************************************/ |     *************************************************************************************/ | ||||||
|     /*  END LEGAL */ |     /*  END LEGAL */ | ||||||
| //---------------------------------------------------------------------- |  | ||||||
| /*! @file Grid_sse4.h |  | ||||||
|   @brief Optimization libraries for NEON (ARM) instructions set ARMv8 |  | ||||||
|  |  | ||||||
|   Experimental - Using intrinsics - DEVELOPING!  | /* | ||||||
|  |  | ||||||
|  |   ARMv8 NEON intrinsics layer by | ||||||
|  |  | ||||||
|  |   Nils Meyer <nils.meyer@ur.de>, | ||||||
|  |   University of Regensburg, Germany | ||||||
|  |   SFB/TRR55 | ||||||
|  |  | ||||||
| */ | */ | ||||||
| // Time-stamp: <2015-07-10 17:45:09 neo> |  | ||||||
| //---------------------------------------------------------------------- |  | ||||||
|  |  | ||||||
|  | #ifndef GEN_SIMD_WIDTH | ||||||
|  | #define GEN_SIMD_WIDTH 16u | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #include "Grid_generic_types.h" | ||||||
| #include <arm_neon.h> | #include <arm_neon.h> | ||||||
|  |  | ||||||
| // ARMv8 supports double precision | namespace Grid { | ||||||
|  |  | ||||||
| namespace Optimization { | namespace Optimization { | ||||||
|  |  | ||||||
|   template<class vtype> |   template<class vtype> | ||||||
| @@ -46,14 +53,18 @@ namespace Optimization { | |||||||
|     float32x4_t f; |     float32x4_t f; | ||||||
|     vtype v; |     vtype v; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   union u128f { |   union u128f { | ||||||
|     float32x4_t v; |     float32x4_t v; | ||||||
|     float f[4]; |     float f[4]; | ||||||
|   }; |   }; | ||||||
|   union u128d { |   union u128d { | ||||||
|     float64x2_t v; |     float64x2_t v; | ||||||
|     double f[4]; |     double f[2]; | ||||||
|  |   }; | ||||||
|  |   // half precision | ||||||
|  |   union u128h { | ||||||
|  |     float16x8_t v; | ||||||
|  |     uint16_t f[8]; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   struct Vsplat{ |   struct Vsplat{ | ||||||
| @@ -64,20 +75,20 @@ namespace Optimization { | |||||||
|     } |     } | ||||||
|     // Real float |     // Real float | ||||||
|     inline float32x4_t operator()(float a){ |     inline float32x4_t operator()(float a){ | ||||||
|       return vld1q_dup_f32(&a); |       return vdupq_n_f32(a); | ||||||
|     } |     } | ||||||
|     //Complex double |     //Complex double | ||||||
|     inline float32x4_t operator()(double a, double b){ |     inline float64x2_t operator()(double a, double b){ | ||||||
|       float tmp[4]={(float)a,(float)b,(float)a,(float)b}; |       double tmp[2]={a,b}; | ||||||
|       return vld1q_f32(tmp); |       return vld1q_f64(tmp); | ||||||
|     } |     } | ||||||
|     //Real double |     //Real double // N:tbc | ||||||
|     inline float32x4_t operator()(double a){ |     inline float64x2_t operator()(double a){ | ||||||
|       return vld1q_dup_f32(&a); |       return vdupq_n_f64(a); | ||||||
|     } |     } | ||||||
|     //Integer |     //Integer // N:tbc | ||||||
|     inline uint32x4_t operator()(Integer a){ |     inline uint32x4_t operator()(Integer a){ | ||||||
|       return vld1q_dup_u32(&a); |       return vdupq_n_u32(a); | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
| @@ -87,8 +98,8 @@ namespace Optimization { | |||||||
|       vst1q_f32(F, a); |       vst1q_f32(F, a); | ||||||
|     } |     } | ||||||
|     //Double |     //Double | ||||||
|     inline void operator()(float32x4_t a, double* D){ |     inline void operator()(float64x2_t a, double* D){ | ||||||
|       vst1q_f32((float*)D, a); |       vst1q_f64(D, a); | ||||||
|     } |     } | ||||||
|     //Integer |     //Integer | ||||||
|     inline void operator()(uint32x4_t a, Integer* I){ |     inline void operator()(uint32x4_t a, Integer* I){ | ||||||
| @@ -97,54 +108,54 @@ namespace Optimization { | |||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   struct Vstream{ |   struct Vstream{ // N: equivalent of _mm_stream_p* in NEON? | ||||||
|     //Float |     //Float // N:generic | ||||||
|     inline void operator()(float * a, float32x4_t b){ |     inline void operator()(float * a, float32x4_t b){ | ||||||
|      |       memcpy(a,&b,4*sizeof(float)); | ||||||
|     } |     } | ||||||
|     //Double |     //Double // N:generic | ||||||
|     inline void operator()(double * a, float32x4_t b){ |     inline void operator()(double * a, float64x2_t b){ | ||||||
|    |       memcpy(a,&b,2*sizeof(double)); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   // Nils: Vset untested; not used currently in Grid at all; | ||||||
|  |   // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b | ||||||
|   struct Vset{ |   struct Vset{ | ||||||
|     // Complex float  |     // Complex float // N:ok | ||||||
|     inline float32x4_t operator()(Grid::ComplexF *a){ |     inline float32x4_t operator()(Grid::ComplexF *a){ | ||||||
|       float32x4_t foo; |       float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()}; | ||||||
|       return foo; |       return vld1q_f32(tmp); | ||||||
|     } |     } | ||||||
|     // Complex double  |     // Complex double // N:ok | ||||||
|     inline float32x4_t operator()(Grid::ComplexD *a){ |     inline float64x2_t operator()(Grid::ComplexD *a){ | ||||||
|       float32x4_t foo; |       double tmp[2]={a[0].imag(),a[0].real()}; | ||||||
|       return foo; |       return vld1q_f64(tmp); | ||||||
|     } |     } | ||||||
|     // Real float  |     // Real float // N:ok | ||||||
|     inline float32x4_t operator()(float *a){ |     inline float32x4_t operator()(float *a){ | ||||||
|       float32x4_t foo; |       float tmp[4]={a[3],a[2],a[1],a[0]}; | ||||||
|       return foo; |       return vld1q_f32(tmp); | ||||||
|     } |     } | ||||||
|     // Real double |     // Real double // N:ok | ||||||
|     inline float32x4_t operator()(double *a){ |     inline float64x2_t operator()(double *a){ | ||||||
|       float32x4_t foo; |       double tmp[2]={a[1],a[0]}; | ||||||
|       return foo; |       return vld1q_f64(tmp); | ||||||
|     } |     } | ||||||
|     // Integer |     // Integer // N:ok | ||||||
|     inline uint32x4_t operator()(Integer *a){ |     inline uint32x4_t operator()(Integer *a){ | ||||||
|       uint32x4_t foo; |       return vld1q_dup_u32(a); | ||||||
|       return foo; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   // N:leaving as is | ||||||
|   template <typename Out_type, typename In_type> |   template <typename Out_type, typename In_type> | ||||||
|   struct Reduce{ |   struct Reduce{ | ||||||
|     //Need templated class to overload output type |     //Need templated class to overload output type | ||||||
|     //General form must generate error if compiled |     //General form must generate error if compiled | ||||||
|     inline Out_type operator()(In_type in){ |     inline Out_type operator()(In_type in){ | ||||||
|       printf("Error, using wrong Reduce function\n"); |       printf("Error, using wrong Reduce function\n"); | ||||||
|       exit(1); |       exit(1); | ||||||
|       return 0; |       return 0; | ||||||
| @@ -184,26 +195,98 @@ namespace Optimization { | |||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   struct MultRealPart{ | ||||||
|  |     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ | ||||||
|  |       float32x4_t re = vtrn1q_f32(a, a); | ||||||
|  |       return vmulq_f32(re, b); | ||||||
|  |     } | ||||||
|  |     inline float64x2_t operator()(float64x2_t a, float64x2_t b){ | ||||||
|  |       float64x2_t re = vzip1q_f64(a, a); | ||||||
|  |       return vmulq_f64(re, b); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct MaddRealPart{ | ||||||
|  |     inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){ | ||||||
|  |       float32x4_t re = vtrn1q_f32(a, a); | ||||||
|  |       return vfmaq_f32(c, re, b); | ||||||
|  |     } | ||||||
|  |     inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){ | ||||||
|  |       float64x2_t re = vzip1q_f64(a, a); | ||||||
|  |       return vfmaq_f64(c, re, b); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct Div{ | ||||||
|  |     // Real float | ||||||
|  |     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ | ||||||
|  |       return vdivq_f32(a, b); | ||||||
|  |     } | ||||||
|  |     // Real double | ||||||
|  |     inline float64x2_t operator()(float64x2_t a, float64x2_t b){ | ||||||
|  |       return vdivq_f64(a, b); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   struct MultComplex{ |   struct MultComplex{ | ||||||
|     // Complex float |     // Complex float | ||||||
|     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ |     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ | ||||||
|       float32x4_t foo; |  | ||||||
|       return foo; |       float32x4_t r0, r1, r2, r3, r4; | ||||||
|  |  | ||||||
|  |       // a = ar ai Ar Ai | ||||||
|  |       // b = br bi Br Bi | ||||||
|  |       // collect real/imag part, negate bi and Bi | ||||||
|  |       r0 = vtrn1q_f32(b, b);       //  br  br  Br  Br | ||||||
|  |       r1 = vnegq_f32(b);           // -br -bi -Br -Bi | ||||||
|  |       r2 = vtrn2q_f32(b, r1);      //  bi -bi  Bi -Bi | ||||||
|  |  | ||||||
|  |       // the fun part | ||||||
|  |       r3 = vmulq_f32(r2, a);       //  bi*ar -bi*ai ... | ||||||
|  |       r4 = vrev64q_f32(r3);        // -bi*ai  bi*ar ... | ||||||
|  |  | ||||||
|  |       // fma(a,b,c) = a+b*c | ||||||
|  |       return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ... | ||||||
|  |  | ||||||
|  |       // no fma, use mul and add | ||||||
|  |       //float32x4_t r5; | ||||||
|  |       //r5 = vmulq_f32(r0, a); | ||||||
|  |       //return vaddq_f32(r4, r5); | ||||||
|     } |     } | ||||||
|     // Complex double |     // Complex double | ||||||
|     inline float64x2_t operator()(float64x2_t a, float64x2_t b){ |     inline float64x2_t operator()(float64x2_t a, float64x2_t b){ | ||||||
|       float32x4_t foo; |  | ||||||
|       return foo; |       float64x2_t r0, r1, r2, r3, r4; | ||||||
|  |  | ||||||
|  |       // b = br bi | ||||||
|  |       // collect real/imag part, negate bi | ||||||
|  |       r0 = vtrn1q_f64(b, b);       //  br  br | ||||||
|  |       r1 = vnegq_f64(b);           // -br -bi | ||||||
|  |       r2 = vtrn2q_f64(b, r1);      //  bi -bi | ||||||
|  |  | ||||||
|  |       // the fun part | ||||||
|  |       r3 = vmulq_f64(r2, a);       //  bi*ar -bi*ai | ||||||
|  |       r4 = vextq_f64(r3,r3,1);     // -bi*ai  bi*ar | ||||||
|  |  | ||||||
|  |       // fma(a,b,c) = a+b*c | ||||||
|  |       return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi | ||||||
|  |  | ||||||
|  |       // no fma, use mul and add | ||||||
|  |       //float64x2_t r5; | ||||||
|  |       //r5 = vmulq_f64(r0, a); | ||||||
|  |       //return vaddq_f64(r4, r5); | ||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
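The lane bookkeeping above is easy to get wrong, so a scalar cross-check is worth keeping nearby. A minimal AArch64 sketch (harness invented, not part of the patch) replays the single-precision sequence against std::complex:

  #include <arm_neon.h>
  #include <complex>
  #include <cstdio>

  int main(void) {
    std::complex<float> a(1.f, 2.f), b(3.f, 4.f);
    float av[4] = {a.real(), a.imag(), a.real(), a.imag()};
    float bv[4] = {b.real(), b.imag(), b.real(), b.imag()};
    float32x4_t va = vld1q_f32(av), vb = vld1q_f32(bv);
    float32x4_t r0 = vtrn1q_f32(vb, vb);              //  br  br  Br  Br
    float32x4_t r2 = vtrn2q_f32(vb, vnegq_f32(vb));   //  bi -bi  Bi -Bi
    float32x4_t r4 = vrev64q_f32(vmulq_f32(r2, va));  // -bi*ai  bi*ar ...
    float32x4_t rr = vfmaq_f32(r4, r0, va);           //  ar*br-ai*bi  ai*br+ar*bi ...
    float out[4]; vst1q_f32(out, rr);
    std::complex<float> ref = a * b;                  // (-5, 10)
    std::printf("neon (%g,%g) ref (%g,%g)\n", out[0], out[1], ref.real(), ref.imag());
    return 0;
  }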
|   struct Mult{ |   struct Mult{ | ||||||
|     // Real float |     // Real float | ||||||
|     inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ |     inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){ | ||||||
|       return vaddq_f32(vmulq_f32(b,c),a); |       //return vaddq_f32(vmulq_f32(b,c),a); | ||||||
|  |       return vfmaq_f32(a, b, c); | ||||||
|     } |     } | ||||||
|     inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){ |     inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){ | ||||||
|       return vaddq_f64(vmulq_f64(b,c),a); |       //return vaddq_f64(vmulq_f64(b,c),a); | ||||||
|  |       return vfmaq_f64(a, b, c); | ||||||
|     } |     } | ||||||
|     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ |     inline float32x4_t operator()(float32x4_t a, float32x4_t b){ | ||||||
|       return vmulq_f32(a,b); |       return vmulq_f32(a,b); | ||||||
| @@ -221,89 +304,275 @@ namespace Optimization { | |||||||
|   struct Conj{ |   struct Conj{ | ||||||
|     // Complex single |     // Complex single | ||||||
|     inline float32x4_t operator()(float32x4_t in){ |     inline float32x4_t operator()(float32x4_t in){ | ||||||
|       return in; |       // ar ai br bi -> ar -ai br -bi | ||||||
|  |       float32x4_t r0, r1; | ||||||
|  |       r0 = vnegq_f32(in);        // -ar -ai -br -bi | ||||||
|  |       r1 = vrev64q_f32(r0);      // -ai -ar -bi -br | ||||||
|  |       return vtrn1q_f32(in, r1); //  ar -ai  br -bi | ||||||
|     } |     } | ||||||
|     // Complex double |     // Complex double | ||||||
|     //inline float32x4_t operator()(float32x4_t in){ |     inline float64x2_t operator()(float64x2_t in){ | ||||||
|     // return 0; |  | ||||||
|     //} |       float64x2_t r0, r1; | ||||||
|  |       r0 = vextq_f64(in, in, 1);    //  ai  ar | ||||||
|  |       r1 = vnegq_f64(r0);           // -ai -ar | ||||||
|  |       return vextq_f64(r0, r1, 1);  //  ar -ai | ||||||
|  |     } | ||||||
|     // do not define for integer input |     // do not define for integer input | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   struct TimesMinusI{ |   struct TimesMinusI{ | ||||||
|     //Complex single |     //Complex single | ||||||
|     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ |     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ | ||||||
|       return in; |       // ar ai br bi -> ai -ar bi -br | ||||||
|  |       float32x4_t r0, r1; | ||||||
|  |       r0 = vnegq_f32(in);        // -ar -ai -br -bi | ||||||
|  |       r1 = vrev64q_f32(in);      //  ai  ar  bi  br | ||||||
|  |       return vtrn1q_f32(r1, r0); //  ai -ar  bi -br | ||||||
|     } |     } | ||||||
|     //Complex double |     //Complex double | ||||||
|     //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ |     inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ | ||||||
|     //  return in; |       // a ib -> b -ia | ||||||
|     //} |       float64x2_t tmp; | ||||||
|  |       tmp = vnegq_f64(in); | ||||||
|  |       return vextq_f64(in, tmp, 1); | ||||||
|  |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   struct TimesI{ |   struct TimesI{ | ||||||
|     //Complex single |     //Complex single | ||||||
|     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ |     inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ | ||||||
|       //need shuffle |       // ar ai br bi -> -ai ar -bi br | ||||||
|       return in; |       float32x4_t r0, r1; | ||||||
|  |       r0 = vnegq_f32(in);        // -ar -ai -br -bi | ||||||
|  |       r1 = vrev64q_f32(r0);      // -ai -ar -bi -br | ||||||
|  |       return vtrn1q_f32(r1, in); // -ai  ar -bi  br | ||||||
|     } |     } | ||||||
|     //Complex double |     //Complex double | ||||||
|     //inline float32x4_t operator()(float32x4_t in, float32x4_t ret){ |     inline float64x2_t operator()(float64x2_t in, float64x2_t ret){ | ||||||
|     //  return 0; |       // a ib -> -b ia | ||||||
|     //} |       float64x2_t tmp; | ||||||
|  |       tmp = vnegq_f64(in); | ||||||
|  |       return vextq_f64(tmp, in, 1); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct Permute{ | ||||||
|  |  | ||||||
|  |     static inline float32x4_t Permute0(float32x4_t in){ // N:ok | ||||||
|  |       // AB CD -> CD AB | ||||||
|  |       return vextq_f32(in, in, 2); | ||||||
|  |     }; | ||||||
|  |     static inline float32x4_t Permute1(float32x4_t in){ // N:ok | ||||||
|  |       // AB CD -> BA DC | ||||||
|  |       return vrev64q_f32(in); | ||||||
|  |     }; | ||||||
|  |     static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |     static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     static inline float64x2_t Permute0(float64x2_t in){ // N:ok | ||||||
|  |       // AB -> BA | ||||||
|  |       return vextq_f64(in, in, 1); | ||||||
|  |     }; | ||||||
|  |     static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |     static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |     static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle | ||||||
|  |       return in; | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct Rotate{ | ||||||
|  |  | ||||||
|  |     static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok | ||||||
|  |       switch(n){ | ||||||
|  |       case 0: // AB CD -> AB CD | ||||||
|  |         return tRotate<0>(in); | ||||||
|  |         break; | ||||||
|  |       case 1: // AB CD -> BC DA | ||||||
|  |         return tRotate<1>(in); | ||||||
|  |         break; | ||||||
|  |       case 2: // AB CD -> CD AB | ||||||
|  |         return tRotate<2>(in); | ||||||
|  |         break; | ||||||
|  |       case 3: // AB CD -> DA BC | ||||||
|  |         return tRotate<3>(in); | ||||||
|  |         break; | ||||||
|  |       default: assert(0); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok | ||||||
|  |       switch(n){ | ||||||
|  |       case 0: // AB -> AB | ||||||
|  |         return tRotate<0>(in); | ||||||
|  |         break; | ||||||
|  |       case 1: // AB -> BA | ||||||
|  |         return tRotate<1>(in); | ||||||
|  |         break; | ||||||
|  |       default: assert(0); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | // working, but no restriction on n | ||||||
|  | //    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); }; | ||||||
|  | //    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); }; | ||||||
|  |  | ||||||
|  | // restriction on n | ||||||
|  |     template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); }; | ||||||
|  |     template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); }; | ||||||
|  |  | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   struct PrecisionChange { | ||||||
|  |  | ||||||
|  |     static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) { | ||||||
|  |       float16x4_t h = vcvt_f16_f32(a); | ||||||
|  |       return vcvt_high_f16_f32(h, b); | ||||||
|  |     } | ||||||
|  |     static inline void  HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) { | ||||||
|  |       sb = vcvt_high_f32_f16(h); | ||||||
|  |       // there is no direct conversion from the lower half of float16x8_t to float32x4_t | ||||||
|  |       // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang | ||||||
|  |       //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang | ||||||
|  |       // workaround for clang | ||||||
|  |       uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h); | ||||||
|  |       float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2)); | ||||||
|  |       sa = vcvt_high_f32_f16(h1); | ||||||
|  |     } | ||||||
|  |     static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) { | ||||||
|  |       float32x2_t s = vcvt_f32_f64(a); | ||||||
|  |       return vcvt_high_f32_f64(s, b); | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) { | ||||||
|  |       b = vcvt_high_f64_f32(s); | ||||||
|  |       // there is no direct conversion from lower float32x4_t to float64x2_t | ||||||
|  |       float32x4_t s1 = vextq_f32(s, s, 2); | ||||||
|  |       a = vcvt_high_f64_f32(s1); | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) { | ||||||
|  |       float32x4_t s1 = DtoS(a, b); | ||||||
|  |       float32x4_t s2 = DtoS(c, d); | ||||||
|  |       return StoH(s1, s2); | ||||||
|  |     } | ||||||
|  |     static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) { | ||||||
|  |       float32x4_t s1, s2; | ||||||
|  |       HtoS(h, s1, s2); | ||||||
|  |       StoD(s1, a, b); | ||||||
|  |       StoD(s2, c, d); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
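A round trip through DtoS/StoD should reproduce the inputs; a small AArch64 check using the same intrinsic sequence (standalone sketch, not part of the patch):

  #include <arm_neon.h>
  #include <cstdio>

  int main(void) {
    double in[4] = {1.0, 2.0, 3.0, 4.0}, out[4];
    float64x2_t a = vld1q_f64(in), b = vld1q_f64(in + 2);
    float32x4_t s = vcvt_high_f32_f64(vcvt_f32_f64(a), b);  // DtoS: a in low floats, b in high
    float64x2_t b2 = vcvt_high_f64_f32(s);                  // StoD: high floats back to doubles
    float64x2_t a2 = vcvt_high_f64_f32(vextq_f32(s, s, 2)); // shift low floats up, then convert
    vst1q_f64(out, a2); vst1q_f64(out + 2, b2);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 2 3 4
    return 0;
  }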
|  |   ////////////////////////////////////////////// | ||||||
|  |   // Exchange support | ||||||
|  |  | ||||||
|  |   struct Exchange{ | ||||||
|  |     static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ | ||||||
|  |       // in1: ABCD -> out1: ABEF | ||||||
|  |       // in2: EFGH -> out2: CDGH | ||||||
|  |  | ||||||
|  |       // z: CDAB | ||||||
|  |       float32x4_t z = vextq_f32(in1, in1, 2); | ||||||
|  |       // out1: ABEF | ||||||
|  |       out1 = vextq_f32(z, in2, 2); | ||||||
|  |  | ||||||
|  |       // z: GHEF | ||||||
|  |       z = vextq_f32(in2, in2, 2); | ||||||
|  |       // out2: CDGH | ||||||
|  |       out2 = vextq_f32(in1, z, 2); | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ | ||||||
|  |       // in1: ABCD -> out1: AECG | ||||||
|  |       // in2: EFGH -> out2: BFDH | ||||||
|  |       out1 = vtrn1q_f32(in1, in2); | ||||||
|  |       out2 = vtrn2q_f32(in1, in2); | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|  |     // double precision | ||||||
|  |     static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ | ||||||
|  |       // in1: AB -> out1: AC | ||||||
|  |       // in2: CD -> out2: BD | ||||||
|  |       out1 = vzip1q_f64(in1, in2); | ||||||
|  |       out2 = vzip2q_f64(in1, in2); | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|  |     static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){ | ||||||
|  |       assert(0); | ||||||
|  |       return; | ||||||
|  |     }; | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   ////////////////////////////////////////////// |   ////////////////////////////////////////////// | ||||||
|   // Some Template specialization |   // Some Template specialization | ||||||
|   template < typename vtype >  |  | ||||||
|     void permute(vtype &a, vtype b, int perm) { |  | ||||||
|  |  | ||||||
|   };  |  | ||||||
|  |  | ||||||
|   //Complex float Reduce |   //Complex float Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){ |   inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){ | ||||||
|     return 0; |     float32x4_t v1; // two complex | ||||||
|  |     v1 = Optimization::Permute::Permute0(in); | ||||||
|  |     v1 = vaddq_f32(v1,in); | ||||||
|  |     u128f conv;    conv.v=v1; | ||||||
|  |     return Grid::ComplexF(conv.f[0],conv.f[1]); | ||||||
|   } |   } | ||||||
|   //Real float Reduce |   //Real float Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){ |   inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){ | ||||||
|     float32x2_t high = vget_high_f32(in); |     return vaddvq_f32(in); | ||||||
|     float32x2_t low = vget_low_f32(in); |  | ||||||
|     float32x2_t tmp = vadd_f32(low, high); |  | ||||||
|     float32x2_t sum = vpadd_f32(tmp, tmp); |  | ||||||
|     return vget_lane_f32(sum,0); |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|  |  | ||||||
|   //Complex double Reduce |   //Complex double Reduce | ||||||
|   template<> |   template<> // N:by Boyle | ||||||
|   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ |   inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){ | ||||||
|     return 0; |     u128d conv; conv.v = in; | ||||||
|  |     return Grid::ComplexD(conv.f[0],conv.f[1]); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   //Real double Reduce |   //Real double Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){ |   inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){ | ||||||
|     float64x2_t sum = vpaddq_f64(in, in); |     return vaddvq_f64(in); | ||||||
|     return vgetq_lane_f64(sum,0); |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   //Integer Reduce |   //Integer Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){ |   inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){ | ||||||
|     // FIXME unimplemented |     // FIXME unimplemented | ||||||
|    printf("Reduce : Missing integer implementation -> FIX\n"); |     printf("Reduce : Missing integer implementation -> FIX\n"); | ||||||
|     assert(0); |     assert(0); | ||||||
|   } |   } | ||||||
| } | } | ||||||
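The one remaining FIXME looks straightforward to close on ARMv8, since the across-vector add used for the float reductions above has an unsigned counterpart; a possible (untested) completion:

  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
    return vaddvq_u32(in); // add across vector, as vaddvq_f32/vaddvq_f64 above
  }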
|  |  | ||||||
| ////////////////////////////////////////////////////////////////////////////////////// | ////////////////////////////////////////////////////////////////////////////////////// | ||||||
| // Here assign types | // Here assign types | ||||||
| namespace Grid { |  | ||||||
|  |  | ||||||
|  | // typedef Optimization::vech SIMD_Htype; // Reduced precision type | ||||||
|  |   typedef float16x8_t  SIMD_Htype; // Half precision type | ||||||
|   typedef float32x4_t  SIMD_Ftype; // Single precision type |   typedef float32x4_t  SIMD_Ftype; // Single precision type | ||||||
|   typedef float64x2_t  SIMD_Dtype; // Double precision type |   typedef float64x2_t  SIMD_Dtype; // Double precision type | ||||||
|   typedef uint32x4_t   SIMD_Itype; // Integer type |   typedef uint32x4_t   SIMD_Itype; // Integer type | ||||||
| @@ -312,13 +581,6 @@ namespace Grid { | |||||||
|   inline void prefetch_HINT_T0(const char *ptr){}; |   inline void prefetch_HINT_T0(const char *ptr){}; | ||||||
|  |  | ||||||
|  |  | ||||||
|   // Gpermute function |  | ||||||
|   template < typename VectorSIMD >  |  | ||||||
|     inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) { |  | ||||||
|     Optimization::permute(y.v,b.v,perm); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   // Function name aliases |   // Function name aliases | ||||||
|   typedef Optimization::Vsplat   VsplatSIMD; |   typedef Optimization::Vsplat   VsplatSIMD; | ||||||
|   typedef Optimization::Vstore   VstoreSIMD; |   typedef Optimization::Vstore   VstoreSIMD; | ||||||
| @@ -332,8 +594,11 @@ namespace Grid { | |||||||
|   // Arithmetic operations |   // Arithmetic operations | ||||||
|   typedef Optimization::Sum         SumSIMD; |   typedef Optimization::Sum         SumSIMD; | ||||||
|   typedef Optimization::Sub         SubSIMD; |   typedef Optimization::Sub         SubSIMD; | ||||||
|  |   typedef Optimization::Div         DivSIMD; | ||||||
|   typedef Optimization::Mult        MultSIMD; |   typedef Optimization::Mult        MultSIMD; | ||||||
|   typedef Optimization::MultComplex MultComplexSIMD; |   typedef Optimization::MultComplex MultComplexSIMD; | ||||||
|  |   typedef Optimization::MultRealPart MultRealPartSIMD; | ||||||
|  |   typedef Optimization::MaddRealPart MaddRealPartSIMD; | ||||||
|   typedef Optimization::Conj        ConjSIMD; |   typedef Optimization::Conj        ConjSIMD; | ||||||
|   typedef Optimization::TimesMinusI TimesMinusISIMD; |   typedef Optimization::TimesMinusI TimesMinusISIMD; | ||||||
|   typedef Optimization::TimesI      TimesISIMD; |   typedef Optimization::TimesI      TimesISIMD; | ||||||
|   | |||||||
| @@ -374,6 +374,84 @@ namespace Optimization { | |||||||
|     // Complex float |     // Complex float | ||||||
|     FLOAT_WRAP_2(operator(), inline) |     FLOAT_WRAP_2(operator(), inline) | ||||||
|   }; |   }; | ||||||
|  | #define USE_FP16 | ||||||
|  |   struct PrecisionChange { | ||||||
|  |     static inline vech StoH (const vector4float &a, const vector4float &b) { | ||||||
|  |       vech ret; | ||||||
|  |       std::cout << GridLogError << "QPX single to half precision conversion not yet supported." << std::endl; | ||||||
|  |       assert(0); | ||||||
|  |       return ret; | ||||||
|  |     } | ||||||
|  |     static inline void  HtoS (vech h, vector4float &sa, vector4float &sb) { | ||||||
|  |       std::cout << GridLogError << "QPX half to single precision conversion not yet supported." << std::endl; | ||||||
|  |       assert(0); | ||||||
|  |     } | ||||||
|  |     static inline vector4float DtoS (vector4double a, vector4double b) { | ||||||
|  |       vector4float ret; | ||||||
|  |       std::cout << GridLogError << "QPX double to single precision conversion not yet supported." << std::endl; | ||||||
|  |       assert(0); | ||||||
|  |       return ret; | ||||||
|  |     } | ||||||
|  |     static inline void StoD (vector4float s, vector4double &a, vector4double &b) { | ||||||
|  |       std::cout << GridLogError << "QPX single to double precision conversion not yet supported." << std::endl; | ||||||
|  |       assert(0); | ||||||
|  |     } | ||||||
|  |     static inline vech DtoH (vector4double a, vector4double b,  | ||||||
|  |                              vector4double c, vector4double d) { | ||||||
|  |       vech ret; | ||||||
|  |       std::cout << GridLogError << "QPX double to half precision conversion not yet supported." << std::endl; | ||||||
|  |       assert(0); | ||||||
|  |       return ret; | ||||||
|  |     } | ||||||
|  |     static inline void HtoD (vech h, vector4double &a, vector4double &b,  | ||||||
|  |                                      vector4double &c, vector4double &d) { | ||||||
|  |       std::cout << GridLogError << "QPX half to double precision conversion not yet supported." << std::endl; | ||||||
|  |       assert(0); | ||||||
|  |     } | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////// | ||||||
|  |   // Exchange support | ||||||
|  | #define FLOAT_WRAP_EXCHANGE(fn) \ | ||||||
|  |   static inline void fn(vector4float &out1, vector4float &out2, \ | ||||||
|  |                         vector4float in1,  vector4float in2) \ | ||||||
|  |   { \ | ||||||
|  |     vector4double out1d, out2d, in1d, in2d; \ | ||||||
|  |     in1d  = Vset()(in1);   \ | ||||||
|  |     in2d  = Vset()(in2);   \ | ||||||
|  |     fn(out1d, out2d, in1d, in2d); \ | ||||||
|  |     Vstore()(out1d, out1); \ | ||||||
|  |     Vstore()(out2d, out2); \ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   struct Exchange{ | ||||||
|  |  | ||||||
|  |     // double precision | ||||||
|  |     static inline void Exchange0(vector4double &out1, vector4double &out2, | ||||||
|  |                                  vector4double in1,  vector4double in2) { | ||||||
|  |       out1 = vec_perm(in1, in2, vec_gpci(0145)); | ||||||
|  |       out2 = vec_perm(in1, in2, vec_gpci(02367)); | ||||||
|  |     } | ||||||
|  |     static inline void Exchange1(vector4double &out1, vector4double &out2, | ||||||
|  |                                  vector4double in1,  vector4double in2) { | ||||||
|  |       out1 = vec_perm(in1, in2, vec_gpci(0426)); | ||||||
|  |       out2 = vec_perm(in1, in2, vec_gpci(01537)); | ||||||
|  |     } | ||||||
|  |     static inline void Exchange2(vector4double &out1, vector4double &out2, | ||||||
|  |                                  vector4double in1,  vector4double in2) { | ||||||
|  |       assert(0); | ||||||
|  |     } | ||||||
|  |     static inline void Exchange3(vector4double &out1, vector4double &out2, | ||||||
|  |                                  vector4double in1,  vector4double in2) { | ||||||
|  |       assert(0); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // single precision | ||||||
|  |     FLOAT_WRAP_EXCHANGE(Exchange0); | ||||||
|  |     FLOAT_WRAP_EXCHANGE(Exchange1); | ||||||
|  |     FLOAT_WRAP_EXCHANGE(Exchange2); | ||||||
|  |     FLOAT_WRAP_EXCHANGE(Exchange3); | ||||||
|  |   }; | ||||||
|  |  | ||||||
|   struct Permute{ |   struct Permute{ | ||||||
|     //Complex double |     //Complex double | ||||||
| @@ -497,15 +575,19 @@ namespace Optimization { | |||||||
|    |    | ||||||
|   //Integer Reduce |   //Integer Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Integer Reduce<Integer, int>::operator()(int in){ |   inline Integer Reduce<Integer, veci>::operator()(veci in){ | ||||||
|     // FIXME unimplemented |     Integer a = 0; | ||||||
|     printf("Reduce : Missing integer implementation -> FIX\n"); |     for (unsigned int i = 0; i < W<Integer>::r; ++i) | ||||||
|     assert(0); |     { | ||||||
|  |         a += in.v[i]; | ||||||
|  |     } | ||||||
|  |     return a; | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
| //////////////////////////////////////////////////////////////////////////////// | //////////////////////////////////////////////////////////////////////////////// | ||||||
| // Here assign types | // Here assign types | ||||||
|  | typedef Optimization::vech         SIMD_Htype;  // Half precision type | ||||||
| typedef Optimization::vector4float SIMD_Ftype;  // Single precision type | typedef Optimization::vector4float SIMD_Ftype;  // Single precision type | ||||||
| typedef vector4double              SIMD_Dtype; // Double precision type | typedef vector4double              SIMD_Dtype; // Double precision type | ||||||
| typedef Optimization::veci         SIMD_Itype; // Integer type | typedef Optimization::veci         SIMD_Itype; // Integer type | ||||||
|   | |||||||
| @@ -570,9 +570,9 @@ namespace Optimization { | |||||||
|   //Integer Reduce |   //Integer Reduce | ||||||
|   template<> |   template<> | ||||||
|   inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){ |   inline Integer Reduce<Integer, __m128i>::operator()(__m128i in){ | ||||||
|     // FIXME unimplemented |     __m128i v1 = _mm_hadd_epi32(in, in); | ||||||
|    printf("Reduce : Missing integer implementation -> FIX\n"); |     __m128i v2 = _mm_hadd_epi32(v1, v1); | ||||||
|     assert(0); |     return _mm_cvtsi128_si32(v2); | ||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
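Same pattern one width down: two rounds of _mm_hadd_epi32 leave the total in every element. A tiny check (hypothetical harness, SSSE3 required):

  #include <immintrin.h>
  #include <cstdio>

  int main(void) {
    int a[4] = {1, 2, 3, 4};
    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a));
    __m128i v1 = _mm_hadd_epi32(in, in);   // [3 7 3 7]
    __m128i v2 = _mm_hadd_epi32(v1, v1);   // [10 10 10 10]
    std::printf("%d\n", _mm_cvtsi128_si32(v2)); // 10
    return 0;
  }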
| @@ -53,7 +53,7 @@ directory | |||||||
| #if defined IMCI | #if defined IMCI | ||||||
| #include "Grid_imci.h" | #include "Grid_imci.h" | ||||||
| #endif | #endif | ||||||
| #ifdef NEONv8 | #ifdef NEONV8 | ||||||
| #include "Grid_neon.h" | #include "Grid_neon.h" | ||||||
| #endif | #endif | ||||||
| #if defined QPX | #if defined QPX | ||||||
|   | |||||||
| @@ -32,8 +32,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| int LebesgueOrder::UseLebesgueOrder; | int LebesgueOrder::UseLebesgueOrder; | ||||||
|  | #ifdef KNL | ||||||
| std::vector<int> LebesgueOrder::Block({8,2,2,2}); | std::vector<int> LebesgueOrder::Block({8,2,2,2}); | ||||||
|  | #else | ||||||
|  | std::vector<int> LebesgueOrder::Block({2,2,2,2}); | ||||||
|  | #endif | ||||||
| LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){ | LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){ | ||||||
|   n--;           // 1000 0011 --> 1000 0010 |   n--;           // 1000 0011 --> 1000 0010 | ||||||
|   n |= n >> 1;   // 1000 0010 | 0100 0001 = 1100 0011 |   n |= n >> 1;   // 1000 0010 | 0100 0001 = 1100 0011 | ||||||
| @@ -51,8 +54,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid) | |||||||
|   if ( Block[0]==0) ZGraph(); |   if ( Block[0]==0) ZGraph(); | ||||||
|   else if ( Block[1]==0) NoBlocking(); |   else if ( Block[1]==0) NoBlocking(); | ||||||
|   else CartesianBlocking(); |   else CartesianBlocking(); | ||||||
| } |  | ||||||
|  |  | ||||||
|  |   if (0) { | ||||||
|  |     std::cout << "Thread Interleaving"<<std::endl; | ||||||
|  |     ThreadInterleave(); | ||||||
|  |   }  | ||||||
|  | } | ||||||
|  | void LebesgueOrder::ThreadInterleave(void) | ||||||
|  | { | ||||||
|  |   std::vector<IndexInteger> reorder = _LebesgueReorder; | ||||||
|  |   std::vector<IndexInteger> throrder; | ||||||
|  |   int vol = _LebesgueReorder.size(); | ||||||
|  |   int threads = GridThread::GetThreads(); | ||||||
|  |   int blockbits=3; | ||||||
|  |   int blocklen = 8; | ||||||
|  |   int msk      = 0x7; | ||||||
|  |  | ||||||
|  |   for(int t=0;t<threads;t++){ | ||||||
|  |     for(int ss=0;ss<vol;ss++){ | ||||||
|  |        if ( ( ss >> blockbits) % threads == t ) {  | ||||||
|  |          throrder.push_back(reorder[ss]); | ||||||
|  |        } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   _LebesgueReorder = throrder; | ||||||
|  | } | ||||||
| void LebesgueOrder::NoBlocking(void)  | void LebesgueOrder::NoBlocking(void)  | ||||||
| { | { | ||||||
|   std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl; |   std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl; | ||||||
|   | |||||||
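ThreadInterleave deals blocks of 2^blockbits consecutive sites round-robin to the threads, so each thread walks contiguous cache blocks. A toy version with invented sizes shows the resulting order:

  #include <cstdio>
  #include <vector>

  int main(void) {
    const int threads = 2, blockbits = 3, vol = 32; // invented sizes; blocks of 8 sites
    std::vector<int> reorder(vol), throrder;
    for (int i = 0; i < vol; i++) reorder[i] = i;   // stands in for _LebesgueReorder
    for (int t = 0; t < threads; t++)
      for (int ss = 0; ss < vol; ss++)
        if (((ss >> blockbits) % threads) == t) throrder.push_back(reorder[ss]);
    for (int s : throrder) std::printf("%d ", s);   // 0..7 16..23 then 8..15 24..31
    std::printf("\n");
    return 0;
  }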
| @@ -70,6 +70,8 @@ namespace Grid { | |||||||
| 		  std::vector<IndexInteger> & xi, | 		  std::vector<IndexInteger> & xi, | ||||||
| 		  std::vector<IndexInteger> &dims); | 		  std::vector<IndexInteger> &dims); | ||||||
|  |  | ||||||
|  |     void ThreadInterleave(void); | ||||||
|  |  | ||||||
|   private: |   private: | ||||||
|     std::vector<IndexInteger> _LebesgueReorder; |     std::vector<IndexInteger> _LebesgueReorder; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   // Timing info; ugly; possibly temporary |   // Timing info; ugly; possibly temporary | ||||||
|   ///////////////////////////////////////// |   ///////////////////////////////////////// | ||||||
|   double commtime; |   double commtime; | ||||||
|  |   double mpi3synctime; | ||||||
|  |   double mpi3synctime_g; | ||||||
|  |   double shmmergetime; | ||||||
|   double gathertime; |   double gathertime; | ||||||
|   double gathermtime; |   double gathermtime; | ||||||
|   double halogtime; |   double halogtime; | ||||||
| @@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   double splicetime; |   double splicetime; | ||||||
|   double nosplicetime; |   double nosplicetime; | ||||||
|   double calls; |   double calls; | ||||||
|  |   std::vector<double> comm_bytes_thr; | ||||||
|  |   std::vector<double> comm_time_thr; | ||||||
|  |   std::vector<double> comm_enter_thr; | ||||||
|  |   std::vector<double> comm_leave_thr; | ||||||
|  |  | ||||||
|   //////////////////////////////////////// |   //////////////////////////////////////// | ||||||
|   // Stencil query |   // Stencil query | ||||||
| @@ -248,35 +255,120 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal | |||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|   // Comms packet queue for asynch thread |   // Comms packet queue for asynch thread | ||||||
|   ////////////////////////////////////////// |   ////////////////////////////////////////// | ||||||
|  |   void CommunicateThreaded() | ||||||
|  |   { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |     // must be called in a parallel region | ||||||
|  |     int mythread = omp_get_thread_num(); | ||||||
|  |     int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  | #else | ||||||
|  |     int mythread = 0; | ||||||
|  |     int nthreads = 1; | ||||||
|  | #endif | ||||||
|  |     if (nthreads == -1) nthreads = 1; | ||||||
|  |     if (mythread < nthreads) { | ||||||
|  |       comm_enter_thr[mythread] = usecond(); | ||||||
|  |       for (int i = mythread; i < Packets.size(); i += nthreads) { | ||||||
|  | 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf, | ||||||
|  | 						      Packets[i].to_rank, | ||||||
|  | 						      Packets[i].recv_buf, | ||||||
|  | 						      Packets[i].from_rank, | ||||||
|  | 						      Packets[i].bytes,i); | ||||||
|  | 	comm_bytes_thr[mythread] += bytes; | ||||||
|  |       } | ||||||
|  |       comm_leave_thr[mythread]= usecond(); | ||||||
|  |       comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread]; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   void CollateThreads(void) | ||||||
|  |   { | ||||||
|  |     int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  |     double first=0.0; | ||||||
|  |     double last =0.0; | ||||||
|  |  | ||||||
|  |     for(int t=0;t<nthreads;t++) { | ||||||
|  |  | ||||||
|  |       double t0 = comm_enter_thr[t]; | ||||||
|  |       double t1 = comm_leave_thr[t]; | ||||||
|  |       comms_bytes+=comm_bytes_thr[t]; | ||||||
|  |  | ||||||
|  |       comm_enter_thr[t] = 0.0; | ||||||
|  |       comm_leave_thr[t] = 0.0; | ||||||
|  |       comm_time_thr[t]   = 0.0; | ||||||
|  |       comm_bytes_thr[t]=0; | ||||||
|  |  | ||||||
|  |       if ( first == 0.0 ) first = t0;                   // first is t0 | ||||||
|  |       if ( (t0 > 0.0) && ( t0 < first ) ) first = t0;   // min time seen | ||||||
|  |  | ||||||
|  |       if ( t1 > last ) last = t1;                       // max time seen | ||||||
|  |        | ||||||
|  |     } | ||||||
|  |     commtime+= last-first; | ||||||
|  |   } | ||||||
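Note the accounting: commtime grows by the wall-clock span from the earliest enter to the latest leave, not by the sum of per-thread times, so overlapping transfers are not double counted. A toy computation with invented timestamps:

  #include <cstdio>

  int main(void) {
    double enter[3] = {10.0, 12.0, 11.0}, leave[3] = {20.0, 25.0, 22.0}; // invented usec stamps
    double first = 0.0, last = 0.0;
    for (int t = 0; t < 3; t++) {
      if (first == 0.0) first = enter[t];
      if (enter[t] > 0.0 && enter[t] < first) first = enter[t];
      if (leave[t] > last) last = leave[t];
    }
    std::printf("span = %g\n", last - first); // 15, although the per-thread times sum to 34
    return 0;
  }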
|   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     reqs.resize(Packets.size()); |     reqs.resize(Packets.size()); | ||||||
|     commtime-=usecond(); |     commtime-=usecond(); | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i], |       comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i], | ||||||
| 					  Packets[i].send_buf, | 						     Packets[i].send_buf, | ||||||
| 					  Packets[i].to_rank, | 						     Packets[i].to_rank, | ||||||
| 					  Packets[i].recv_buf, | 						     Packets[i].recv_buf, | ||||||
| 					  Packets[i].from_rank, | 						     Packets[i].from_rank, | ||||||
| 					  Packets[i].bytes); | 						     Packets[i].bytes,i); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) |   void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs) | ||||||
|   { |   { | ||||||
|     for(int i=0;i<Packets.size();i++){ |     for(int i=0;i<Packets.size();i++){ | ||||||
|       _grid->StencilSendToRecvFromComplete(reqs[i]); |       _grid->StencilSendToRecvFromComplete(reqs[i],i); | ||||||
|     } |     } | ||||||
|     commtime+=usecond(); |     commtime+=usecond(); | ||||||
|   } |   } | ||||||
|  |   void Communicate(void) | ||||||
|  |   { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | #pragma omp parallel  | ||||||
|  |     { | ||||||
|  |       // must be called in a parallel region | ||||||
|  |       int mythread  = omp_get_thread_num(); | ||||||
|  |       int maxthreads= omp_get_max_threads(); | ||||||
|  |       int nthreads = CartesianCommunicator::nCommThreads; | ||||||
|  |       assert(nthreads <= maxthreads); | ||||||
|  |  | ||||||
|  |       if (nthreads == -1) nthreads = 1; | ||||||
|  | #else | ||||||
|  |       int mythread = 0; | ||||||
|  |       int nthreads = 1; | ||||||
|  | #endif | ||||||
|  |       if (mythread < nthreads) { | ||||||
|  | 	for (int i = mythread; i < Packets.size(); i += nthreads) { | ||||||
|  | 	  double start = usecond(); | ||||||
|  | 	  comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf, | ||||||
|  | 								   Packets[i].to_rank, | ||||||
|  | 								   Packets[i].recv_buf, | ||||||
|  | 								   Packets[i].from_rank, | ||||||
|  | 								   Packets[i].bytes,i); | ||||||
|  | 	  comm_time_thr[mythread] += usecond() - start; | ||||||
|  | 	} | ||||||
|  |       } | ||||||
|  | #ifdef GRID_OMP | ||||||
|  |     } | ||||||
|  | #endif | ||||||
|  |   } | ||||||
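The new Communicate() path above sends each direction synchronously but can spread the directions over a pool of OpenMP threads: thread t services packets t, t+nthreads, t+2*nthreads, and so on, while threads beyond nCommThreads skip communication entirely. A minimal standalone sketch of that round-robin mapping (illustrative sizes, compile with -fopenmp):

    #include <cstdio>
    #include <omp.h>

    int main(void) {
      const int npackets = 8;   // e.g. 2*Nd stencil directions
      const int ncomm    = 2;   // stand-in for CartesianCommunicator::nCommThreads
    #pragma omp parallel
      {
        int me = omp_get_thread_num();
        if (me < ncomm)         // only the first ncomm threads drive communication
          for (int i = me; i < npackets; i += ncomm)
            std::printf("thread %d handles packet %d\n", me, i);
      }
      return 0;
    }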
 
   template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress)
   {
     std::vector<std::vector<CommsRequest_t> > reqs;
     Prepare();
     HaloGather(source,compress);
-    CommunicateBegin(reqs);
-    CommunicateComplete(reqs);
+    // Concurrent
+    //CommunicateBegin(reqs);
+    //CommunicateComplete(reqs);
+    // Sequential, possibly threaded
+    Communicate();
     CommsMergeSHM(compress);
     CommsMerge(compress);
   }
@@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   template<class compressor>
   void HaloGather(const Lattice<vobj> &source,compressor &compress)
   {
+    mpi3synctime_g-=usecond();
     _grid->StencilBarrier();// Synch shared memory on a single node
+    mpi3synctime_g+=usecond();
 
     // conformable(source._grid,_grid);
     assert(source._grid==_grid);
@@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
     CommsMerge(decompress,Mergers,Decompressions);
   }
   template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
+    mpi3synctime-=usecond();
     _grid->StencilBarrier();// Synch shared memory on a single node
+    mpi3synctime+=usecond();
+    shmmergetime-=usecond();
     CommsMerge(decompress,MergersSHM,DecompressionsSHM);
+    shmmergetime+=usecond();
   }
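The three counters introduced here (mpi3synctime_g for the pre-gather barrier, mpi3synctime for the pre-merge barrier, shmmergetime for the shared-memory merge itself) all use the stopwatch idiom seen throughout the stencil code: subtract usecond() on entry, add it on exit, so the accumulator gains the elapsed microseconds of every call. A self-contained sketch of the idiom (usecond_sketch mimics Grid's usecond(); the barrier body is a stand-in):

    #include <sys/time.h>

    static double usecond_sketch(void) {          // wall clock in microseconds
      struct timeval tv;
      gettimeofday(&tv, nullptr);
      return 1.0e6 * tv.tv_sec + tv.tv_usec;
    }

    static double barrier_time = 0.0;

    void TimedBarrier(void) {
      barrier_time -= usecond_sketch();           // entry: subtract start stamp
      /* ... synchronisation work being timed ... */
      barrier_time += usecond_sketch();           // exit: net gain is the elapsed time
    }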
 
   template<class decompressor>
@@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
                   int checkerboard,
                   const std::vector<int> &directions,
                   const std::vector<int> &distances)
-   :   _permute_type(npoints), _comm_buf_size(npoints)
+   : _permute_type(npoints),
+     _comm_buf_size(npoints),
+     comm_bytes_thr(npoints),
+     comm_enter_thr(npoints),
+     comm_leave_thr(npoints),
+     comm_time_thr(npoints)
   {
     face_table_computed=0;
     _npoints = npoints;
@@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
   void ZeroCounters(void) {
     gathertime = 0.;
     commtime = 0.;
+    mpi3synctime=0.;
+    mpi3synctime_g=0.;
+    shmmergetime=0.;
+    for(int i=0;i<_npoints;i++){
+      comm_time_thr[i]=0;
+      comm_bytes_thr[i]=0;
+      comm_enter_thr[i]=0;
+      comm_leave_thr[i]=0;
+    }
     halogtime = 0.;
     mergetime = 0.;
     decompresstime = 0.;
@@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
     RealD NP = _grid->_Nprocessors;
     RealD NN = _grid->NodeCount();
+    double t = 0;
+    // if comm_time_thr is set, the transfers ran in parallel:
+    // take the max over threads but add up the bytes
+    int threaded = 0 ;
+    for (int i = 0; i < 8; ++i) {
+      if ( comm_time_thr[i]>0.0 ) {
+        threaded = 1;
+        comms_bytes += comm_bytes_thr[i];
+        if (t < comm_time_thr[i]) t = comm_time_thr[i];
+      }
+    }
+    if (threaded) commtime += t;
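Worked example of the collation above: if two comm threads recorded 100 us and 120 us while moving 0.5 GB each, the report adds max(100,120) = 120 us to commtime and the full 1.0 GB to comms_bytes, since the transfers overlapped in time but the bytes did not. Note also that this scan runs over a fixed 8 slots while the per-thread arrays are sized by npoints in the constructor; that matches the usual 8-point 4D stencil but is worth checking for stencils with a different point count.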
 
     _grid->GlobalSum(commtime);    commtime/=NP;
     if ( calls > 0. ) {
@@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
         std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
         std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
       }
+      PRINTIT(mpi3synctime);
+      PRINTIT(mpi3synctime_g);
+      PRINTIT(shmmergetime);
       PRINTIT(splicetime);
       PRINTIT(nosplicetime);
     }
 
@@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
 strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
                  const iVector<vtype,N> * __restrict__ rhs,
                  const iScalar<mtype> * __restrict__ lhs){
-    mult(ret,lhs,rhs);
+    for(int c1=0;c1<N;c1++){
+        mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal);
+    }
 }
 
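The replaced one-liner forwarded to what is presumably the iScalar-times-iVector overload with the operands swapped; for commuting component types that is harmless, but the explicit loop preserves the vector-times-scalar operand order per component. A toy illustration of why the order can matter (hypothetical 2x2 type, not Grid's tensor classes):

    #include <cassert>

    struct M2 { double a, b, c, d; };   // 2x2 matrix [[a,b],[c,d]]

    M2 mul(const M2 &x, const M2 &y) {  // standard matrix product
      return { x.a*y.a + x.b*y.c, x.a*y.b + x.b*y.d,
               x.c*y.a + x.d*y.c, x.c*y.b + x.d*y.d };
    }

    int main(void) {
      M2 u{0,1,0,0}, v{0,0,1,0};
      M2 uv = mul(u,v), vu = mul(v,u);
      assert(uv.a != vu.a);             // uv=[[1,0],[0,0]], vu=[[0,0],[0,1]]
      return 0;
    }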
 
@@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv)
     CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
   }
 
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
+    CartesianCommunicator::Hugepages = 1;
+  }
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
     Grid_debug_handler_init();
   }
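--shm-hugepages only raises the CartesianCommunicator::Hugepages flag here; per the MAP_HUGETLB message added further down, it is presumably consumed where the shared-memory comms buffers are mmapped. A minimal sketch of the effect such a flag has on a mapping (assumed allocation path; Linux-specific, and Grid's actual allocator may go through shm_open() instead):

    #include <sys/mman.h>
    #include <cstddef>

    void *AllocCommsBuffer(std::size_t bytes, bool hugepages) {
      int flags = MAP_SHARED | MAP_ANONYMOUS;
    #ifdef MAP_HUGETLB
      if (hugepages) flags |= MAP_HUGETLB; // needs a configured hugepage pool
    #endif
      void *p = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, flags, -1, 0);
      return (p == MAP_FAILED) ? nullptr : p;
    }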
@@ -304,6 +309,7 @@ void Grid_init(int *argc,char ***argv)
     std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
     std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;
     std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;
+    std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
@@ -317,7 +323,7 @@ void Grid_init(int *argc,char ***argv)
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time"<<std::endl;
     std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dir at a time"<<std::endl;
-    std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute"<<std::endl;
+    std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute"<<std::endl;
     std::cout<<GridLogMessage<<std::endl;
     std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
     std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
@@ -356,10 +362,15 @@ void Grid_init(int *argc,char ***argv)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
     CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
   }
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
   }
+  CartesianCommunicator::nCommThreads = -1;
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
+    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
+  }
   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
     GridCmdOptionIntVector(arg,LebesgueOrder::Block);
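--comms-threads n is parsed into CartesianCommunicator::nCommThreads; the default of -1 is treated by Communicate() as a single comm thread. It is intended to pair with the sequential policy, e.g. a hypothetical benchmark run: mpirun -n 16 ./Benchmark_dwf --comms-sequential --comms-threads 2.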
@@ -374,10 +385,13 @@ void Grid_init(int *argc,char ***argv)
                   Grid_default_latt,
                   Grid_default_mpi);
 
-  std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  if ( CartesianCommunicator::Hugepages) {
+    std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
+  }
+
   if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
-    std::cout<<GridLogMessage<<"Grid Decomposition\n";
+    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
     std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
     std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
     std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
@@ -393,7 +407,7 @@ void Grid_init(int *argc,char ***argv)
 
 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif
Some files were not shown because too many files have changed in this diff.