1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-21 17:22:03 +01:00

Compare commits

..

130 Commits

Author SHA1 Message Date
80359e0d49 Bland SYCL compile 2023-09-26 13:20:27 -07:00
3d437c5cc4 Making SYCL happy 2023-09-26 13:19:42 -07:00
b8a7004365 Partial fraction test 2023-08-14 15:17:03 -04:00
994512048e Merge pull request #439 from felixerben/bugfix/IRL_convergence
Bugfix/irl convergence
2023-07-12 16:32:26 -04:00
78bae9417c returning Nstop vectors even if not all meet true convergence criterion 2023-06-27 14:38:19 +01:00
dd170ead01 whitespace 2023-06-27 11:37:01 +01:00
014704856f do one more iteration if not all vectors converged 2023-06-27 11:33:30 +01:00
ee92e08edb Merge pull request #435 from fjosw/fix/warnings_in_WilsonKernelsImplementation
Unused variable in WilsonKernelsImplementation
2023-06-23 11:47:19 -04:00
c1dcee9328 Merge pull request #437 from fjosw/fix/stencil_debug
Added GridLogDebug to BuildSurfaceList debug message
2023-06-23 11:47:00 -04:00
6b150961fe Better script 2023-06-23 18:09:25 +03:00
5bafcaedfa Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-06-22 19:59:45 +03:00
bfeceae708 FTHMC 2023-06-22 12:58:18 -04:00
eacb66591f Config command 2023-06-22 19:56:40 +03:00
fadaa85626 Update 2023-06-22 19:56:27 +03:00
02a5b0d786 Updating run during testing 2023-06-22 19:52:46 +03:00
0e2141442a Dennis says broken 2023-06-22 19:19:51 +03:00
769eb0eecb Precision coverage 2023-06-22 19:19:20 +03:00
85e35c4da1 fix: added GridLogDebug to BuildSurfaceList debug message. 2023-06-16 10:31:16 +01:00
d72e914cf0 Profiling temporary code until optimised 2023-06-15 10:43:04 -04:00
3b5254e2d5 Optional checkpoint smeared configs for FTHMC 2023-06-15 10:43:04 -04:00
f1c358b596 Additional tests 2023-06-15 10:43:04 -04:00
c0ef210265 Hot start should be properly Hot 2023-06-15 10:43:04 -04:00
e3e1cc1962 Ta project 2023-06-15 10:43:04 -04:00
723eadbb5c Keep methods virtual 2023-06-15 10:43:04 -04:00
e24637ec1e Clean up 2023-06-15 10:43:04 -04:00
8b01ff4ce7 Integrator over to smeared force structure 2023-06-15 10:43:04 -04:00
588197c487 Smeared action virtual class 2023-06-15 10:43:04 -04:00
1352bad2e4 Sunspot compile 2023-06-15 11:22:46 +00:00
ffd7301649 Updated masked / fthmc smeared config container 2023-06-01 06:23:02 -04:00
d2a8494044 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-06-01 06:22:33 -04:00
0982e0d19b Jacobian action wrapper for FTHMC 2023-06-01 06:15:08 -04:00
3badbfc3c1 Refactor the Action and Smeared gauge configuration containers. Add first pass at FTHMC action 2023-06-01 06:14:28 -04:00
5465961e30 New test for FTHMC portion 2023-06-01 06:14:04 -04:00
477b794bc5 fix: unused variable removed. 2023-05-29 14:08:53 +01:00
4835fd1a87 HIP stream synch 2023-05-27 17:58:22 +03:00
6533c25814 Lumi 2023-05-27 16:13:32 +03:00
1b2914ec09 FT-HMC smearing, derivative chain rule, log det and force first pass. 2023-05-22 10:21:37 -04:00
519f795066 Header not liked by gcc on mac? puzzling 2023-05-22 10:21:12 -04:00
4240ad5ca8 Preparing for FTHMC 2023-05-19 21:21:55 -04:00
d418347d86 public for convenience to see rho params 2023-05-19 21:21:05 -04:00
29a4bfe5e5 Clean up 2023-05-19 21:20:45 -04:00
9955bf9daf Regresses to Qlat 2023-05-19 17:32:13 -04:00
876c8f4478 Nodes on padded cell 2023-05-11 12:35:49 -04:00
9c8750f261 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-05-11 12:29:09 -04:00
91efd08179 Option for Qlat generator basis 2023-05-11 12:27:45 -04:00
9953511b65 Mac compile 2023-05-11 12:27:29 -04:00
025fa9991a For FTHMC 2023-05-11 12:26:14 -04:00
e8c60c355b Padded cell code 2023-05-11 12:25:50 -04:00
6c9c7f9d85 Permute fix 2023-05-11 12:24:21 -04:00
f534523ede Debug 2023-05-11 12:23:11 -04:00
1b8a834beb Debug 2023-05-11 12:22:24 -04:00
3aa43e6065 Debug info 2023-04-20 14:21:13 -04:00
78ac4044ff HMC 2023-04-20 13:28:07 -04:00
119c3db47f Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-18 15:13:16 -04:00
21bbdb8fc2 Crusher 2023-04-18 15:11:16 -04:00
739bd7572c Example code 2023-04-17 21:51:55 +00:00
074627a5bd Pass file descriptors through AF_UNIX for level_zero 2023-04-17 21:50:52 +00:00
6a23b2c599 Drop UVM 2023-04-17 21:49:58 +00:00
bd891fb3f5 tests to compile 2023-04-12 18:32:44 -04:00
3984265851 Merge pull request #432 from paboyle/hotfix/nvcc-warnings
Unused statements generating warnings removed
2023-04-12 16:59:02 -04:00
45361d188f Merge pull request #427 from fjosw/feat/bug_report_issue_template
Feat/bug report issue template
2023-04-12 16:58:41 -04:00
80c9d77e02 Merge pull request #433 from paboyle/hotfix/virtual-dtor
Virtual destructor for LinearOperator
2023-04-12 16:56:18 -04:00
3aff64dddb Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-11 12:19:15 -07:00
b4f2ca81ff Copy queue and compute queue same as better concurrency 2023-04-11 12:18:21 -07:00
d1dea5f840 New driver 2023-04-11 12:16:52 -07:00
54f8b84d16 Fence 2023-04-11 12:16:08 -07:00
da503fef0e Name change on barrier routine 2023-04-11 12:14:04 -07:00
4a6802098a Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-04-07 15:43:28 -04:00
f9b41a84d2 Trajectory runs to completion on Crusher within wall clock time 2023-04-07 15:42:45 -04:00
5d7e0d18b9 virtual destructor for LinearOperator 2023-04-07 14:30:38 +01:00
9e64387933 mores unused statements removed 2023-04-07 14:27:18 +01:00
983b681d46 unused statement cleaning 2023-04-07 14:12:02 +01:00
4072408b6f Update README.md 2023-04-07 11:45:28 +01:00
bd76b47fbf Update CI badge in README 2023-04-07 11:44:48 +01:00
18ce23aa75 Fix NEON SIMD 2023-04-06 11:30:48 +01:00
ffa7fe0cc2 Merge branch 'feature/dirichlet' into develop 2023-04-04 23:13:52 -04:00
6b979f0a69 Dirichlet improvements that I failed to commit 2023-04-04 23:13:17 -04:00
86dac5ff4f Better printing 2023-04-04 07:42:19 -07:00
4a382fad3f Use distinct SYCL queue for copies 2023-04-04 07:41:41 -07:00
cc753670d9 Barrier elimination, surface list build 2023-04-04 07:39:14 -07:00
cc9d88ea1c Fence changes and EXT kernel loop cout reduction 2023-04-04 07:37:23 -07:00
b281b0166e Put the barrier in the subroutine 2023-04-04 07:36:03 -07:00
6a21f694ff Apply barrier in Gather kernel sequence.
Could place before comms, or in Gather, but decided to insist Gather means Gather is done
2023-04-04 07:33:24 -07:00
fc4db5e963 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-04-03 18:26:11 -04:00
6252ffaf76 No unified 2023-04-03 18:25:22 -04:00
af64c1c6b6 Had managed to drop the accelerator_barrier() in the Wilson Compressor gather 2023-03-30 17:34:44 -04:00
866f48391a Temporary fix for develop incorrect results 2023-03-30 17:10:13 -04:00
a4df527d74 Merge pull request #428 from mmphys/bugfix/comm_none
Fixes for --enable-comms=none
2023-03-30 08:38:14 -04:00
5764d21161 Fixes for --enable-comms=none 2023-03-30 10:15:28 +01:00
496d04cd85 Weaken the Fence 2023-03-29 18:58:51 -04:00
10e6d7c6ce Merge branch 'feature/dirichlet' into develop 2023-03-29 16:26:47 -04:00
c42e25e5b8 Dirichlet remove 2023-03-29 16:25:52 -04:00
a00ae981e0 Fence propagation from SYCL 2023-03-29 15:00:40 -04:00
58e020b62a Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-29 14:37:40 -04:00
a7e1aceeca Compile fix on Nvidia 2023-03-29 14:36:50 -04:00
7212432f43 More careful fencing 2023-03-28 20:10:22 -07:00
4a261fab30 Changes premerge to develop 2023-03-28 20:04:21 -07:00
6af97069b9 Preparing for close of feature/dirichlet
Initial code change review complete
2023-03-28 13:39:44 -07:00
5068413cdb Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 08:35:38 -07:00
71c6960eea Commet 2023-03-28 08:34:24 -07:00
ddf6d5c9e3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 11:33:05 -04:00
39214702f6 feat: indentation fixed. 2023-03-28 16:30:34 +02:00
3e4614c63a feat: draft for bug-report issue template added. 2023-03-28 16:24:35 +02:00
900e01f49b Temporary 2023-03-27 21:35:06 -07:00
2376156fbc Merge branch 'develop' into feature/dirichlet 2023-03-27 21:33:50 -07:00
3f2fd49db4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-03-27 17:29:54 -07:00
0efa107cb6 Script update 2023-03-27 17:29:43 -07:00
8feedb4f6f Include files moved 2023-03-27 17:29:21 -07:00
05e562e3d7 Move the copy synch out to stencil and do one per call instead of one per packet 2023-03-27 17:28:38 -07:00
dd3bbb8fa2 MOve the synchronise out to the stencil so one call instead of one call per packet 2023-03-27 17:27:45 -07:00
2fbcf13c46 SYCL fix 2023-03-27 14:25:14 -07:00
4ea48ef0c4 Merge pull request #419 from lehner/feature/gpt
Separate rankSum from sum
2023-03-24 15:42:16 -04:00
5c85774ee3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-24 15:40:57 -04:00
d8a9a745d8 stream synchronise 2023-03-24 15:40:30 -04:00
dcf172da3b Merge pull request #415 from paboyle/feature/block_lanczos22
Feature/block lanczos22
2023-03-24 12:08:16 -04:00
546be724e7 Merge pull request #421 from UniOfLeicester/feature/accel_Copy_plane
Populate the Cshift_table in the GPU
2023-03-24 12:04:06 -04:00
481bbaf1fc Interface to query memory use 2023-03-23 12:55:31 -04:00
281488611a WriteDiscard on construct 2023-03-23 10:28:50 -04:00
bae0f8ea99 Merge pull request #425 from rrhodgson/feature/CacheLogging
Huge Cache
2023-03-21 08:59:08 -04:00
bbbcd36ae5 Merge pull request #426 from rrhodgson/feature/LCDeflation
Batched Local Coherence Tools
2023-03-21 08:58:40 -04:00
a3e935c902 Batched block project/promote size checks 2023-02-27 11:38:16 +00:00
7731c7db8e Add huge cache type and allow Ncache==0 2023-02-26 14:15:28 +00:00
ff97340324 Expose cached bytes 2023-02-26 12:22:45 +00:00
920a51438d Added batched Mixed precision CG 2023-02-14 17:04:13 +00:00
be528b6d27 Add batched block project/promote functions 2023-02-14 14:37:10 +00:00
ccd21f96ff Plaquette agreeing and moving to final form (slowly) need to optimise 2023-02-01 22:57:44 -05:00
4b90cb8888 First cut passes combining padded cell with general stencil towards fast plaquette and staggered force 2023-02-01 22:14:10 -05:00
7d62f1d6d2 Populate the Cshift_table in the GPU
Cshift is allocated in Unified memory and used
in the LambdaApply kernels but also populated
from the host. This creates a lot of Unified HtoD
and DtoH mem operations and has a negative effect
in performance. With this commit we populate the
Cshift table in the device with the
populate_Cshift_table() kernel.
2023-01-11 21:26:25 +00:00
458c943987 merged upstream 2022-12-31 11:16:21 +02:00
88015b0858 Split sum in rankSum and GlobalSum 2022-12-26 10:01:32 +01:00
108 changed files with 4056 additions and 727 deletions

54
.github/ISSUE_TEMPLATE/bug-report.yml vendored Normal file
View File

@ -0,0 +1,54 @@
name: Bug report
description: Report a bug.
title: "<insert title>"
labels: [bug]
body:
- type: markdown
attributes:
value: >
Thank you for taking the time to file a bug report.
Please check that the code is pointing to the HEAD of develop
or any commit in master which is tagged with a version number.
- type: textarea
attributes:
label: "Describe the issue:"
description: >
Describe the issue and any previous attempt to solve it.
validations:
required: true
- type: textarea
attributes:
label: "Code example:"
description: >
If relevant, show how to reproduce the issue using a minimal working
example.
placeholder: |
<< your code here >>
render: shell
validations:
required: false
- type: textarea
attributes:
label: "Target platform:"
description: >
Give a description of the target platform (CPU, network, compiler).
Please give the full CPU part description, using for example
`cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
or `sysctl machdep.cpu.brand_string` (macOS) and the full output
the `--version` option of your compiler.
validations:
required: true
- type: textarea
attributes:
label: "Configure options:"
description: >
Please give the exact configure command used and attach
`config.log`, `grid.config.summary` and the output of `make V=1`.
render: shell
validations:
required: true

View File

@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>

View File

@ -542,6 +542,7 @@ public:
(*this)(in[i], out[i]);
}
}
virtual ~LinearFunction(){};
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {

View File

@ -191,7 +191,7 @@ public:
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

View File

@ -0,0 +1,213 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -166,16 +166,16 @@ public:
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChangeFast(ps_f[s],src_d);
precisionChange(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChangeFast(p_f,p_d);
precisionChange(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChangeFast(tmp_d,mmp_f);
precisionChange(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
@ -204,7 +204,7 @@ public:
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChangeFast(psi_f[s],psi_d[s]);
precisionChange(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
@ -225,7 +225,7 @@ public:
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(r_f, r_d);
precisionChange(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
@ -243,13 +243,13 @@ public:
cp=c;
PrecChangeTimer.Start();
precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
precisionChange(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(mmp_d, mmp_f); // From Float to Double
precisionChange(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
@ -311,7 +311,7 @@ public:
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChangeFast(psi_d[s],psi_f[s]);
precisionChange(psi_d[s],psi_f[s]);
}

View File

@ -211,7 +211,7 @@ public:
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
assert(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);

View File

@ -419,14 +419,15 @@ until convergence
}
}
if ( Nconv < Nstop )
if ( Nconv < Nstop ) {
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
}
eval=eval2;
//Keep only converged
eval.resize(Nconv);// Nstop?
evec.resize(Nconv,grid);// Nstop?
eval.resize(Nstop);// was Nconv
evec.resize(Nstop,grid);// was Nconv
basisSortInPlace(evec,eval,reverse);
}

View File

@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#define CpuHuge (1)
#define CpuSmall (2)
#define Acc (3)
#define AccHuge (4)
#define AccSmall (5)
#define Shared (6)
#define SharedHuge (7)
#define SharedSmall (8)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)
}
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
@ -170,6 +176,16 @@ void MemoryManager::Init(void)
}
}
str= getenv("GRID_ALLOC_NCACHE_HUGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuHuge]=Nc;
Ncache[AccHuge]=Nc;
Ncache[SharedHuge]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) {
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
#endif
#ifdef GRID_UVM
@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) {
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
if (ncache == 0) return ptr;
void * ret = NULL;
int v = -1;
@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
int cache;
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
else cache = type;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif

View File

@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
#define GRID_ALLOC_HUGE_LIMIT (2147483648)
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
@ -70,6 +71,21 @@ enum ViewMode {
CpuWriteDiscard = 0x10 // same for now
};
struct MemoryStatus {
uint64_t DeviceBytes;
uint64_t DeviceLRUBytes;
uint64_t DeviceMaxBytes;
uint64_t HostToDeviceBytes;
uint64_t DeviceToHostBytes;
uint64_t HostToDeviceXfer;
uint64_t DeviceToHostXfer;
uint64_t DeviceEvictions;
uint64_t DeviceDestroy;
uint64_t DeviceAllocCacheBytes;
uint64_t HostAllocCacheBytes;
};
class MemoryManager {
private:
@ -83,7 +99,7 @@ private:
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=6;
static const int NallocType=9;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
@ -121,7 +137,26 @@ private:
static uint64_t DeviceToHostXfer;
static uint64_t DeviceEvictions;
static uint64_t DeviceDestroy;
static uint64_t DeviceCacheBytes();
static uint64_t HostCacheBytes();
static MemoryStatus GetFootprint(void) {
MemoryStatus stat;
stat.DeviceBytes = DeviceBytes;
stat.DeviceLRUBytes = DeviceLRUBytes;
stat.DeviceMaxBytes = DeviceMaxBytes;
stat.HostToDeviceBytes = HostToDeviceBytes;
stat.DeviceToHostBytes = DeviceToHostBytes;
stat.HostToDeviceXfer = HostToDeviceXfer;
stat.DeviceToHostXfer = DeviceToHostXfer;
stat.DeviceEvictions = DeviceEvictions;
stat.DeviceDestroy = DeviceDestroy;
stat.DeviceAllocCacheBytes = DeviceCacheBytes();
stat.HostAllocCacheBytes = HostCacheBytes();
return stat;
};
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////

View File

@ -519,7 +519,6 @@ void MemoryManager::Audit(std::string s)
uint64_t LruBytes1=0;
uint64_t LruBytes2=0;
uint64_t LruCnt=0;
uint64_t LockedBytes=0;
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){

View File

@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
acceleratorCopySynchronise();
StencilBarrier();// Synch shared memory on a single nodes
int nreq=list.size();
if (nreq==0) return;

View File

@ -128,7 +128,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 2.0*bytes;
return xbytes+rbytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{

View File

@ -91,6 +91,59 @@ void *SharedMemory::ShmBufferSelf(void)
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
return ShmCommBufs[ShmRank];
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
NAMESPACE_END(Grid);

View File

@ -27,9 +27,10 @@ Author: Christoph Lehner <christoph@lhnr.de>
*************************************************************************************/
/* END LEGAL */
#define Mheader "SharedMemoryMpi: "
#include <Grid/GridCore.h>
#include <pwd.h>
#include <syscall.h>
#ifdef GRID_CUDA
#include <cuda_runtime_api.h>
@ -37,12 +38,120 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCl
#ifdef GRID_SYCL
#define GRID_SYCL_LEVEL_ZERO_IPC
#include <syscall.h>
#define SHM_SOCKETS
#endif
#include <sys/socket.h>
#include <sys/un.h>
NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
#ifdef SHM_SOCKETS
/*
* Barbaric extra intranode communication route in case we need sockets to pass FDs
* Forced by level_zero not being nicely designed
*/
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
class UnixSockets {
public:
static void Open(int rank)
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
unlink(sa_un.sun_path);
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
perror("bind failure");
exit(EXIT_FAILURE);
}
}
static int RecvFileDescriptor(void)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(sock, &msg, 0)) < 0) {
perror("recvmsg failed");
return -1;
}
if(n == 0){
perror("recvmsg returned 0");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
static void SendFileDescriptor(int fildes,int xmit_to_rank)
{
struct msghdr msg;
struct iovec iov;
struct cmsghdr *cmsg = NULL;
char ctrl[CMSG_SPACE(sizeof(int))];
char data = ' ';
memset(&msg, 0, sizeof(struct msghdr));
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
iov.iov_base = &data;
iov.iov_len = sizeof(data);
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
msg.msg_name = (void *)&sa_un;
msg.msg_namelen = sizeof(sa_un);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_controllen = CMSG_SPACE(sizeof(int));
msg.msg_control = ctrl;
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*((int *) CMSG_DATA(cmsg)) = fildes;
sendmsg(sock, &msg, 0);
};
};
#endif
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
@ -65,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
if ( WorldRank == 0) {
std::cout << header " World communicator of size " <<WorldSize << std::endl;
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
std::cout << Mheader " Node communicator of size " <<WorldShmSize << std::endl;
}
// WorldShmComm, WorldShmSize, WorldShmRank
@ -169,59 +278,7 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
}
static inline int divides(int a,int b)
{
return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now
////////////////////////////////////////////////////////////////
int ndimension = WorldDims.size();
ShmDims=Coordinate(ndimension,1);
std::vector<int> primes({2,3,5});
int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) {
int p;
for(p=0;p<primes.size();p++) {
int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime;
ShmDims[dim]*=prime;
last_dim = dim;
break;
}
}
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension;
}
}
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{
////////////////////////////////////////////////////////////////
@ -395,7 +452,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
@ -480,7 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
exit(EXIT_FAILURE);
}
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
@ -523,7 +580,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
@ -531,8 +588,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef SHM_SOCKETS
UnixSockets::Open(WorldShmRank);
#endif
for(int r=0;r<WorldShmSize;r++){
MPI_Barrier(WorldShmComm);
#ifndef GRID_MPI3_SHM_NONE
//////////////////////////////////////////////////
// If it is me, pass around the IPC access key
@ -540,24 +602,32 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; } clone_mem_t;
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
#ifdef SHM_SOCKETS
for(int rr=0;rr<WorldShmSize;rr++){
if(rr!=r){
UnixSockets::SendFileDescriptor(handle.fd,rr);
}
}
#endif
}
#endif
#ifdef GRID_CUDA
@ -585,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Share this IPC handle across the Shm Comm
//////////////////////////////////////////////////
{
MPI_Barrier(WorldShmComm);
int ierr=MPI_Bcast(&handle,
sizeof(handle),
MPI_BYTE,
@ -600,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
int myfd;
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
@ -607,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
int myfd = syscall(438,pidfd,handle.fd,0);
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
if (myfd < 0) {
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
perror("pidfd_getfd failed ");
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@ -651,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#else
WorldShmCommBufs[r] = ShmCommBuf;
#endif
MPI_Barrier(WorldShmComm);
}
_ShmAllocBytes=bytes;
@ -662,7 +744,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -699,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
@ -709,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -756,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);

View File

@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
}
}
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
template <typename T>
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
template <typename T>
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx >= e1*e2) return;
int n, b, o;
n = idx / e2;
b = idx % e2;
o = n*stride + b;
vector[2*idx + 0] = lo + o;
vector[2*idx + 1] = ro + o;
}
#endif
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
int ent=0;
if(cbmask == 0x3 ){
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride+b;
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
}
}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
int ent=0;
if ( cbmask == 0x3 ) {
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int o =n*stride;
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
}}
#endif
} else {
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){

View File

@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
{
Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
typename vobj::scalar_object ssum;
autoView( arg_v, arg, AcceleratorRead);
ssum= sum_gpu(&arg_v[0],osites);
return sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
auto ssum= sum_cpu(&arg_v[0],osites);
return sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto ssum = rankSum(arg);
arg.Grid()->GlobalSum(ssum);
return ssum;
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu_large(&arg_v[0],osites);
return sum_gpu_large(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
return sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
auto ssum = rankSumLarge(arg);
arg.Grid()->GlobalSum(ssum);
return ssum;
}

View File

@ -211,25 +211,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
assert(ok);
Integer smemSize = numThreads * sizeof(sobj);
// UVM seems to be buggy under later CUDA drivers
// This fails on A100 and driver 5.30.02 / CUDA 12.1
// Fails with multiple NVCC versions back to 11.4,
// which worked with earlier drivers.
// Not sure which driver had first fail and this bears checking
// Is awkward as must install multiple driver versions
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
#undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
result = *buffer_v;
#endif

View File

@ -440,17 +440,8 @@ public:
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
#if 1
assert(rank == _grid->ThisRank() );
#else
//
if (rank != _grid->ThisRank() ){
std::cout <<"rank "<<rank<<" _grid->ThisRank() "<<_grid->ThisRank()<< std::endl;
// exit(-42);
// assert(0);
}
#endif
assert(rank == _grid->ThisRank() );
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;

View File

@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
}
}
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
Lattice<iScalar<CComplex>> ip(coarse);
std::vector<Lattice<vobj>> fineDataCopy = fineData;
autoView(ip_, ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
for (int k=0; k<NBatch; k++) {
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
}
}
}
template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
#endif
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
for (int k=0; k<NBatch; k++)
fineData[k]=Zero();
for (int i=0;i<nbasis;i++) {
for (int k=0; k<NBatch; k++) {
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
}
}
}
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj>
@ -658,9 +707,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride;
autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, AcceleratorRead);
accelerator_for(idx,Fg->lSites(),1,{
autoView( t_v , To, CpuWrite);
autoView( f_v , From, CpuRead);
thread_for(idx,Fg->lSites(),{
sobj s;
Coordinate Fcoor(nd);
Coordinate Tcoor(nd);
@ -673,15 +722,20 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
}
if (in_region) {
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
vector_type * fp = (vector_type *)&f_v[odx_f];
vector_type * tp = (vector_type *)&t_v[odx_t];
#if 0
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t];
for(int w=0;w<words;w++){
tp[w].putlane(fp[w].getlane(idx_f),idx_t);
}
#else
peekLocalSite(s,f_v,Fcoor);
pokeLocalSite(s,t_v,Tcoor);
#endif
}
});
}
@ -792,9 +846,9 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
for(int d=0;d<nh;d++){
if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
}
}
// the above should guarantee that the operations are local

136
Grid/lattice/PaddedCell.h Normal file
View File

@ -0,0 +1,136 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/PaddedCell.h
Copyright (C) 2019
Author: Peter Boyle pboyle@bnl.gov
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
class PaddedCell {
public:
GridCartesian * unpadded_grid;
int dims;
int depth;
std::vector<GridCartesian *> grids;
~PaddedCell()
{
DeleteGrids();
}
PaddedCell(int _depth,GridCartesian *_grid)
{
unpadded_grid = _grid;
depth=_depth;
dims=_grid->Nd();
AllocateGrids();
Coordinate local =unpadded_grid->LocalDimensions();
for(int d=0;d<dims;d++){
assert(local[d]>=depth);
}
}
void DeleteGrids(void)
{
for(int d=0;d<grids.size();d++){
delete grids[d];
}
grids.resize(0);
};
void AllocateGrids(void)
{
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate simd =unpadded_grid->_simd_layout;
Coordinate processors=unpadded_grid->_processors;
Coordinate plocal =unpadded_grid->LocalDimensions();
Coordinate global(dims);
// expand up one dim at a time
for(int d=0;d<dims;d++){
plocal[d] += 2*depth;
for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d];
}
grids.push_back(new GridCartesian(global,simd,processors));
}
};
template<class vobj>
inline Lattice<vobj> Extract(Lattice<vobj> &in)
{
Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate fll(dims,depth); // depends on the MPI spread
Coordinate tll(dims,0); // depends on the MPI spread
localCopyRegion(in,out,fll,tll,local);
return out;
}
template<class vobj>
inline Lattice<vobj> Exchange(Lattice<vobj> &in)
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = Expand(d,tmp); // rvalue && assignment
}
return tmp;
}
// expand up one dim at a time
template<class vobj>
inline Lattice<vobj> Expand(int dim,Lattice<vobj> &in)
{
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
// Middle bit
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
// High bit
shifted = Cshift(in,dim,depth);
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
// Low bit
shifted = Cshift(in,dim,-depth);
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
return padded;
}
};
NAMESPACE_END(Grid);

View File

@ -104,6 +104,7 @@ template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iSca
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iLorentzComplex = iVector<iScalar<iScalar<vtype> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
@ -178,6 +179,15 @@ typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
// LorentzComplex
typedef iLorentzComplex<Complex > LorentzComplex;
typedef iLorentzComplex<ComplexF > LorentzComplexF;
typedef iLorentzComplex<ComplexD > LorentzComplexD;
typedef iLorentzComplex<vComplex > vLorentzComplex;
typedef iLorentzComplex<vComplexF> vLorentzComplexF;
typedef iLorentzComplex<vComplexD> vLorentzComplexD;
// DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
@ -307,6 +317,10 @@ typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
typedef Lattice<vLorentzComplex> LatticeLorentzComplex;
typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
// DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;

View File

@ -34,10 +34,24 @@ directory
NAMESPACE_BEGIN(Grid);
///////////////////////////////////
// Smart configuration base class
///////////////////////////////////
template< class Field >
class ConfigurationBase
{
public:
ConfigurationBase() {}
virtual ~ConfigurationBase() {}
virtual void set_Field(Field& U) =0;
virtual void smeared_force(Field&) = 0;
virtual Field& get_SmearedU() =0;
virtual Field &get_U(bool smeared = false) = 0;
};
template <class GaugeField >
class Action
{
public:
bool is_smeared = false;
RealD deriv_norm_sum;
@ -77,11 +91,39 @@ public:
void refresh_timer_stop(void) { refresh_us+=usecond(); }
void S_timer_start(void) { S_us-=usecond(); }
void S_timer_stop(void) { S_us+=usecond(); }
/////////////////////////////
// Heatbath?
/////////////////////////////
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
/////////////////////////////////////////////////////////////
// virtual smeared interface through configuration container
/////////////////////////////////////////////////////////////
virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
{
refresh(U.get_U(is_smeared),sRNG,pRNG);
}
virtual RealD S(ConfigurationBase<GaugeField>& U)
{
return S(U.get_U(is_smeared));
}
virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
{
return Sinitial(U.get_U(is_smeared));
}
virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
{
deriv(U.get_U(is_smeared),dSdU);
if ( is_smeared ) {
U.smeared_force(dSdU);
}
}
///////////////////////////////
// Logging
///////////////////////////////
virtual std::string action_name() = 0; // return the action name
virtual std::string LogParameters() = 0; // prints action parameters
virtual ~Action(){}

View File

@ -30,6 +30,8 @@ directory
#ifndef QCD_ACTION_CORE
#define QCD_ACTION_CORE
#include <Grid/qcd/action/gauge/GaugeImplementations.h>
#include <Grid/qcd/action/ActionBase.h>
NAMESPACE_CHECK(ActionBase);
#include <Grid/qcd/action/ActionSet.h>

View File

@ -507,6 +507,7 @@ public:
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
}
};

View File

@ -332,8 +332,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
/////////////////////////////
{
GRID_TRACE("Gather");
st.HaloExchangeOptGather(in,compressor);
accelerator_barrier();
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
}
std::vector<std::vector<CommsRequest_t> > requests;

View File

@ -423,14 +423,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
#define KERNEL_CALL_EXT(A) \
const uint64_t NN = Nsite*Ls; \
const uint64_t sz = st.surface_list.size(); \
auto ptr = &st.surface_list[0]; \
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
int sF = ptr[ss]; \
int sU = ss/Ls; \
int sU = sF/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
});
}); \
accelerator_barrier();
#define ASM_CALL(A) \
thread_for( sss, Nsite, { \
@ -463,11 +463,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifdef SYCL_HACK
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
#else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#endif
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif
@ -478,8 +474,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
// dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
#endif
@ -502,21 +500,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif
acceleratorFenceComputeStream();
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif
} else if( exterior ) {
// Dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt); return;}
#ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif
acceleratorFenceComputeStream();
}
assert(0 && " Kernel optimisation case not covered ");
}

View File

@ -1 +0,0 @@
../CayleyFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../MobiusEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonCloverFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonTMFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
#define IMPLEMENTATION WilsonImplD2

View File

@ -1 +0,0 @@
../CayleyFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../MobiusEOFAFermionInstantiation.cc.master

View File

@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonFermion5DInstantiation.cc.master

View File

@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master

View File

@ -1 +0,0 @@
#define IMPLEMENTATION ZWilsonImplD2

View File

@ -38,91 +38,73 @@ NAMESPACE_BEGIN(Grid);
// cf. GeneralEvenOddRational.h for details
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class ImplD, class ImplF, class ImplD2>
template<class ImplD, class ImplF>
class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
private:
typedef typename ImplD2::FermionField FermionFieldD2;
typedef typename ImplD::FermionField FermionFieldD;
typedef typename ImplF::FermionField FermionFieldF;
FermionOperator<ImplD> & NumOpD;
FermionOperator<ImplD> & DenOpD;
FermionOperator<ImplD2> & NumOpD2;
FermionOperator<ImplD2> & DenOpD2;
FermionOperator<ImplF> & NumOpF;
FermionOperator<ImplF> & DenOpF;
Integer ReliableUpdateFreq;
protected:
//Action evaluation
//Allow derived classes to override the multishift CG
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
#if 0
#if 1
SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
msCG(schurOp,in, out);
#else
SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
FermionFieldD inD(NumOpD.FermionRedBlackGrid());
FermionFieldD outD(NumOpD.FermionRedBlackGrid());
// Action better with higher precision?
ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
precisionChange(inD2,in);
std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
msCG(schurOpD2, inD2, outD2);
precisionChange(out,outD2);
ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
msCG(schurOpD, in, out);
#endif
}
//Force evaluation
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
SchurDifferentiableOperator<ImplF> schurOpF (numerator ? NumOpF : DenOpF);
SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
precisionChange(inD2,in);
std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
msCG(schurOpD2, inD2, out_elemsD2, outD2);
precisionChange(out,outD2);
for(int i=0;i<out_elems.size();i++){
precisionChange(out_elems[i],out_elemsD2[i]);
}
FermionFieldD inD(NumOpD.FermionRedBlackGrid());
FermionFieldD outD(NumOpD.FermionRedBlackGrid());
std::vector<FermionFieldD> out_elemsD(out_elems.size(),NumOpD.FermionRedBlackGrid());
ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
msCG(schurOpD, in, out_elems, out);
}
//Allow derived classes to override the gauge import
virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
typename ImplD2::GaugeField Ud2(NumOpD2.GaugeGrid());
precisionChange(Uf, Ud);
precisionChange(Ud2, Ud);
std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl;
std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " <<std::endl;
NumOpD.ImportGauge(Ud);
DenOpD.ImportGauge(Ud);
NumOpF.ImportGauge(Uf);
DenOpF.ImportGauge(Uf);
NumOpD2.ImportGauge(Ud2);
DenOpD2.ImportGauge(Ud2);
}
public:
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD> &_NumOpD, FermionOperator<ImplD> &_DenOpD,
FermionOperator<ImplF> &_NumOpF, FermionOperator<ImplF> &_DenOpF,
FermionOperator<ImplD2> &_NumOpD2, FermionOperator<ImplD2> &_DenOpD2,
const RationalActionParams & p, Integer _ReliableUpdateFreq
) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
ReliableUpdateFreq(_ReliableUpdateFreq),
NumOpD(_NumOpD), DenOpD(_DenOpD),
NumOpF(_NumOpF), DenOpF(_DenOpF),
NumOpD2(_NumOpD2), DenOpD2(_DenOpD2)
NumOpF(_NumOpF), DenOpF(_DenOpF)
{}
virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}

View File

@ -67,9 +67,9 @@ NAMESPACE_BEGIN(Grid);
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
};
template<class Impl,class ImplF,class ImplD2>
template<class Impl,class ImplF>
class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction
: public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2> {
: public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
public:
typedef OneFlavourRationalParams Params;
private:
@ -91,11 +91,9 @@ NAMESPACE_BEGIN(Grid);
FermionOperator<Impl> &_DenOp,
FermionOperator<ImplF> &_NumOpF,
FermionOperator<ImplF> &_DenOpF,
FermionOperator<ImplD2> &_NumOpD2,
FermionOperator<ImplD2> &_DenOpD2,
const Params & p, Integer ReliableUpdateFreq
) :
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2>(_NumOp, _DenOp,_NumOpF, _DenOpF,_NumOpD2, _DenOpD2, transcribe(p),ReliableUpdateFreq){}
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){}
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
};

View File

@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid);
// NumOp == V
// DenOp == M
//
AUDIT();
FermionField etaOdd (NumOp.FermionRedBlackGrid());
FermionField etaEven(NumOp.FermionRedBlackGrid());
FermionField tmp (NumOp.FermionRedBlackGrid());
AUDIT();
pickCheckerboard(Even,etaEven,eta);
AUDIT();
pickCheckerboard(Odd,etaOdd,eta);
AUDIT();
NumOp.ImportGauge(U);
AUDIT();
DenOp.ImportGauge(U);
std::cout << " TwoFlavourRefresh: Imported gauge "<<std::endl;
AUDIT();
SchurDifferentiableOperator<Impl> Mpc(DenOp);
AUDIT();
SchurDifferentiableOperator<Impl> Vpc(NumOp);
AUDIT();
std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl;
AUDIT();
// Odd det factors
Mpc.MpcDag(etaOdd,PhiOdd);
AUDIT();
std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl;
tmp=Zero();
AUDIT();
std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl;
AUDIT();
HeatbathSolver(Vpc,PhiOdd,tmp);
AUDIT();
std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl;
Vpc.Mpc(tmp,PhiOdd);
std::cout << " TwoFlavourRefresh: Mpc "<<std::endl;
@ -220,20 +207,27 @@ NAMESPACE_BEGIN(Grid);
//X = (Mdag M)^-1 V^dag phi
//Y = (Mdag)^-1 V^dag phi
Vpc.MpcDag(PhiOdd,Y); // Y= Vdag phi
std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
X=Zero();
DerivativeSolver(Mpc,Y,X); // X= (MdagM)^-1 Vdag phi
std::cout << GridLogMessage <<" X "<<norm2(X)<<std::endl;
Mpc.Mpc(X,Y); // Y= Mdag^-1 Vdag phi
std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
// phi^dag V (Mdag M)^-1 dV^dag phi
Vpc.MpcDagDeriv(force , X, PhiOdd ); dSdU = force;
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
// phi^dag dV (Mdag M)^-1 V^dag phi
Vpc.MpcDeriv(force , PhiOdd, X ); dSdU = dSdU+force;
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
// - phi^dag V (Mdag M)^-1 Mdag dM (Mdag M)^-1 V^dag phi
// - phi^dag V (Mdag M)^-1 dMdag M (Mdag M)^-1 V^dag phi
Mpc.MpcDeriv(force,Y,X); dSdU = dSdU-force;
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
Mpc.MpcDagDeriv(force,X,Y); dSdU = dSdU-force;
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
// FIXME No force contribution from EvenEven assumed here
// Needs a fix for clover.

View File

@ -283,12 +283,13 @@ public:
std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
TheIntegrator.print_timer();
TheIntegrator.Smearer.set_Field(Ucur);
for (int obs = 0; obs < Observables.size(); obs++) {
std::cout << GridLogDebug << "Observables # " << obs << std::endl;
std::cout << GridLogDebug << "Observables total " << Observables.size() << std::endl;
std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
Observables[obs]->TrajectoryComplete(traj + 1, TheIntegrator.Smearer, sRNG, pRNG);
}
std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
}

View File

@ -35,13 +35,16 @@ class CheckpointerParameters : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters,
std::string, config_prefix,
std::string, smeared_prefix,
std::string, rng_prefix,
int, saveInterval,
bool, saveSmeared,
std::string, format, );
CheckpointerParameters(std::string cf = "cfg", std::string rn = "rng",
CheckpointerParameters(std::string cf = "cfg", std::string sf="cfg_smr" , std::string rn = "rng",
int savemodulo = 1, const std::string &f = "IEEE64BIG")
: config_prefix(cf),
smeared_prefix(sf),
rng_prefix(rn),
saveInterval(savemodulo),
format(f){};
@ -61,13 +64,21 @@ template <class Impl>
class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
public:
void build_filenames(int traj, CheckpointerParameters &Params,
std::string &conf_file, std::string &rng_file) {
std::string &conf_file,
std::string &smear_file,
std::string &rng_file) {
{
std::ostringstream os;
os << Params.rng_prefix << "." << traj;
rng_file = os.str();
}
{
std::ostringstream os;
os << Params.smeared_prefix << "." << traj;
smear_file = os.str();
}
{
std::ostringstream os;
os << Params.config_prefix << "." << traj;
@ -84,6 +95,11 @@ public:
}
virtual void initialize(const CheckpointerParameters &Params) = 0;
virtual void TrajectoryComplete(int traj,
typename Impl::Field &U,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) { assert(0); } ; // HMC should pass the smart config with smeared and unsmeared
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) = 0;

View File

@ -61,11 +61,14 @@ public:
fout.close();
}
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
void TrajectoryComplete(int traj,
ConfigurationBase<Field> &SmartConfig,
GridSerialRNG &sRNG, GridParallelRNG &pRNG)
{
if ((traj % Params.saveInterval) == 0) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
uint32_t nersc_csum;
uint32_t scidac_csuma;
@ -74,9 +77,15 @@ public:
BinarySimpleUnmunger<sobj_double, sobj> munge;
truncate(rng);
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
truncate(config);
std::cout << GridLogMessage << "Written Binary RNG " << rng
<< " checksum " << std::hex
<< nersc_csum <<"/"
<< scidac_csuma <<"/"
<< scidac_csumb
<< std::dec << std::endl;
BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
truncate(config);
BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(false), config, munge, 0, Params.format,
nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "Written Binary Configuration " << config
@ -85,6 +94,18 @@ public:
<< scidac_csuma <<"/"
<< scidac_csumb
<< std::dec << std::endl;
if ( Params.saveSmeared ) {
truncate(smr);
BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(true), smr, munge, 0, Params.format,
nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "Written Binary Smeared Configuration " << smr
<< " checksum " << std::hex
<< nersc_csum <<"/"
<< scidac_csuma <<"/"
<< scidac_csumb
<< std::dec << std::endl;
}
}
};

View File

@ -69,17 +69,27 @@ public:
}
}
void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
void TrajectoryComplete(int traj,
ConfigurationBase<GaugeField> &SmartConfig,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng;
std::string config, rng, smr;
this->build_filenames(traj, Params, config, rng);
GridBase *grid = U.Grid();
GridBase *grid = SmartConfig.get_U(false).Grid();
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "Written BINARY RNG " << rng
<< " checksum " << std::hex
<< nersc_csum<<"/"
<< scidac_csuma<<"/"
<< scidac_csumb
<< std::dec << std::endl;
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(config);
_IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(false), traj, config, config);
_IldgWriter.close();
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
@ -88,6 +98,21 @@ public:
<< scidac_csuma<<"/"
<< scidac_csumb
<< std::dec << std::endl;
if ( Params.saveSmeared ) {
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(smr);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
_IldgWriter.close();
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
<< " checksum " << std::hex
<< nersc_csum<<"/"
<< scidac_csuma<<"/"
<< scidac_csumb
<< std::dec << std::endl;
}
}
};

View File

@ -52,23 +52,29 @@ public:
Params.format = "IEEE64BIG"; // fixed, overwrite any other choice
}
void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
virtual void TrajectoryComplete(int traj,
ConfigurationBase<GaugeField> &SmartConfig,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG)
{
if ((traj % Params.saveInterval) == 0) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
int precision32 = 1;
int tworow = 0;
NerscIO::writeRNGState(sRNG, pRNG, rng);
NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(false), config, tworow, precision32);
if ( Params.saveSmeared ) {
NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(true), smr, tworow, precision32);
}
}
};
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng );
this->check_filename(rng);
this->check_filename(config);

View File

@ -70,19 +70,37 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
}
}
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
void TrajectoryComplete(int traj,
ConfigurationBase<Field> &SmartConfig,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
GridBase *grid = U.Grid();
std::string config, rng,smr;
this->build_filenames(traj, Params, config, smr, rng);
GridBase *grid = SmartConfig.get_U(false).Grid();
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
ScidacWriter _ScidacWriter(grid->IsBoss());
_ScidacWriter.open(config);
_ScidacWriter.writeScidacFieldRecord(U, MData);
_ScidacWriter.close();
std::cout << GridLogMessage << "Written Binary RNG " << rng
<< " checksum " << std::hex
<< nersc_csum <<"/"
<< scidac_csuma <<"/"
<< scidac_csumb
<< std::dec << std::endl;
{
ScidacWriter _ScidacWriter(grid->IsBoss());
_ScidacWriter.open(config);
_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(false), MData);
_ScidacWriter.close();
}
if ( Params.saveSmeared ) {
ScidacWriter _ScidacWriter(grid->IsBoss());
_ScidacWriter.open(smr);
_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(true), MData);
_ScidacWriter.close();
}
std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl;
}
};

View File

@ -66,6 +66,7 @@ public:
template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy>
class Integrator {
protected:
public:
typedef FieldImplementation_ FieldImplementation;
typedef typename FieldImplementation::Field MomentaField; //for readability
typedef typename FieldImplementation::Field Field;
@ -96,7 +97,6 @@ protected:
{
t_P[level] += ep;
update_P(P, U, level, ep);
std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
}
@ -130,30 +130,20 @@ protected:
Field force(U.Grid());
conformable(U.Grid(), Mom.Grid());
Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
double start_force = usecond();
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
AUDIT();
as[level].actions.at(a)->deriv_timer_start();
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta
as[level].actions.at(a)->deriv_timer_stop();
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
AUDIT();
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
auto name = as[level].actions.at(a)->action_name();
if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
force = FieldImplementation::projectForce(force); // Ta for gauge fields
double end_force = usecond();
// DumpSliceNorm("force ",force,Nd-1);
MomFilter->applyFilter(force);
std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<< std::endl;
DumpSliceNorm("force filtered ",force,Nd-1);
Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
@ -379,15 +369,10 @@ public:
auto name = as[level].actions.at(actionID)->action_name();
std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
AUDIT();
as[level].actions.at(actionID)->refresh_timer_start();
as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
as[level].actions.at(actionID)->refresh(Smearer, sRNG, pRNG);
as[level].actions.at(actionID)->refresh_timer_stop();
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
AUDIT();
}
// Refresh the higher representation actions
@ -424,17 +409,16 @@ public:
// Actions
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
AUDIT();
// get gauge field from the SmearingPolicy and
// based on the boolean is_smeared in actionID
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
as[level].actions.at(actionID)->S_timer_start();
Hterm = as[level].actions.at(actionID)->S(Us);
Hterm = as[level].actions.at(actionID)->S(Smearer);
as[level].actions.at(actionID)->S_timer_stop();
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
H += Hterm;
AUDIT();
}
as[level].apply(S_hireps, Representations, level, H);
}
@ -447,9 +431,9 @@ public:
void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) {
for (int a = 0; a < repr_set.size(); ++a) {
AUDIT();
RealD Hterm = repr_set.at(a)->Sinitial(Rep.U);
AUDIT();
std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl;
H += Hterm;
@ -471,13 +455,12 @@ public:
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
// get gauge field from the SmearingPolicy and
// based on the boolean is_smeared in actionID
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
as[level].actions.at(actionID)->S_timer_start();
AUDIT();
Hterm = as[level].actions.at(actionID)->Sinitial(Us);
as[level].actions.at(actionID)->S_timer_stop();
AUDIT();
as[level].actions.at(actionID)->S_timer_start();
Hterm = as[level].actions.at(actionID)->S(Smearer);
as[level].actions.at(actionID)->S_timer_stop();
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
H += Hterm;
}
@ -490,7 +473,6 @@ public:
void integrate(Field& U)
{
AUDIT();
// reset the clocks
t_U = 0;
for (int level = 0; level < as.size(); ++level) {
@ -508,10 +490,8 @@ public:
assert(fabs(t_U - t_P[level]) < 1.0e-6); // must be the same
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
}
AUDIT();
FieldImplementation::Project(U);
AUDIT();
// and that we indeed got to the end of the trajectory
assert(fabs(t_U - Params.trajL) < 1.0e-6);

View File

@ -34,6 +34,13 @@ NAMESPACE_BEGIN(Grid);
template <class Field>
class HmcObservable {
public:
virtual void TrajectoryComplete(int traj,
ConfigurationBase<Field> &SmartConfig,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG)
{
TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
};
virtual void TrajectoryComplete(int traj,
Field &U,
GridSerialRNG &sRNG,

View File

@ -42,6 +42,18 @@ public:
// necessary for HmcObservable compatibility
typedef typename Impl::Field Field;
virtual void TrajectoryComplete(int traj,
ConfigurationBase<Field> &SmartConfig,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG)
{
std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << "Unsmeared plaquette"<<std::endl;
TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
std::cout << GridLogMessage << "Smeared plaquette"<<std::endl;
TrajectoryComplete(traj,SmartConfig.get_U(true),sRNG,pRNG); // Unsmeared observable
std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
};
void TrajectoryComplete(int traj,
Field &U,
GridSerialRNG &sRNG,

View File

@ -7,26 +7,27 @@
NAMESPACE_BEGIN(Grid);
//trivial class for no smearing
template< class Impl >
class NoSmearing
class NoSmearing : public ConfigurationBase<typename Impl::Field>
{
public:
INHERIT_FIELD_TYPES(Impl);
Field* ThinField;
Field* ThinLinks;
NoSmearing(): ThinField(NULL) {}
NoSmearing(): ThinLinks(NULL) {}
void set_Field(Field& U) { ThinField = &U; }
virtual void set_Field(Field& U) { ThinLinks = &U; }
void smeared_force(Field&) const {}
virtual void smeared_force(Field&) {}
Field& get_SmearedU() { return *ThinField; }
virtual Field& get_SmearedU() { return *ThinLinks; }
Field &get_U(bool smeared = false)
virtual Field &get_U(bool smeared = false)
{
return *ThinField;
return *ThinLinks;
}
};
@ -42,19 +43,24 @@ public:
It stores a list of smeared configurations.
*/
template <class Gimpl>
class SmearedConfiguration
class SmearedConfiguration : public ConfigurationBase<typename Gimpl::Field>
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
private:
protected:
const unsigned int smearingLevels;
Smear_Stout<Gimpl> *StoutSmearing;
std::vector<GaugeField> SmearedSet;
public:
GaugeField* ThinLinks; /* Pointer to the thin links configuration */ // move to base???
protected:
// Member functions
//====================================================================
void fill_smearedSet(GaugeField &U)
// Overridden in masked version
virtual void fill_smearedSet(GaugeField &U)
{
ThinLinks = &U; // attach the smearing routine to the field U
@ -82,9 +88,10 @@ private:
}
}
}
//====================================================================
GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
const GaugeField& GaugeK) const
//overridden in masked verson
virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
const GaugeField& GaugeK) const
{
GridBase* grid = GaugeK.Grid();
GaugeField C(grid), SigmaK(grid), iLambda(grid);
@ -213,8 +220,6 @@ private:
//====================================================================
public:
GaugeField*
ThinLinks; /* Pointer to the thin links configuration */
/* Standard constructor */
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
@ -230,7 +235,7 @@ public:
: smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}
// attach the smeared routines to the thin links U and fill the smeared set
void set_Field(GaugeField &U)
virtual void set_Field(GaugeField &U)
{
double start = usecond();
fill_smearedSet(U);
@ -240,7 +245,7 @@ public:
}
//====================================================================
void smeared_force(GaugeField &SigmaTilde) const
virtual void smeared_force(GaugeField &SigmaTilde)
{
if (smearingLevels > 0)
{
@ -267,14 +272,16 @@ public:
}
double end = usecond();
double time = (end - start)/ 1e3;
std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;
std::cout << GridLogMessage << " GaugeConfiguration: Smeared Force chain rule took " << time << " ms" << std::endl;
} // if smearingLevels = 0 do nothing
SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
}
//====================================================================
GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
virtual GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
GaugeField &get_U(bool smeared = false)
virtual GaugeField &get_U(bool smeared = false)
{
// get the config, thin links by default
if (smeared)

View File

@ -0,0 +1,813 @@
/*!
@file GaugeConfiguration.h
@brief Declares the GaugeConfiguration class
*/
#pragma once
NAMESPACE_BEGIN(Grid);
/*!
@brief Smeared configuration masked container
Modified for a multi-subset smearing (aka Luscher Flowed HMC)
*/
template <class Gimpl>
class SmearedConfigurationMasked : public SmearedConfiguration<Gimpl>
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
private:
// These live in base class
// const unsigned int smearingLevels;
// Smear_Stout<Gimpl> *StoutSmearing;
// std::vector<GaugeField> SmearedSet;
std::vector<LatticeLorentzComplex> masks;
typedef typename SU3Adjoint::AMatrix AdjMatrix;
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
typedef typename SU3Adjoint::LatticeAdjVector AdjVectorField;
// Adjoint vector to GaugeField force
void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu)
{
Complex ci(0,1);
GaugeLinkField Fdet_pol(Fdet.Grid());
Fdet_pol=Zero();
for(int e=0;e<8;e++){
ColourMatrix te;
SU3::generator(e, te);
auto tmp=peekColour(Fdet_nu,e);
Fdet_pol=Fdet_pol + ci*tmp*te; // but norm of te is different.. why?
}
pokeLorentz(Fdet, Fdet_pol, nu);
}
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
{
GaugeLinkField UtaU(PlaqL.Grid());
GaugeLinkField D(PlaqL.Grid());
AdjMatrixField Dbc(PlaqL.Grid());
LatticeComplex tmp(PlaqL.Grid());
const int Ngen = SU3Adjoint::Dimension;
Complex ci(0,1);
ColourMatrix ta,tb,tc;
for(int a=0;a<Ngen;a++) {
SU3::generator(a, ta);
// Qlat Tb = 2i Tb^Grid
UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
for(int c=0;c<Ngen;c++) {
SU3::generator(c, tc);
D = Ta( (2.0)*ci*tc *UtaU);
for(int b=0;b<Ngen;b++){
SU3::generator(b, tb);
tmp =-trace(ci*tb*D);
PokeIndex<ColourIndex>(Dbc,tmp,b,c); // Adjoint rep
}
}
tmp = trace(MpInvJx * Dbc);
PokeIndex<ColourIndex>(Fdet2,tmp,a);
}
}
void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd)
{
GaugeLinkField Nx(PlaqL.Grid());
const int Ngen = SU3Adjoint::Dimension;
Complex ci(0,1);
ColourMatrix tb;
ColourMatrix tc;
for(int b=0;b<Ngen;b++) {
SU3::generator(b, tb);
Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR );
for(int c=0;c<Ngen;c++) {
SU3::generator(c, tc);
auto tmp =closure( -trace(ci*tc*Nx));
PokeIndex<ColourIndex>(NxAd,tmp,c,b);
}
}
}
void ApplyMask(GaugeField &U,int smr)
{
LatticeComplex tmp(U.Grid());
GaugeLinkField Umu(U.Grid());
for(int mu=0;mu<Nd;mu++){
Umu=PeekIndex<LorentzIndex>(U,mu);
tmp=PeekIndex<LorentzIndex>(masks[smr],mu);
Umu=Umu*tmp;
PokeIndex<LorentzIndex>(U, Umu, mu);
}
}
public:
void logDetJacobianForceLevel(const GaugeField &U, GaugeField &force ,int smr)
{
GridBase* grid = U.Grid();
ColourMatrix tb;
ColourMatrix tc;
ColourMatrix ta;
GaugeField C(grid);
GaugeField Umsk(grid);
std::vector<GaugeLinkField> Umu(Nd,grid);
GaugeLinkField Cmu(grid); // U and staple; C contains factor of epsilon
GaugeLinkField Zx(grid); // U times Staple, contains factor of epsilon
GaugeLinkField Nxx(grid); // Nxx fundamental space
GaugeLinkField Utmp(grid);
GaugeLinkField PlaqL(grid);
GaugeLinkField PlaqR(grid);
const int Ngen = SU3Adjoint::Dimension;
AdjMatrix TRb;
ColourMatrix Ident;
LatticeComplex cplx(grid);
AdjVectorField dJdXe_nMpInv(grid);
AdjVectorField dJdXe_nMpInv_y(grid);
AdjMatrixField MpAd(grid); // Mprime luchang's notes
AdjMatrixField MpAdInv(grid); // Mprime inverse
AdjMatrixField NxxAd(grid); // Nxx in adjoint space
AdjMatrixField JxAd(grid);
AdjMatrixField ZxAd(grid);
AdjMatrixField mZxAd(grid);
AdjMatrixField X(grid);
Complex ci(0,1);
RealD t0 = usecond();
Ident = ComplexD(1.0);
for(int d=0;d<Nd;d++){
Umu[d] = peekLorentz(U, d);
}
int mu= (smr/2) %Nd;
////////////////////////////////////////////////////////////////////////////////
// Mask the gauge field
////////////////////////////////////////////////////////////////////////////////
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
Umsk = U;
ApplyMask(Umsk,smr);
Utmp = peekLorentz(Umsk,mu);
////////////////////////////////////////////////////////////////////////////////
// Retrieve the eps/rho parameter(s) -- could allow all different but not so far
////////////////////////////////////////////////////////////////////////////////
double rho=this->StoutSmearing->SmearRho[1];
int idx=0;
for(int mu=0;mu<4;mu++){
for(int nu=0;nu<4;nu++){
if ( mu!=nu) assert(this->StoutSmearing->SmearRho[idx]==rho);
else assert(this->StoutSmearing->SmearRho[idx]==0.0);
idx++;
}}
//////////////////////////////////////////////////////////////////
// Assemble the N matrix
//////////////////////////////////////////////////////////////////
// Computes ALL the staples -- could compute one only and do it here
RealD time;
time=-usecond();
this->StoutSmearing->BaseSmear(C, U);
Cmu = peekLorentz(C, mu);
//////////////////////////////////////////////////////////////////
// Assemble Luscher exp diff map J matrix
//////////////////////////////////////////////////////////////////
// Ta so Z lives in Lie algabra
Zx = Ta(Cmu * adj(Umu[mu]));
time+=usecond();
std::cout << GridLogMessage << "Z took "<<time<< " us"<<std::endl;
time=-usecond();
// Move Z to the Adjoint Rep == make_adjoint_representation
ZxAd = Zero();
for(int b=0;b<8;b++) {
// Adj group sets traceless antihermitian T's -- Guido, really????
SU3::generator(b, tb); // Fund group sets traceless hermitian T's
SU3Adjoint::generator(b,TRb);
TRb=-TRb;
cplx = 2.0*trace(ci*tb*Zx); // my convention 1/2 delta ba
ZxAd = ZxAd + cplx * TRb; // is this right? YES - Guido used Anti herm Ta's and with bloody wrong sign.
}
time+=usecond();
std::cout << GridLogMessage << "ZxAd took "<<time<< " us"<<std::endl;
//////////////////////////////////////
// J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
//////////////////////////////////////
time=-usecond();
X=1.0;
JxAd = X;
mZxAd = (-1.0)*ZxAd;
RealD kpfac = 1;
for(int k=1;k<12;k++){
X=X*mZxAd;
kpfac = kpfac /(k+1);
JxAd = JxAd + X * kpfac;
}
time+=usecond();
std::cout << GridLogMessage << "Jx took "<<time<< " us"<<std::endl;
//////////////////////////////////////
// dJ(x)/dxe
//////////////////////////////////////
time=-usecond();
std::vector<AdjMatrixField> dJdX; dJdX.resize(8,grid);
AdjMatrixField tbXn(grid);
AdjMatrixField sumXtbX(grid);
AdjMatrixField t2(grid);
AdjMatrixField dt2(grid);
AdjMatrixField t3(grid);
AdjMatrixField dt3(grid);
AdjMatrixField aunit(grid);
for(int b=0;b<8;b++){
aunit = ComplexD(1.0);
SU3Adjoint::generator(b, TRb); //dt2
X = (-1.0)*ZxAd;
t2 = X;
dt2 = TRb;
for (int j = 20; j > 1; --j) {
t3 = t2*(1.0 / (j + 1)) + aunit;
dt3 = dt2*(1.0 / (j + 1));
t2 = X * t3;
dt2 = TRb * t3 + X * dt3;
}
dJdX[b] = -dt2;
}
time+=usecond();
std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
/////////////////////////////////////////////////////////////////
// Mask Umu for this link
/////////////////////////////////////////////////////////////////
time=-usecond();
PlaqL = Ident;
PlaqR = Utmp*adj(Cmu);
ComputeNxy(PlaqL,PlaqR,NxxAd);
time+=usecond();
std::cout << GridLogMessage << "ComputeNxy took "<<time<< " us"<<std::endl;
////////////////////////////
// Mab
////////////////////////////
MpAd = Complex(1.0,0.0);
MpAd = MpAd - JxAd * NxxAd;
/////////////////////////
// invert the 8x8
/////////////////////////
time=-usecond();
MpAdInv = Inverse(MpAd);
time+=usecond();
std::cout << GridLogMessage << "MpAdInv took "<<time<< " us"<<std::endl;
RealD t3a = usecond();
/////////////////////////////////////////////////////////////////
// Nxx Mp^-1
/////////////////////////////////////////////////////////////////
AdjVectorField FdetV(grid);
AdjVectorField Fdet1_nu(grid);
AdjVectorField Fdet2_nu(grid);
AdjVectorField Fdet2_mu(grid);
AdjVectorField Fdet1_mu(grid);
AdjMatrixField nMpInv(grid);
nMpInv= NxxAd *MpAdInv;
AdjMatrixField MpInvJx(grid);
AdjMatrixField MpInvJx_nu(grid);
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
Fdet2_mu=FdetV;
Fdet1_mu=Zero();
for(int e =0 ; e<8 ; e++){
LatticeComplexD tr(grid);
ColourMatrix te;
SU3::generator(e, te);
tr = trace(dJdX[e] * nMpInv);
pokeColour(dJdXe_nMpInv,tr,e);
}
///////////////////////////////
// Mask it off
///////////////////////////////
auto tmp=PeekIndex<LorentzIndex>(masks[smr],mu);
dJdXe_nMpInv = dJdXe_nMpInv*tmp;
// dJdXe_nMpInv needs to multiply:
// Nxx_mu (site local) (1)
// Nxy_mu one site forward in each nu direction (3)
// Nxy_mu one site backward in each nu direction (3)
// Nxy_nu 0,0 ; +mu,0; 0,-nu; +mu-nu [ 3x4 = 12]
// 19 terms.
AdjMatrixField Nxy(grid);
GaugeField Fdet1(grid);
GaugeField Fdet2(grid);
GaugeLinkField Fdet_pol(grid); // one polarisation
RealD t4 = usecond();
for(int nu=0;nu<Nd;nu++){
if (nu!=mu) {
///////////////// +ve nu /////////////////
// __
// | |
// x== // nu polarisation -- clockwise
time=-usecond();
PlaqL=Ident;
PlaqR=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
Gimpl::CovShiftForward(Umu[mu], mu,
Gimpl::CovShiftBackward(Umu[nu], nu,
Gimpl::CovShiftIdentityBackward(Utmp, mu))));
time+=usecond();
std::cout << GridLogMessage << "PlaqLR took "<<time<< " us"<<std::endl;
time=-usecond();
dJdXe_nMpInv_y = dJdXe_nMpInv;
ComputeNxy(PlaqL,PlaqR,Nxy);
Fdet1_nu = transpose(Nxy)*dJdXe_nMpInv_y;
time+=usecond();
std::cout << GridLogMessage << "ComputeNxy (occurs 6x) took "<<time<< " us"<<std::endl;
time=-usecond();
PlaqR=(-1.0)*PlaqR;
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
Fdet2_nu = FdetV;
time+=usecond();
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
// x==
// | |
// .__| // nu polarisation -- anticlockwise
PlaqR=(rho)*Gimpl::CovShiftForward(Umu[nu], nu,
Gimpl::CovShiftBackward(Umu[mu], mu,
Gimpl::CovShiftIdentityBackward(Umu[nu], nu)));
PlaqL=Gimpl::CovShiftIdentityBackward(Utmp, mu);
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
ComputeNxy(PlaqL, PlaqR,Nxy);
Fdet1_nu = Fdet1_nu+transpose(Nxy)*dJdXe_nMpInv_y;
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_nu = Fdet2_nu+FdetV;
///////////////// -ve nu /////////////////
// __
// | |
// x== // nu polarisation -- clockwise
PlaqL=(rho)* Gimpl::CovShiftForward(Umu[mu], mu,
Gimpl::CovShiftForward(Umu[nu], nu,
Gimpl::CovShiftIdentityBackward(Utmp, mu)));
PlaqR = Gimpl::CovShiftIdentityForward(Umu[nu], nu);
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
ComputeNxy(PlaqL,PlaqR,Nxy);
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
MpInvJx_nu = Cshift(MpInvJx,nu,1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_nu = Fdet2_nu+FdetV;
// x==
// | |
// |__| // nu polarisation
PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
Gimpl::CovShiftIdentityBackward(Utmp, mu));
PlaqR=Gimpl::CovShiftBackward(Umu[mu], mu,
Gimpl::CovShiftIdentityForward(Umu[nu], nu));
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv_y,nu,1);
ComputeNxy(PlaqL,PlaqR,Nxy);
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_nu = Fdet2_nu+FdetV;
/////////////////////////////////////////////////////////////////////
// Set up the determinant force contribution in 3x3 algebra basis
/////////////////////////////////////////////////////////////////////
InsertForce(Fdet1,Fdet1_nu,nu);
InsertForce(Fdet2,Fdet2_nu,nu);
//////////////////////////////////////////////////
// Parallel direction terms
//////////////////////////////////////////////////
// __
// | "
// |__"x // mu polarisation
PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
Gimpl::CovShiftBackward(Umu[nu], nu,
Gimpl::CovShiftIdentityBackward(Utmp, mu)));
PlaqR=Gimpl::CovShiftIdentityBackward(Umu[nu], nu);
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,-1);
ComputeNxy(PlaqL,PlaqR,Nxy);
Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
MpInvJx_nu = Cshift(MpInvJx,nu,-1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_mu = Fdet2_mu+FdetV;
// __
// " |
// x__| // mu polarisation
PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
Gimpl::CovShiftForward(Umu[nu], nu,
Gimpl::CovShiftIdentityBackward(Utmp, mu)));
PlaqR=Gimpl::CovShiftIdentityForward(Umu[nu], nu);
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
ComputeNxy(PlaqL,PlaqR,Nxy);
Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
MpInvJx_nu = Cshift(MpInvJx,nu,1);
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
Fdet2_mu = Fdet2_mu+FdetV;
}
}
RealD t5 = usecond();
Fdet1_mu = Fdet1_mu + transpose(NxxAd)*dJdXe_nMpInv;
InsertForce(Fdet1,Fdet1_mu,mu);
InsertForce(Fdet2,Fdet2_mu,mu);
force= (-0.5)*( Fdet1 + Fdet2);
RealD t1 = usecond();
std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl;
std::cout << GridLogMessage << " logDetJacobianForce t3-t0 "<<t3a-t0<<" us "<<std::endl;
std::cout << GridLogMessage << " logDetJacobianForce t4-t3 dJdXe_nMpInv "<<t4-t3a<<" us "<<std::endl;
std::cout << GridLogMessage << " logDetJacobianForce t5-t4 mu nu loop "<<t5-t4<<" us "<<std::endl;
std::cout << GridLogMessage << " logDetJacobianForce t1-t5 "<<t1-t5<<" us "<<std::endl;
std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl;
}
RealD logDetJacobianLevel(const GaugeField &U,int smr)
{
GridBase* grid = U.Grid();
GaugeField C(grid);
GaugeLinkField Nb(grid);
GaugeLinkField Z(grid);
GaugeLinkField Umu(grid), Cmu(grid);
ColourMatrix Tb;
ColourMatrix Tc;
typedef typename SU3Adjoint::AMatrix AdjMatrix;
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
typedef typename SU3Adjoint::LatticeAdjVector AdjVectorField;
const int Ngen = SU3Adjoint::Dimension;
AdjMatrix TRb;
LatticeComplex cplx(grid);
AdjVectorField AlgV(grid);
AdjMatrixField Mab(grid);
AdjMatrixField Ncb(grid);
AdjMatrixField Jac(grid);
AdjMatrixField Zac(grid);
AdjMatrixField mZac(grid);
AdjMatrixField X(grid);
int mu= (smr/2) %Nd;
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
//////////////////////////////////////////////////////////////////
// Assemble the N matrix
//////////////////////////////////////////////////////////////////
// Computes ALL the staples -- could compute one only here
this->StoutSmearing->BaseSmear(C, U);
Cmu = peekLorentz(C, mu);
Umu = peekLorentz(U, mu);
Complex ci(0,1);
for(int b=0;b<Ngen;b++) {
SU3::generator(b, Tb);
// Qlat Tb = 2i Tb^Grid
Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
for(int c=0;c<Ngen;c++) {
SU3::generator(c, Tc);
auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
PokeIndex<ColourIndex>(Ncb,tmp,c,b);
}
}
//////////////////////////////////////////////////////////////////
// Assemble Luscher exp diff map J matrix
//////////////////////////////////////////////////////////////////
// Ta so Z lives in Lie algabra
Z = Ta(Cmu * adj(Umu));
// Move Z to the Adjoint Rep == make_adjoint_representation
Zac = Zero();
for(int b=0;b<8;b++) {
// Adj group sets traceless antihermitian T's -- Guido, really????
// Is the mapping of these the same? Same structure constants
// Might never have been checked.
SU3::generator(b, Tb); // Fund group sets traceless hermitian T's
SU3Adjoint::generator(b,TRb);
TRb=-TRb;
cplx = 2.0*trace(ci*Tb*Z); // my convention 1/2 delta ba
Zac = Zac + cplx * TRb; // is this right? YES - Guido used Anti herm Ta's and with bloody wrong sign.
}
//////////////////////////////////////
// J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
//////////////////////////////////////
X=1.0;
Jac = X;
mZac = (-1.0)*Zac;
RealD kpfac = 1;
for(int k=1;k<12;k++){
X=X*mZac;
kpfac = kpfac /(k+1);
Jac = Jac + X * kpfac;
}
////////////////////////////
// Mab
////////////////////////////
Mab = Complex(1.0,0.0);
Mab = Mab - Jac * Ncb;
////////////////////////////
// det
////////////////////////////
LatticeComplex det(grid);
det = Determinant(Mab);
////////////////////////////
// ln det
////////////////////////////
LatticeComplex ln_det(grid);
ln_det = log(det);
////////////////////////////
// Masked sum
////////////////////////////
ln_det = ln_det * mask;
Complex result = sum(ln_det);
return result.real();
}
public:
RealD logDetJacobian(void)
{
RealD ln_det = 0;
if (this->smearingLevels > 0)
{
double start = usecond();
for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
ln_det+= logDetJacobianLevel(this->get_smeared_conf(ismr-1),ismr);
}
ln_det +=logDetJacobianLevel(*(this->ThinLinks),0);
double end = usecond();
double time = (end - start)/ 1e3;
std::cout << GridLogMessage << "GaugeConfigurationMasked: logDetJacobian took " << time << " ms" << std::endl;
}
return ln_det;
}
void logDetJacobianForce(GaugeField &force)
{
force =Zero();
GaugeField force_det(force.Grid());
if (this->smearingLevels > 0)
{
double start = usecond();
GaugeLinkField tmp_mu(force.Grid());
for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
// remove U in UdSdU...
for (int mu = 0; mu < Nd; mu++) {
tmp_mu = adj(peekLorentz(this->get_smeared_conf(ismr), mu)) * peekLorentz(force, mu);
pokeLorentz(force, tmp_mu, mu);
}
// Propagate existing force
force = this->AnalyticSmearedForce(force, this->get_smeared_conf(ismr - 1), ismr);
// Add back U in UdSdU...
for (int mu = 0; mu < Nd; mu++) {
tmp_mu = peekLorentz(this->get_smeared_conf(ismr - 1), mu) * peekLorentz(force, mu);
pokeLorentz(force, tmp_mu, mu);
}
// Get this levels determinant force
force_det = Zero();
logDetJacobianForceLevel(this->get_smeared_conf(ismr-1),force_det,ismr);
// Sum the contributions
force = force + force_det;
}
// remove U in UdSdU...
for (int mu = 0; mu < Nd; mu++) {
tmp_mu = adj(peekLorentz(this->get_smeared_conf(0), mu)) * peekLorentz(force, mu);
pokeLorentz(force, tmp_mu, mu);
}
force = this->AnalyticSmearedForce(force, *this->ThinLinks,0);
for (int mu = 0; mu < Nd; mu++) {
tmp_mu = peekLorentz(*this->ThinLinks, mu) * peekLorentz(force, mu);
pokeLorentz(force, tmp_mu, mu);
}
force_det = Zero();
logDetJacobianForceLevel(*this->ThinLinks,force_det,0);
force = force + force_det;
force=Ta(force); // Ta
double end = usecond();
double time = (end - start)/ 1e3;
std::cout << GridLogMessage << "GaugeConfigurationMasked: lnDetJacobianForce took " << time << " ms" << std::endl;
} // if smearingLevels = 0 do nothing
}
private:
//====================================================================
// Override base clas here to mask it
virtual void fill_smearedSet(GaugeField &U)
{
this->ThinLinks = &U; // attach the smearing routine to the field U
// check the pointer is not null
if (this->ThinLinks == NULL)
std::cout << GridLogError << "[SmearedConfigurationMasked] Error in ThinLinks pointer\n";
if (this->smearingLevels > 0)
{
std::cout << GridLogMessage << "[SmearedConfigurationMasked] Filling SmearedSet\n";
GaugeField previous_u(this->ThinLinks->Grid());
GaugeField smeared_A(this->ThinLinks->Grid());
GaugeField smeared_B(this->ThinLinks->Grid());
previous_u = *this->ThinLinks;
double start = usecond();
for (int smearLvl = 0; smearLvl < this->smearingLevels; ++smearLvl)
{
this->StoutSmearing->smear(smeared_A, previous_u);
ApplyMask(smeared_A,smearLvl);
smeared_B = previous_u;
ApplyMask(smeared_B,smearLvl);
// Replace only the masked portion
this->SmearedSet[smearLvl] = previous_u-smeared_B + smeared_A;
previous_u = this->SmearedSet[smearLvl];
// For debug purposes
RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(previous_u);
std::cout << GridLogMessage << "[SmearedConfigurationMasked] smeared Plaq: " << impl_plaq << std::endl;
}
double end = usecond();
double time = (end - start)/ 1e3;
std::cout << GridLogMessage << "GaugeConfigurationMasked: Link smearing took " << time << " ms" << std::endl;
}
}
//====================================================================
// Override base to add masking
virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
const GaugeField& GaugeK,int level)
{
GridBase* grid = GaugeK.Grid();
GaugeField C(grid), SigmaK(grid), iLambda(grid);
GaugeField SigmaKPrimeA(grid);
GaugeField SigmaKPrimeB(grid);
GaugeLinkField iLambda_mu(grid);
GaugeLinkField iQ(grid), e_iQ(grid);
GaugeLinkField SigmaKPrime_mu(grid);
GaugeLinkField GaugeKmu(grid), Cmu(grid);
this->StoutSmearing->BaseSmear(C, GaugeK);
SigmaK = Zero();
iLambda = Zero();
SigmaKPrimeA = SigmaKPrime;
ApplyMask(SigmaKPrimeA,level);
SigmaKPrimeB = SigmaKPrime - SigmaKPrimeA;
// Could get away with computing only one polarisation here
// int mu= (smr/2) %Nd;
// SigmaKprime_A has only one component
for (int mu = 0; mu < Nd; mu++)
{
Cmu = peekLorentz(C, mu);
GaugeKmu = peekLorentz(GaugeK, mu);
SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
iQ = Ta(Cmu * adj(GaugeKmu));
this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
pokeLorentz(iLambda, iLambda_mu, mu);
}
this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK); // derivative of SmearBase
////////////////////////////////////////////////////////////////////////////////////
// propagate the rest of the force as identity map, just add back
////////////////////////////////////////////////////////////////////////////////////
SigmaK = SigmaK+SigmaKPrimeB;
return SigmaK;
}
public:
/* Standard constructor */
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
{
assert(Nsmear%(2*Nd)==0); // Or multiply by 8??
// was resized in base class
assert(this->SmearedSet.size()==Nsmear);
GridRedBlackCartesian * UrbGrid;
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
LatticeComplex tmp(_UGrid);
for (unsigned int i = 0; i < this->smearingLevels; ++i) {
masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
int mu= (i/2) %Nd;
int cb= (i%2);
LatticeComplex tmpcb(UrbGrid);
masks[i]=Zero();
////////////////////
// Setup the mask
////////////////////
tmp = Zero();
pickCheckerboard(cb,tmpcb,one);
setCheckerboard(tmp,tmpcb);
PokeIndex<LorentzIndex>(masks[i],tmp, mu);
}
delete UrbGrid;
}
virtual void smeared_force(GaugeField &SigmaTilde)
{
if (this->smearingLevels > 0)
{
double start = usecond();
GaugeField force = SigmaTilde; // actually = U*SigmaTilde
GaugeLinkField tmp_mu(SigmaTilde.Grid());
// Remove U from UdSdU
for (int mu = 0; mu < Nd; mu++)
{
// to get just SigmaTilde
tmp_mu = adj(peekLorentz(this->SmearedSet[this->smearingLevels - 1], mu)) * peekLorentz(force, mu);
pokeLorentz(force, tmp_mu, mu);
}
for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
force = this->AnalyticSmearedForce(force, this->get_smeared_conf(ismr - 1),ismr);
}
force = this->AnalyticSmearedForce(force, *this->ThinLinks,0);
// Add U to UdSdU
for (int mu = 0; mu < Nd; mu++)
{
tmp_mu = peekLorentz(*this->ThinLinks, mu) * peekLorentz(force, mu);
pokeLorentz(SigmaTilde, tmp_mu, mu);
}
double end = usecond();
double time = (end - start)/ 1e3;
std::cout << GridLogMessage << " GaugeConfigurationMasked: Smeared Force chain rule took " << time << " ms" << std::endl;
} // if smearingLevels = 0 do nothing
SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
}
};
NAMESPACE_END(Grid);

View File

@ -0,0 +1,87 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/JacobianAction.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////
// Jacobian Action ..
////////////////////////////////////////////////////////////////////////
template <class Gimpl>
class JacobianAction : public Action<typename Gimpl::GaugeField> {
public:
INHERIT_GIMPL_TYPES(Gimpl);
SmearedConfigurationMasked<Gimpl> * smearer;
/////////////////////////// constructors
explicit JacobianAction(SmearedConfigurationMasked<Gimpl> * _smearer ) { smearer=_smearer;};
virtual std::string action_name() {return "JacobianAction";}
virtual std::string LogParameters(){
std::stringstream sstream;
sstream << GridLogMessage << "[JacobianAction] " << std::endl;
return sstream.str();
}
//////////////////////////////////
// Usual cases are not used
//////////////////////////////////
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ assert(0);};
virtual RealD S(const GaugeField &U) { assert(0); }
virtual void deriv(const GaugeField &U, GaugeField &dSdU) { assert(0); }
//////////////////////////////////
// Functions of smart configs only
//////////////////////////////////
virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
{
return;
}
virtual RealD S(ConfigurationBase<GaugeField>& U)
{
// det M = e^{ - ( - logDetM) }
assert( &U == smearer );
return -smearer->logDetJacobian();
}
virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
{
return S(U);
}
virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
{
assert( &U == smearer );
smearer->logDetJacobianForce(dSdU);
}
private:
};
NAMESPACE_END(Grid);

View File

@ -40,7 +40,9 @@ template <class Gimpl>
class Smear_Stout : public Smear<Gimpl> {
private:
int OrthogDim = -1;
public:
const std::vector<double> SmearRho;
private:
// Smear<Gimpl>* ownership semantics:
// Smear<Gimpl>* passed in to constructor are owned by caller, so we don't delete them here
// Smear<Gimpl>* created within constructor need to be deleted as part of the destructor

View File

@ -34,6 +34,61 @@ directory
NAMESPACE_BEGIN(Grid);
template<int N, class Vec>
Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
typedef typename Vec::scalar_type scalar;
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<scalar, N> > > Us;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
scalar tmp= Us()()(i,j);
ComplexD ztmp(real(tmp),imag(tmp));
EigenU(i,j)=ztmp;
}}
ComplexD detD = EigenU.determinant();
typename Vec::scalar_type det(detD.real(),detD.imag());
pokeLocalSite(det,ret_v,lcoor);
});
return ret;
}
template<int N, class Vec>
static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
{
Umu = ProjectOnGroup(Umu);
auto det = Determinant(Umu);
det = conjugate(det);
for(int i=0;i<N;i++){
auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
element = element * det;
PokeIndex<ColourIndex>(Umu,element,Nc-1,i);
}
}
template<int N,class Vec>
static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<Vec, N> >,Nd> > &U)
{
GridBase *grid=U.Grid();
// Reunitarise
for(int mu=0;mu<Nd;mu++){
auto Umu = PeekIndex<LorentzIndex>(U,mu);
Umu = ProjectOnGroup(Umu);
ProjectSUn(Umu);
PokeIndex<LorentzIndex>(U,Umu,mu);
}
}
template <int ncolour>
class SU {
public:
@ -741,8 +796,14 @@ public:
typedef Lattice<vMatrixType> LatticeMatrixType;
LatticeMatrixType Umu(out.Grid());
LatticeMatrixType tmp(out.Grid());
for (int mu = 0; mu < Nd; mu++) {
LieRandomize(pRNG, Umu, 1.0);
// LieRandomize(pRNG, Umu, 1.0);
// PokeIndex<LorentzIndex>(out, Umu, mu);
gaussian(pRNG,Umu);
tmp = Ta(Umu);
taExp(tmp,Umu);
ProjectSUn(Umu);
PokeIndex<LorentzIndex>(out, Umu, mu);
}
}
@ -799,12 +860,12 @@ public:
};
template<int N>
LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
LatticeComplexD ret(grid);
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
@ -812,42 +873,21 @@ LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N>
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
EigenU(i,j) = Us()()(i,j);
}}
ComplexD det = EigenU.determinant();
pokeLocalSite(det,ret_v,lcoor);
Eigen::MatrixXcd EigenUinv = EigenU.inverse();
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
Ui()()(i,j) = EigenUinv(i,j);
}}
pokeLocalSite(Ui,ret_v,lcoor);
});
return ret;
}
template<int N>
static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
{
Umu = ProjectOnGroup(Umu);
auto det = Determinant(Umu);
det = conjugate(det);
for(int i=0;i<N;i++){
auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
element = element * det;
PokeIndex<ColourIndex>(Umu,element,Nc-1,i);
}
}
template<int N>
static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<vComplexD, N> >,Nd> > &U)
{
GridBase *grid=U.Grid();
// Reunitarise
for(int mu=0;mu<Nd;mu++){
auto Umu = PeekIndex<LorentzIndex>(U,mu);
Umu = ProjectOnGroup(Umu);
ProjectSUn(Umu);
PokeIndex<LorentzIndex>(U,Umu,mu);
}
}
// Explicit specialisation for SU(3).
// Explicit specialisation for SU(3).
static void

View File

@ -51,6 +51,7 @@ public:
typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF;
typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD;
typedef Lattice<iScalar<iScalar<iVector<vComplex, Dimension> > > > LatticeAdjVector;
template <class cplx>
static void generator(int Index, iSUnAdjointMatrix<cplx> &iAdjTa) {

View File

@ -320,7 +320,7 @@ struct Conj{
struct TimesMinusI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
inline float32x4_t operator()(float32x4_t in){
// ar ai br bi -> ai -ar ai -br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
@ -328,7 +328,7 @@ struct TimesMinusI{
return vtrn1q_f32(r1, r0); // ar -ai br -bi
}
//Complex double
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
inline float64x2_t operator()(float64x2_t in){
// a ib -> b -ia
float64x2_t tmp;
tmp = vnegq_f64(in);
@ -338,7 +338,7 @@ struct TimesMinusI{
struct TimesI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
inline float32x4_t operator()(float32x4_t in){
// ar ai br bi -> -ai ar -bi br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
@ -346,7 +346,7 @@ struct TimesI{
return vtrn1q_f32(r1, in); // -ai ar -bi br
}
//Complex double
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
inline float64x2_t operator()(float64x2_t in){
// a ib -> -b ia
float64x2_t tmp;
tmp = vnegq_f64(in);

View File

@ -123,7 +123,7 @@ public:
}
if ( permute_slice ) {
int ptype =grid->PermuteType(d);
uint8_t mask =grid->Nsimd() >> (ptype + 1);
uint8_t mask =0x1<<ptype;
SE._permute |= mask;
}
}

View File

@ -339,8 +339,8 @@ public:
// Vectors that live on the symmetric heap in case of SHMEM
// These are used; either SHM objects or refs to the above symmetric heap vectors
// depending on comms target
Vector<cobj *> u_simd_send_buf;
Vector<cobj *> u_simd_recv_buf;
std::vector<cobj *> u_simd_send_buf;
std::vector<cobj *> u_simd_recv_buf;
int u_comm_offset;
int _unified_buffer_size;
@ -348,7 +348,7 @@ public:
////////////////////////////////////////
// Stencil query
////////////////////////////////////////
#ifdef SHM_FAST_PATH
#if 1
inline int SameNode(int point) {
int dimension = this->_directions[point];
@ -434,7 +434,6 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
accelerator_barrier();
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
@ -443,7 +442,6 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
_grid->StencilBarrier();// Synch shared memory on a single nodes
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
@ -452,6 +450,8 @@ public:
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull();
acceleratorCopySynchronise();
_grid->StencilBarrier();
}
////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel.
@ -529,7 +529,6 @@ public:
{
_grid->StencilBarrier();// Synch shared memory on a single nodes
// conformable(source.Grid(),_grid);
assert(source.Grid()==_grid);
u_comm_offset=0;
@ -540,6 +539,7 @@ public:
compress.Point(point);
HaloGatherDir(source,compress,point,face_idx);
}
accelerator_barrier();
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
@ -655,8 +655,8 @@ public:
CommsMerge(decompress,Mergers,Decompressions);
}
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
_grid->StencilBarrier();// Synch shared memory on a single nodes
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
assert(MergersSHM.size()==0);
assert(DecompressionsSHM.size()==0);
}
template<class decompressor>
@ -705,6 +705,7 @@ public:
}
}
}
std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@ -1366,10 +1367,11 @@ public:
int recv_from_rank;
int xmit_to_rank;
int shm_send=0;
int shm_recv=0;
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
#ifdef SHM_FAST_PATH
#warning STENCIL SHM FAST PATH SELECTED
int shm_recv=0;
// shm == receive pointer if offnode
// shm == Translate[send pointer] if on node -- my view of his send pointer
cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
@ -1402,7 +1404,6 @@ public:
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
}
int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
AddPacket((void *)sp,(void *)rp,
xmit_to_rank,do_send,
recv_from_rank,do_send,

View File

@ -90,10 +90,12 @@ template<class vtype,int N> accelerator_inline iVector<vtype,N> ProjectOnGroup(c
template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
{
typedef typename iMatrix<vtype,N>::scalar_type scalar;
// need a check for the group type?
iMatrix<vtype,N> ret(arg);
vtype nrm;
vtype inner;
scalar one(1.0);
for(int c1=0;c1<N;c1++){
// Normalises row c1
@ -102,7 +104,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
nrm = sqrt(inner);
nrm = 1.0/nrm;
nrm = one/nrm;
for(int c2=0;c2<N;c2++)
ret._internal[c1][c2]*= nrm;
@ -127,7 +129,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
nrm = sqrt(inner);
nrm = 1.0/nrm;
nrm = one/nrm;
for(int c2=0;c2<N;c2++)
ret._internal[c1][c2]*= nrm;
}

View File

@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
// Specialisation: Cayley-Hamilton exponential for SU(3)
#ifndef GRID_ACCELERATED
#if 0
template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )
{

View File

@ -133,7 +133,6 @@ typename vobj::scalar_object extractLane(int lane, const vobj & __restrict__ vec
typedef scalar_type * pointer;
constexpr int words=sizeof(vobj)/sizeof(vector_type);
constexpr int Nsimd=vector_type::Nsimd();
scalar_object extracted;
pointer __restrict__ sp = (pointer)&extracted; // Type pun
@ -153,7 +152,6 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob
typedef scalar_type * pointer;
constexpr int words=sizeof(vobj)/sizeof(vector_type);
constexpr int Nsimd=vector_type::Nsimd();
pointer __restrict__ sp = (pointer)&extracted;
vector_type *vp = (vector_type *)&vec;
@ -178,8 +176,6 @@ void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int off
const int s = Nsimd/Nextr;
vector_type * vp = (vector_type *)&vec;
scalar_type vtmp;
sobj_scalar_type stmp;
for(int w=0;w<words;w++){
for(int i=0;i<Nextr;i++){
sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
@ -205,7 +201,6 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
vector_type * vp = (vector_type *)&vec;
scalar_type vtmp;
sobj_scalar_type stmp;
for(int w=0;w<words;w++){
for(int i=0;i<Nextr;i++){
sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
@ -242,9 +237,6 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
typedef oextract_type * opointer;
typedef iextract_type * ipointer;
constexpr int oNsimd=ovector_type::Nsimd();
constexpr int iNsimd=ivector_type::Nsimd();
iscalar_type itmp;
oscalar_type otmp;

View File

@ -458,7 +458,8 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
// Common on all GPU targets
//////////////////////////////////////////////
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
#define accelerator_for( iter, num, nsimd, ... ) \
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
@ -525,7 +526,7 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);};
//////////////////////////////////////////////
#ifdef GRID_SYCL
inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); };
#else
// Ordering within a stream guaranteed on Nvidia & AMD
inline void acceleratorFenceComputeStream(void){ };

224
HMC/FTHMC2p1f.cc Normal file
View File

@ -0,0 +1,224 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Copyright (C) 2023
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
using namespace Grid;
int main(int argc, char **argv)
{
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
// here make a routine to print all the relevant information on the run
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
// Typedefs to simplify notation
typedef WilsonImplR FermionImplPolicy;
typedef MobiusFermionD FermionAction;
typedef typename FermionAction::FermionField FermionField;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
MD.MDsteps = 12;
MD.trajL = 1.0;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 0;
HMCparams.Trajectories = 200;
HMCparams.NoMetropolisUntil= 20;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
HMCparams.StartingType =std::string("HotStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
// Grid from the command line arguments --grid and --mpi
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_EODWF_lat";
CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
CPparams.rng_prefix = "ckpoint_EODWF_rng";
CPparams.saveInterval = 1;
CPparams.saveSmeared = true;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
RNGModuleParameters RNGpar;
RNGpar.serial_seeds = "1 2 3 4 5";
RNGpar.parallel_seeds = "6 7 8 9 10";
TheHMC.Resources.SetRNGSeeds(RNGpar);
// Construct observables
// here there is too much indirection
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
TheHMC.Resources.AddObservable<PlaqObs>();
//////////////////////////////////////////////
const int Ls = 16;
Real beta = 2.13;
Real light_mass = 0.01;
Real strange_mass = 0.04;
Real pv_mass = 1.0;
RealD M5 = 1.8;
RealD b = 1.0; // Scale factor two
RealD c = 0.0;
OneFlavourRationalParams OFRp;
OFRp.lo = 1.0e-2;
OFRp.hi = 64;
OFRp.MaxIter = 10000;
OFRp.tolerance= 1.0e-10;
OFRp.degree = 14;
OFRp.precision= 40;
std::vector<Real> hasenbusch({ 0.1 });
auto GridPtr = TheHMC.Resources.GetCartesian();
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
IwasakiGaugeActionR GaugeAction(beta);
// temporarily need a gauge field
LatticeGaugeField U(GridPtr);
LatticeGaugeField Uhot(GridPtr);
// These lines are unecessary if BC are all periodic
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
double StoppingCondition = 1e-10;
double MaxCGIterations = 30000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
bool ApplySmearing = true;
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(2);
ActionLevel<HMCWrapper::Field> Level3(4);
////////////////////////////////////
// Strange action
////////////////////////////////////
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
EOFA(Strange_Op_L, Strange_Op_R,
CG,
CG, CG,
CG, CG,
OFRp, false);
EOFA.is_smeared = ApplySmearing;
Level1.push_back(&EOFA);
////////////////////////////////////
// up down action
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]);
light_num.push_back(hasenbusch[h]);
}
light_num.push_back(pv_mass);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
for(int h=0;h<n_hasenbusch+1;h++){
std::cout << GridLogMessage << " 2f quotient Action "<< light_num[h] << " / " << light_den[h]<< std::endl;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
}
for(int h=0;h<n_hasenbusch+1;h++){
Quotients[h]->is_smeared = ApplySmearing;
Level1.push_back(Quotients[h]);
}
/////////////////////////////////////////////////////////////
// lnDetJacobianAction
/////////////////////////////////////////////////////////////
double rho = 0.1; // smearing parameter
int Nsmear = 1; // number of smearing levels - must be multiple of 2Nd
int Nstep = 8*Nsmear; // number of smearing levels - must be multiple of 2Nd
Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
if( ApplySmearing ) Level2.push_back(&Jacobian);
std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
/////////////////////////////////////////////////////////////
// Gauge action
/////////////////////////////////////////////////////////////
// GaugeAction.is_smeared = ApplySmearing;
GaugeAction.is_smeared = true;
Level3.push_back(&GaugeAction);
std::cout << GridLogMessage << " ************************************************"<< std::endl;
std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
std::cout << GridLogMessage << " ************************************************"<< std::endl;
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
} // main

View File

@ -227,7 +227,7 @@ int main(int argc, char **argv) {
// std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
int SP_iters=10000;
int SP_iters=9000;
RationalActionParams OFRp; // Up/down
OFRp.lo = 6.0e-5;
@ -362,12 +362,12 @@ int main(int argc, char **argv) {
// Probably dominates the force - back to EOFA.
OneFlavourRationalParams SFRp;
SFRp.lo = 0.25;
SFRp.lo = 0.1;
SFRp.hi = 25.0;
SFRp.MaxIter = 10000;
SFRp.tolerance= 1.0e-5;
SFRp.tolerance= 1.0e-8;
SFRp.mdtolerance= 2.0e-4;
SFRp.degree = 8;
SFRp.degree = 12;
SFRp.precision= 50;
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
@ -451,7 +451,7 @@ int main(int argc, char **argv) {
#define MIXED_PRECISION
#ifdef MIXED_PRECISION
std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy> *> Bdys;
std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys;
#else
std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
#endif
@ -526,15 +526,13 @@ int main(int argc, char **argv) {
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
} else {
#ifdef MIXED_PRECISION
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>(
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
*Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h],
*Numerators[h],*Denominators[h],
OFRp, SP_iters) );
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>(
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
*Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h],
*Numerators[h],*Denominators[h],
OFRp, SP_iters) );
#else
Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));

View File

@ -329,7 +329,6 @@ int main(int argc, char **argv) {
auto grid4= GridPtr;
auto rbgrid4= GridRBPtr;
auto rbgrid = StrangeOp.FermionRedBlackGrid();
auto grid = StrangeOp.FermionGrid();
if(1){

View File

@ -146,6 +146,8 @@ NAMESPACE_END(Grid);
int main(int argc, char **argv) {
using namespace Grid;
std::cout << " Grid Initialise "<<std::endl;
Grid_init(&argc, &argv);
CartesianCommunicator::BarrierWorld();
@ -164,35 +166,30 @@ int main(int argc, char **argv) {
typedef MobiusEOFAFermionF FermionEOFAActionF;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef WilsonImplD2 FermionImplPolicyD2;
typedef MobiusFermionD2 FermionActionD2;
typedef MobiusEOFAFermionD2 FermionEOFAActionD2;
typedef typename FermionActionD2::FermionField FermionFieldD2;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
IntegratorParameters MD;
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
// MD.name = std::string("Leap Frog");
typedef GenericHMCRunner<ForceGradient> HMCWrapper;
MD.name = std::string("Force Gradient");
//typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
// MD.name = std::string("MinimumNorm2");
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
// MD.name = std::string("Force Gradient");
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
MD.name = std::string("MinimumNorm2");
// TrajL = 2
// 4/2 => 0.6 dH
// 3/3 => 0.8 dH .. depth 3, slower
//MD.MDsteps = 4;
MD.MDsteps = 12;
MD.MDsteps = 14;
MD.trajL = 0.5;
HMCparameters HMCparams;
HMCparams.StartTrajectory = 1077;
HMCparams.Trajectories = 1;
HMCparams.Trajectories = 20;
HMCparams.NoMetropolisUntil= 0;
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
// HMCparams.StartingType =std::string("ColdStart");
HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.StartingType =std::string("ColdStart");
// HMCparams.StartingType =std::string("CheckpointStart");
HMCparams.MD = MD;
HMCWrapper TheHMC(HMCparams);
@ -228,7 +225,7 @@ int main(int argc, char **argv) {
Real pv_mass = 1.0;
// std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
auto GridPtr = TheHMC.Resources.GetCartesian();
@ -249,11 +246,6 @@ int main(int argc, char **argv) {
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
Coordinate CommDim(Nd);
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Coordinate NonDirichlet(Nd+1,0);
//////////////////////////
// Fermion Grids
@ -272,7 +264,6 @@ int main(int argc, char **argv) {
// temporarily need a gauge field
LatticeGaugeFieldD U(GridPtr); U=Zero();
LatticeGaugeFieldF UF(GridPtrF); UF=Zero();
LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero();
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
@ -283,15 +274,13 @@ int main(int argc, char **argv) {
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
FermionActionF::ImplParams ParamsF(boundary);
Params.dirichlet=NonDirichlet;
ParamsF.dirichlet=NonDirichlet;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
double StoppingCondition = 1e-8;
double MDStoppingCondition = 1e-7;
double MDStoppingConditionLoose = 1e-7;
double MDStoppingConditionStrange = 1e-7;
double StoppingCondition = 1e-9;
double MDStoppingCondition = 1e-8;
double MDStoppingConditionLoose = 1e-8;
double MDStoppingConditionStrange = 1e-8;
double MaxCGIterations = 300000;
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
@ -311,12 +300,12 @@ int main(int argc, char **argv) {
// Probably dominates the force - back to EOFA.
OneFlavourRationalParams SFRp;
SFRp.lo = 0.25;
SFRp.hi = 25.0;
SFRp.lo = 0.1;
SFRp.hi = 30.0;
SFRp.MaxIter = 10000;
SFRp.tolerance= 1.0e-5;
SFRp.mdtolerance= 2.0e-4;
SFRp.degree = 8;
SFRp.tolerance= 1.0e-8;
SFRp.mdtolerance= 2.0e-6;
SFRp.degree = 10;
SFRp.precision= 50;
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
@ -376,33 +365,29 @@ int main(int argc, char **argv) {
////////////////////////////////////
std::vector<Real> light_den;
std::vector<Real> light_num;
std::vector<int> dirichlet_den;
std::vector<int> dirichlet_num;
int n_hasenbusch = hasenbusch.size();
light_den.push_back(light_mass); dirichlet_den.push_back(0);
light_den.push_back(light_mass);
for(int h=0;h<n_hasenbusch;h++){
light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(0);
light_den.push_back(hasenbusch[h]);
}
for(int h=0;h<n_hasenbusch;h++){
light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(0);
light_num.push_back(hasenbusch[h]);
}
light_num.push_back(pv_mass); dirichlet_num.push_back(0);
light_num.push_back(pv_mass);
std::vector<FermionAction *> Numerators;
std::vector<FermionAction *> Denominators;
std::vector<FermionActionF *> NumeratorsF;
std::vector<FermionActionF *> DenominatorsF;
std::vector<FermionActionD2 *> NumeratorsD2;
std::vector<FermionActionD2 *> DenominatorsD2;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
std::vector<MxPCG *> ActionMPCG;
std::vector<MxPCG *> MPCG;
#define MIXED_PRECISION
#ifdef MIXED_PRECISION
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys;
#else
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
#endif
@ -416,9 +401,7 @@ int main(int argc, char **argv) {
std::cout << GridLogMessage
<< " 2f quotient Action ";
std::cout << "det D("<<light_den[h]<<")";
if ( dirichlet_den[h] ) std::cout << "^dirichlet ";
std::cout << "/ det D("<<light_num[h]<<")";
if ( dirichlet_num[h] ) std::cout << "^dirichlet ";
std::cout << std::endl;
FermionAction::ImplParams ParamsNum(boundary);
@ -426,21 +409,11 @@ int main(int argc, char **argv) {
FermionActionF::ImplParams ParamsDenF(boundary);
FermionActionF::ImplParams ParamsNumF(boundary);
ParamsNum.dirichlet = NonDirichlet;
ParamsDen.dirichlet = NonDirichlet;
ParamsNum.partialDirichlet = 0;
ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet;
ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
ParamsNumF.dirichlet = ParamsNum.dirichlet;
ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
@ -477,7 +450,6 @@ int main(int argc, char **argv) {
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
// TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;

View File

@ -1,7 +1,8 @@
# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview)
# Grid
**Data parallel C++ mathematical object library.**
[![Teamcity status](https://ci.dev.dirac.ed.ac.uk/guestAuth/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:GridBasedSoftware_Grid)),branch:default:true)/statusIcon.svg)](https://ci.dev.dirac.ed.ac.uk/project/GridBasedSoftware_Grid?mode=builds)
License: GPL v2.
Last update June 2017.

View File

@ -425,7 +425,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
err = r_eo-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);

View File

@ -0,0 +1,387 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
LatticeFermionF src1 (FGrid); random(RNG5,src1);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
LatticeFermionF result(FGrid); result=Zero();
LatticeFermionF ref(FGrid); ref=Zero();
LatticeFermionF tmp(FGrid);
LatticeFermionF err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
// LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =100;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src1,result,0);
Dw.Dhop(src,result,0);
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert (norm2(err)< 1.0e-4 );
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
}
}
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
}
}
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e (FrbGrid);
LatticeFermionF r_o (FrbGrid);
LatticeFermionF r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

View File

@ -0,0 +1,90 @@
Branch: develop
Files:
Grid/lattice/PaddedCell.h -- Halo exchange
tests/Test_general_stencil.cc -- test local off axis stencil addressing
tests/debug/Test_padded_cell.cc -- test PaddedCell halo exchange and the General local stencil by computing ALL plaquettes on lattice
Functionality:
-- extend a lattice field:
Grid/lattice/PaddedCell.h
// Constructor
PaddedCell(int _depth,GridCartesian *_grid)
// Expand a field "in" to depth "d"
template<class vobj>
inline Lattice<vobj> Exchange(Lattice<vobj> &in)
// Take the "apple core" of in to a smaller local volume
template<class vobj>
inline Lattice<vobj> Extract(Lattice<vobj> &in)
-- Plaquette test:
tests/debug/Test_padded_cell.cc
/////////////////////////////////////////////////
// Create a padded cell of extra padding depth=1
/////////////////////////////////////////////////
int depth = 1;
PaddedCell Ghost(depth,&GRID);
LatticeGaugeField Ughost = Ghost.Exchange(Umu);
///// Array for the site plaquette
GridBase *GhostGrid = Ughost.Grid();
LatticeComplex gplaq(GhostGrid);
std::vector<Coordinate> shifts;
for(int mu=0;mu<Nd;mu++){
for(int nu=mu+1;nu<Nd;nu++){
// Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x)
Coordinate shift_0(Nd,0);
Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
shifts.push_back(shift_0);
shifts.push_back(shift_mu);
shifts.push_back(shift_nu);
shifts.push_back(shift_0);
}
}
GeneralLocalStencil gStencil(GhostGrid,shifts);
gplaq=Zero();
{
autoView( gp_v , gplaq, CpuWrite);
autoView( t_v , trplaq, CpuRead);
autoView( U_v , Ughost, CpuRead);
for(int ss=0;ss<gp_v.size();ss++){
int s=0;
for(int mu=0;mu<Nd;mu++){
for(int nu=mu+1;nu<Nd;nu++){
auto SE0 = gStencil.GetEntry(s+0,ss);
auto SE1 = gStencil.GetEntry(s+1,ss);
auto SE2 = gStencil.GetEntry(s+2,ss);
auto SE3 = gStencil.GetEntry(s+3,ss);
int o0 = SE0->_offset;
int o1 = SE1->_offset;
int o2 = SE2->_offset;
int o3 = SE3->_offset;
auto U0 = U_v[o0](mu);
auto U1 = U_v[o1](nu);
auto U2 = adj(U_v[o2](mu));
auto U3 = adj(U_v[o3](nu));
gpermute(U0,SE0->_permute);
gpermute(U1,SE1->_permute);
gpermute(U2,SE2->_permute);
gpermute(U3,SE3->_permute);
gp_v[ss]() =gp_v[ss]() + trace( U0*U1*U2*U3 );
s=s+4;
}
}
}
}
cplaq = Ghost.Extract(gplaq);

133
examples/socket_grid.cc Normal file
View File

@ -0,0 +1,133 @@
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
#include <stdio.h>
#include <err.h>
#include <fcntl.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
class UnixSockets {
public:
static void Open(int rank)
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
printf("allocated socket %d\n",sock);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
unlink(sa_un.sun_path);
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
perror("bind failure");
exit(EXIT_FAILURE);
}
printf("bound socket %d to %s\n",sock,sa_un.sun_path);
}
static int RecvFileDescriptor(void)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(sock, &msg, 0)) < 0) {
perror("recvmsg failed");
return -1;
}
if(n == 0){
perror("recvmsg returned 0");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
printf("received fd %d from socket %d\n",fd,sock);
return fd;
}
static void SendFileDescriptor(int fildes,int xmit_to_rank)
{
struct msghdr msg;
struct iovec iov;
struct cmsghdr *cmsg = NULL;
char ctrl[CMSG_SPACE(sizeof(int))];
char data = ' ';
memset(&msg, 0, sizeof(struct msghdr));
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
iov.iov_base = &data;
iov.iov_len = sizeof(data);
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
printf("sending FD %d over socket %d to rank %d AF_UNIX path %s\n",fildes,sock,xmit_to_rank,sock_path);fflush(stdout);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
msg.msg_name = (void *)&sa_un;
msg.msg_namelen = sizeof(sa_un);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_controllen = CMSG_SPACE(sizeof(int));
msg.msg_control = ctrl;
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*((int *) CMSG_DATA(cmsg)) = fildes;
if ( sendmsg(sock, &msg, 0) == -1 ) perror("sendmsg failed");
};
};
int main(int argc, char **argv)
{
int me = fork()?0:1;
UnixSockets::Open(me);
// need MPI barrier
sleep(10);
const char * message = "Hello, World\n";
if( me ) {
int fd = open("foo",O_RDWR|O_CREAT,0666);
if ( fd < 0 ) {
perror("failed to open file");
exit(EXIT_FAILURE);
}
// rank 1 sends ot rank 0
UnixSockets::SendFileDescriptor(fd,0);
close(fd);
} else {
// rank 0 sends receives frmo rank 1
int fd = UnixSockets::RecvFileDescriptor();
write(fd,(const void *)message,strlen(message));
close(fd);
}
}

View File

@ -60,7 +60,7 @@ while test $# -gt 0; do
;;
--cxxflags)
echo @GRID_CXXFLAGS@
echo @GRID_CXXFLAGS@ -I@prefix@/include
;;
--cxx)
@ -72,11 +72,11 @@ while test $# -gt 0; do
;;
--ldflags)
echo @GRID_LDFLAGS@
echo @GRID_LDFLAGS@ -L@prefix@/lib
;;
--libs)
echo @GRID_LIBS@
echo @GRID_LIBS@ -lGrid
;;
--summary)

View File

@ -1,7 +1,7 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=yes \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-accelerator=hip \

View File

@ -5,8 +5,8 @@ module load emacs
#module load gperftools
module load PrgEnv-gnu
module load rocm/5.3.0
module load cray-mpich/8.1.16
#module load cray-mpich/8.1.17
#module load cray-mpich/8.1.16
module load cray-mpich/8.1.17
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a

View File

@ -0,0 +1,44 @@
#!/bin/bash -l
#SBATCH --job-name=bench_lehner
#SBATCH --partition=small-g
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-node=8
#SBATCH --time=00:10:00
#SBATCH --account=project_465000546
#SBATCH --gpu-bind=none
#SBATCH --exclusive
#SBATCH --mem=0
CPU_BIND="map_cpu:48,56,32,40,16,24,1,8"
echo $CPU_BIND
cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3 4 5 6 7)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec \$*
EOF
chmod +x ./select_gpu
root=/scratch/project_465000546/boylepet/Grid/systems/Lumi
source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
for vol in 16.16.16.64 32.32.32.64 32.32.32.128
do
srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
done

View File

@ -0,0 +1,30 @@
spack load c-lime
spack load gmp
spack load mpfr
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
echo clime X$CLIME
echo gmp X$GMP
echo mpfr X$MPFR
../../configure \
--enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--with-fftw=$FFTW_DIR/.. \
--disable-fermion-reps \
--disable-gparity \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp"

5
systems/Lumi/sourceme.sh Normal file
View File

@ -0,0 +1,5 @@
source ~/spack/share/spack/setup-env.sh
module load CrayEnv LUMI/22.12 partition/G cray-fftw/3.3.10.1 rocm
spack load c-lime
spack load gmp
spack load mpfr

53
systems/OEM/README Normal file
View File

@ -0,0 +1,53 @@
1. Prerequisites:
===================
Make sure you have the latest Intel ipcx release loaded (via modules or similar)
Make sure you have SYCL aware MPICH or Intel MPI loaded (assumed as mpicxx)
2. Obtain Grid:
===================
bash$
git clone https://github.com/paboyle/Grid
cd Grid
./bootstrap.sh
cd systems/PVC
3. Build Grid:
===================
Here, configure command is stored in file config-command:
bash$
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-accelerator-cshift \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "
make all
4. Run a benchmark:
===================
*** Assumes interactive access to node. ***
run Benchmark_dwf_fp32 using benchmarks/bench.sh
bash$
cd benchmarks
./bench.sh

18
systems/OEM/benchmarks/bench.sh Executable file
View File

@ -0,0 +1,18 @@
#!/bin/bash
export EnableImplicitScaling=0
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
mpiexec -launcher ssh -n 1 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 1tile.log
mpiexec -launcher ssh -n 2 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 2tile.log
#mpiexec -launcher ssh -n 4 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 16.16.64.64 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 4tile.log
#mpiexec -launcher ssh -n 8 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.4 --grid 16.16.64.128 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 8tile.log

View File

@ -0,0 +1,13 @@
#!/bin/bash
num_tile=2
gpu_id=$(( (MPI_LOCAL_RANKID % num_tile ) ))
tile_id=$((MPI_LOCAL_RANKID / num_tile))
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
echo "local rank $MPI_LOCALRANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"
"$@"

View File

@ -0,0 +1,15 @@
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-accelerator-cshift \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "

3
systems/OEM/setup.sh Normal file
View File

@ -0,0 +1,3 @@
export https_proxy=http://proxy-chain.intel.com:911
module load intel-release
module load intel/mpich

View File

@ -4,7 +4,7 @@
#SBATCH -p QZ1J-ICX-PVC
##SBATCH -p QZ1J-SPR-PVC-2C
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=8

View File

@ -4,7 +4,7 @@
#SBATCH -p QZ1J-ICX-PVC
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=16
@ -19,11 +19,15 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log
for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
do
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i
done
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0

View File

@ -5,10 +5,5 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
#if [ $MPI_LOCALRANKID = "0" ]
#then
# ~psteinbr/build_pti/ze_tracer -c $@
# onetrace --chrome-kernel-timeline $@
#else
$@
#fi

View File

@ -3,8 +3,14 @@ export https_proxy=http://proxy-chain.intel.com:911
export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH
module load intel-release
source /opt/intel/oneapi/PVC_setup.sh
module load intel-comp-rt/embargo-ci-neo
#source /opt/intel/oneapi/PVC_setup.sh
#source /opt/intel/oneapi/ATS_setup.sh
#module load intel-nightly/20230331
#module load intel-comp-rt/ci-neo-master/026093
#module load intel/mpich
module load intel/mpich/pvc45.3
export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH

View File

@ -0,0 +1,46 @@
#!/bin/bash
#PBS -l select=1:system=sunspot,place=scatter
#PBS -A LatticeQCD_aesp_CNDA
#PBS -l walltime=01:00:00
#PBS -N dwf
#PBS -k doe
HDIR=/home/paboyle/
module use /soft/testing/modulefiles/
module load intel-UMD23.05.25593.11/23.05.25593.11
module load tools/pti-gpu
export LD_LIBRARY_PATH=$HDIR/tools/lib64:$LD_LIBRARY_PATH
export PATH=$HDIR/tools/bin:$PATH
export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=3
unset OMP_PLACES
cd $PBS_O_WORKDIR
qsub jobscript.pbs
echo Jobid: $PBS_JOBID
echo Running on host `hostname`
echo Running on nodes `cat $PBS_NODEFILE`
echo NODES
cat $PBS_NODEFILE
NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12 # Number of MPI ranks per node
NDEPTH=4 # Number of hardware threads per rank, spacing between MPI ranks on a node
NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS
NTOTRANKS=$(( NNODES * NRANKS ))
echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"
CMD="mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} -d ${NDEPTH} --cpu-bind=depth -envall \
./gpu_tile_compact.sh \
./Benchmark_dwf_fp32 --mpi 1.1.2.6 --grid 16.32.64.192 --comms-overlap \
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"

View File

@ -0,0 +1,52 @@
#!/bin/bash
display_help() {
echo " Will map gpu tile to rank in compact and then round-robin fashion"
echo " Usage (only work for one node of ATS/PVC):"
echo " mpiexec --np N gpu_tile_compact.sh ./a.out"
echo
echo " Example 3 GPU of 2 Tiles with 7 Ranks:"
echo " 0 Rank 0.0"
echo " 1 Rank 0.1"
echo " 2 Rank 1.0"
echo " 3 Rank 1.1"
echo " 4 Rank 2.0"
echo " 5 Rank 2.1"
echo " 6 Rank 0.0"
echo
echo " Hacked together by apl@anl.gov, please contact if bug found"
exit 1
}
#This give the exact GPU count i915 knows about and I use udev to only enumerate the devices with physical presence.
#works? num_gpu=$(/usr/bin/udevadm info /sys/module/i915/drivers/pci\:i915/* |& grep -v Unknown | grep -c "P: /devices")
num_gpu=6
num_tile=2
if [ "$#" -eq 0 ] || [ "$1" == "--help" ] || [ "$1" == "-h" ] || [ "$num_gpu" = 0 ]; then
display_help
fi
gpu_id=$(( (PALS_LOCAL_RANKID / num_tile ) % num_gpu ))
tile_id=$((PALS_LOCAL_RANKID % num_tile))
unset EnableWalkerPartition
export EnableImplicitScaling=0
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"
if [ $PALS_LOCAL_RANKID = 0 ]
then
onetrace --chrome-device-timeline "$@"
# "$@"
else
"$@"
fi

View File

@ -0,0 +1,16 @@
TOOLS=$HOME/tools
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-accelerator-cshift \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-unified=no \
MPICXX=mpicxx \
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -lapmidg -L$TOOLS/lib64/" \
CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -I$TOOLS/include"

View File

@ -1,2 +1,4 @@
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=yes
BREW=/opt/local/
MPICXX=mpicxx CXX=c++-12 ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug

View File

@ -73,12 +73,12 @@ int main (int argc, char ** argv)
RealD M5 =1.8;
std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
std::cout<<GridLogMessage <<"DomainWallFermion vectorised test"<<std::endl;
std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
std::vector<Complex> boundary = {1,1,1,-1};
DomainWallFermionD::ImplParams Params(boundary);
Coordinate Dirichlet({0,8,8,16,32});
Params.dirichlet=Dirichlet;
// Coordinate Dirichlet({0,8,8,16,32});
// Params.dirichlet=Dirichlet;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params);
TestWhat<DomainWallFermionD>(Ddwf,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);

Some files were not shown because too many files have changed in this diff Show More