mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-24 02:32:02 +01:00
Compare commits
145 Commits
e1c326558a
...
debug-crus
Author | SHA1 | Date | |
---|---|---|---|
bbec7f9fa9 | |||
3aa43e6065 | |||
78ac4044ff | |||
119c3db47f | |||
21bbdb8fc2 | |||
739bd7572c | |||
074627a5bd | |||
6a23b2c599 | |||
bd891fb3f5 | |||
3984265851 | |||
45361d188f | |||
80c9d77e02 | |||
3aff64dddb | |||
b4f2ca81ff | |||
d1dea5f840 | |||
54f8b84d16 | |||
da503fef0e | |||
4a6802098a | |||
f9b41a84d2 | |||
5d7e0d18b9 | |||
9e64387933 | |||
983b681d46 | |||
4072408b6f | |||
bd76b47fbf | |||
18ce23aa75 | |||
ffa7fe0cc2 | |||
6b979f0a69 | |||
86dac5ff4f | |||
4a382fad3f | |||
cc753670d9 | |||
cc9d88ea1c | |||
b281b0166e | |||
6a21f694ff | |||
fc4db5e963 | |||
6252ffaf76 | |||
af64c1c6b6 | |||
866f48391a | |||
a4df527d74 | |||
5764d21161 | |||
496d04cd85 | |||
10e6d7c6ce | |||
c42e25e5b8 | |||
a00ae981e0 | |||
58e020b62a | |||
a7e1aceeca | |||
7212432f43 | |||
4a261fab30 | |||
6af97069b9 | |||
5068413cdb | |||
71c6960eea | |||
ddf6d5c9e3 | |||
39214702f6 | |||
3e4614c63a | |||
900e01f49b | |||
2376156fbc | |||
3f2fd49db4 | |||
0efa107cb6 | |||
8feedb4f6f | |||
05e562e3d7 | |||
dd3bbb8fa2 | |||
2fbcf13c46 | |||
4ea48ef0c4 | |||
5c85774ee3 | |||
d8a9a745d8 | |||
dcf172da3b | |||
d57ed25071 | |||
546be724e7 | |||
8a1b9073f9 | |||
1a7114d4b9 | |||
3f385f717c | |||
481bbaf1fc | |||
281488611a | |||
c180a52518 | |||
90130e25e9 | |||
23298acb81 | |||
52384e34cf | |||
d0bb033ea2 | |||
c6621806ca | |||
0b6f0f6d2f | |||
b5b759df73 | |||
7db8dd7a95 | |||
8b43be39c0 | |||
f17f879206 | |||
68428fceab | |||
4135f2dcd1 | |||
c5bdf61215 | |||
88e218e8ee | |||
0f2b786436 | |||
bae0f8ea99 | |||
bbbcd36ae5 | |||
39c0815d9e | |||
a3e935c902 | |||
7731c7db8e | |||
ff97340324 | |||
83d86943db | |||
e82cf1d311 | |||
1db58a8acc | |||
920a51438d | |||
be528b6d27 | |||
796abfad80 | |||
ad0270ac8c | |||
7d62f1d6d2 | |||
458c943987 | |||
88015b0858 | |||
4ca1bf7cca | |||
2ff868f7a5 | |||
ede02b6883 | |||
1822ced302 | |||
37ba32776f | |||
99b3697b03 | |||
43a45ec97b | |||
b00a4142e5 | |||
3791bc527b | |||
d8c29f5fcf | |||
281f8101fe | |||
dc747c54be | |||
07acfe89f2 | |||
40234f531f | |||
d49694f38f | |||
dc6a38f177 | |||
82c1ecf60f | |||
97a098636d | |||
e13930c8b2 | |||
0655dab466 | |||
7f097bcc28 | |||
5c75aa5008 | |||
1873101362 | |||
63fd1dfa62 | |||
bd68861b28 | |||
82e959f66c | |||
62e52de06d | |||
184adeedb8 | |||
5fa6a8b96d | |||
a2a879b668 | |||
9317d893b2 | |||
86075fdd45 | |||
b36442e263 | |||
513d797ea6 | |||
9e4835a3e3 | |||
477ebf24f4 | |||
0d5639f707 | |||
413312f9a9 | |||
03508448f8 | |||
e1e5c75023 | |||
9296299b61 |
54
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
Normal file
54
.github/ISSUE_TEMPLATE/bug-report.yml
vendored
Normal file
@ -0,0 +1,54 @@
|
||||
name: Bug report
|
||||
description: Report a bug.
|
||||
title: "<insert title>"
|
||||
labels: [bug]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thank you for taking the time to file a bug report.
|
||||
Please check that the code is pointing to the HEAD of develop
|
||||
or any commit in master which is tagged with a version number.
|
||||
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: "Describe the issue:"
|
||||
description: >
|
||||
Describe the issue and any previous attempt to solve it.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: "Code example:"
|
||||
description: >
|
||||
If relevant, show how to reproduce the issue using a minimal working
|
||||
example.
|
||||
placeholder: |
|
||||
<< your code here >>
|
||||
render: shell
|
||||
validations:
|
||||
required: false
|
||||
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: "Target platform:"
|
||||
description: >
|
||||
Give a description of the target platform (CPU, network, compiler).
|
||||
Please give the full CPU part description, using for example
|
||||
`cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
|
||||
or `sysctl machdep.cpu.brand_string` (macOS) and the full output
|
||||
of the `--version` option of your compiler.
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: "Configure options:"
|
||||
description: >
|
||||
Please give the exact configure command used and attach
|
||||
`config.log`, `grid.config.summary` and the output of `make V=1`.
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
@ -45,7 +45,7 @@ directory
|
||||
//disables nvcc specific warning in json.hpp
|
||||
#pragma clang diagnostic ignored "-Wdeprecated-register"
|
||||
|
||||
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
|
||||
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
|
||||
//disables nvcc specific warning in json.hpp
|
||||
#pragma nv_diag_suppress unsigned_compare_with_zero
|
||||
#pragma nv_diag_suppress cast_to_qualified_type
|
||||
|
@ -14,7 +14,7 @@
|
||||
/* NVCC save and restore compile environment*/
|
||||
#ifdef __NVCC__
|
||||
#pragma push
|
||||
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
|
||||
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
|
||||
#pragma nv_diag_suppress code_is_unreachable
|
||||
#else
|
||||
#pragma diag_suppress code_is_unreachable
|
||||
|
@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB);
|
||||
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
|
||||
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
|
||||
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
|
||||
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
|
||||
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
|
||||
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
|
||||
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
|
||||
|
@ -542,6 +542,7 @@ public:
|
||||
(*this)(in[i], out[i]);
|
||||
}
|
||||
}
|
||||
virtual ~LinearFunction(){};
|
||||
};
|
||||
|
||||
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
|
||||
|
@ -191,7 +191,7 @@ public:
|
||||
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
|
||||
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
|
||||
|
||||
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
||||
|
||||
|
@ -109,6 +109,9 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
|
||||
|
||||
precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
|
||||
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
|
||||
|
||||
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
|
||||
//Compute double precision rsd and also new RHS vector.
|
||||
Linop_d.HermOp(sol_d, tmp_d);
|
||||
@ -123,7 +126,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
||||
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(src_f, src_d);
|
||||
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
|
||||
PrecChangeTimer.Stop();
|
||||
|
||||
sol_f = Zero();
|
||||
@ -142,7 +145,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
//Convert sol back to double and add to double prec solution
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(tmp_d, sol_f);
|
||||
precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
|
||||
PrecChangeTimer.Stop();
|
||||
|
||||
axpy(sol_d, 1.0, tmp_d, sol_d);
|
||||
|
213
Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Normal file
213
Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Normal file
@ -0,0 +1,213 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
|
||||
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
//Mixed precision restarted defect correction CG
|
||||
template<class FieldD,class FieldF,
|
||||
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
|
||||
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
|
||||
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
|
||||
public:
|
||||
using LinearFunction<FieldD>::operator();
|
||||
RealD Tolerance;
|
||||
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
|
||||
Integer MaxInnerIterations;
|
||||
Integer MaxOuterIterations;
|
||||
Integer MaxPatchupIterations;
|
||||
GridBase* SinglePrecGrid; //Grid for single-precision fields
|
||||
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
|
||||
LinearOperatorBase<FieldF> &Linop_f;
|
||||
LinearOperatorBase<FieldD> &Linop_d;
|
||||
|
||||
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
|
||||
LinearFunction<FieldF> *guesser;
|
||||
bool updateResidual;
|
||||
|
||||
MixedPrecisionConjugateGradientBatched(RealD tol,
|
||||
Integer maxinnerit,
|
||||
Integer maxouterit,
|
||||
Integer maxpatchit,
|
||||
GridBase* _sp_grid,
|
||||
LinearOperatorBase<FieldF> &_Linop_f,
|
||||
LinearOperatorBase<FieldD> &_Linop_d,
|
||||
bool _updateResidual=true) :
|
||||
Linop_f(_Linop_f), Linop_d(_Linop_d),
|
||||
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
|
||||
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
|
||||
|
||||
void useGuesser(LinearFunction<FieldF> &g){
|
||||
guesser = &g;
|
||||
}
|
||||
|
||||
void operator() (const FieldD &src_d_in, FieldD &sol_d){
|
||||
std::vector<FieldD> srcs_d_in{src_d_in};
|
||||
std::vector<FieldD> sols_d{sol_d};
|
||||
|
||||
(*this)(srcs_d_in,sols_d);
|
||||
|
||||
sol_d = sols_d[0];
|
||||
}
|
||||
|
||||
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
|
||||
assert(src_d_in.size() == sol_d.size());
|
||||
int NBatch = src_d_in.size();
|
||||
|
||||
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
|
||||
|
||||
Integer TotalOuterIterations = 0; //Number of restarts
|
||||
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
|
||||
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
|
||||
|
||||
GridStopWatch TotalTimer;
|
||||
TotalTimer.Start();
|
||||
|
||||
GridStopWatch InnerCGtimer;
|
||||
GridStopWatch PrecChangeTimer;
|
||||
|
||||
int cb = src_d_in[0].Checkerboard();
|
||||
|
||||
std::vector<RealD> src_norm;
|
||||
std::vector<RealD> norm;
|
||||
std::vector<RealD> stop;
|
||||
|
||||
GridBase* DoublePrecGrid = src_d_in[0].Grid();
|
||||
FieldD tmp_d(DoublePrecGrid);
|
||||
tmp_d.Checkerboard() = cb;
|
||||
|
||||
FieldD tmp2_d(DoublePrecGrid);
|
||||
tmp2_d.Checkerboard() = cb;
|
||||
|
||||
std::vector<FieldD> src_d;
|
||||
std::vector<FieldF> src_f;
|
||||
std::vector<FieldF> sol_f;
|
||||
|
||||
for (int i=0; i<NBatch; i++) {
|
||||
sol_d[i].Checkerboard() = cb;
|
||||
|
||||
src_norm.push_back(norm2(src_d_in[i]));
|
||||
norm.push_back(0.);
|
||||
stop.push_back(src_norm[i] * Tolerance*Tolerance);
|
||||
|
||||
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
|
||||
|
||||
src_f.push_back(SinglePrecGrid);
|
||||
src_f[i].Checkerboard() = cb;
|
||||
|
||||
sol_f.push_back(SinglePrecGrid);
|
||||
sol_f[i].Checkerboard() = cb;
|
||||
}
|
||||
|
||||
RealD inner_tol = InnerTolerance;
|
||||
|
||||
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
|
||||
CG_f.ErrorOnNoConverge = false;
|
||||
|
||||
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
|
||||
|
||||
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
|
||||
|
||||
bool allConverged = true;
|
||||
|
||||
for (int i=0; i<NBatch; i++) {
|
||||
//Compute double precision rsd and also new RHS vector.
|
||||
Linop_d.HermOp(sol_d[i], tmp_d);
|
||||
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
|
||||
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
|
||||
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(src_f[i], src_d[i]);
|
||||
PrecChangeTimer.Stop();
|
||||
|
||||
sol_f[i] = Zero();
|
||||
|
||||
if(norm[i] > OuterLoopNormMult * stop[i]) {
|
||||
allConverged = false;
|
||||
}
|
||||
}
|
||||
if (allConverged) break;
|
||||
|
||||
if (updateResidual) {
|
||||
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
|
||||
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
|
||||
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
||||
CG_f.Tolerance = inner_tol;
|
||||
}
|
||||
|
||||
//Optionally improve inner solver guess (eg using known eigenvectors)
|
||||
if(guesser != NULL) {
|
||||
(*guesser)(src_f, sol_f);
|
||||
}
|
||||
|
||||
for (int i=0; i<NBatch; i++) {
|
||||
//Inner CG
|
||||
InnerCGtimer.Start();
|
||||
CG_f(Linop_f, src_f[i], sol_f[i]);
|
||||
InnerCGtimer.Stop();
|
||||
TotalInnerIterations[i] += CG_f.IterationsToComplete;
|
||||
|
||||
//Convert sol back to double and add to double prec solution
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(tmp_d, sol_f[i]);
|
||||
PrecChangeTimer.Stop();
|
||||
|
||||
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//Final trial CG
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
|
||||
|
||||
for (int i=0; i<NBatch; i++) {
|
||||
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
|
||||
CG_d(Linop_d, src_d_in[i], sol_d[i]);
|
||||
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
|
||||
}
|
||||
|
||||
TotalTimer.Stop();
|
||||
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
for (int i=0; i<NBatch; i++) {
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
|
||||
}
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
373
Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
Normal file
373
Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
Normal file
@ -0,0 +1,373 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Christopher Kelly <ckelly@bnl.gov>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
|
||||
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
|
||||
//Every update_freq iterations the residual is corrected in double precision.
|
||||
//For safety a final regular CG is applied to clean up if necessary
|
||||
|
||||
//PB Pure single, then double fixup
|
||||
|
||||
template<class FieldD, class FieldF,
|
||||
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
|
||||
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
|
||||
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
|
||||
public OperatorFunction<FieldD>
|
||||
{
|
||||
public:
|
||||
|
||||
using OperatorFunction<FieldD>::operator();
|
||||
|
||||
RealD Tolerance;
|
||||
Integer MaxIterationsMshift;
|
||||
Integer MaxIterations;
|
||||
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
|
||||
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
|
||||
int verbose;
|
||||
MultiShiftFunction shifts;
|
||||
std::vector<RealD> TrueResidualShift;
|
||||
|
||||
int ReliableUpdateFreq; //number of iterations between reliable updates
|
||||
|
||||
GridBase* SinglePrecGrid; //Grid for single-precision fields
|
||||
LinearOperatorBase<FieldF> &Linop_f; //single precision
|
||||
|
||||
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
|
||||
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
|
||||
int _ReliableUpdateFreq) :
|
||||
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
|
||||
MaxIterations(20000)
|
||||
{
|
||||
verbose=1;
|
||||
IterationsToCompleteShift.resize(_shifts.order);
|
||||
TrueResidualShift.resize(_shifts.order);
|
||||
}
|
||||
|
||||
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
|
||||
{
|
||||
GridBase *grid = src.Grid();
|
||||
int nshift = shifts.order;
|
||||
std::vector<FieldD> results(nshift,grid);
|
||||
(*this)(Linop,src,results,psi);
|
||||
}
|
||||
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
|
||||
{
|
||||
int nshift = shifts.order;
|
||||
|
||||
(*this)(Linop,src,results);
|
||||
|
||||
psi = shifts.norm*src;
|
||||
for(int i=0;i<nshift;i++){
|
||||
psi = psi + shifts.residues[i]*results[i];
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
|
||||
{
|
||||
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
|
||||
GridBase *DoublePrecGrid = src_d.Grid();
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Convenience references to the info stored in "MultiShiftFunction"
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
int nshift = shifts.order;
|
||||
|
||||
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
|
||||
std::vector<RealD> &mresidual(shifts.tolerances);
|
||||
std::vector<RealD> alpha(nshift,1.0);
|
||||
|
||||
//Double precision search directions
|
||||
FieldD p_d(DoublePrecGrid);
|
||||
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
|
||||
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
|
||||
|
||||
FieldD tmp_d(DoublePrecGrid);
|
||||
FieldD r_d(DoublePrecGrid);
|
||||
FieldF r_f(SinglePrecGrid);
|
||||
FieldD mmp_d(DoublePrecGrid);
|
||||
|
||||
assert(psi_d.size()==nshift);
|
||||
assert(mass.size()==nshift);
|
||||
assert(mresidual.size()==nshift);
|
||||
|
||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||
RealD bs[nshift];
|
||||
RealD rsq[nshift];
|
||||
RealD rsqf[nshift];
|
||||
RealD z[nshift][2];
|
||||
int converged[nshift];
|
||||
|
||||
const int primary =0;
|
||||
|
||||
//Primary shift fields CG iteration
|
||||
RealD a,b,c,d;
|
||||
RealD cp,bp,qq; //prev
|
||||
|
||||
// Matrix mult fields
|
||||
FieldF p_f(SinglePrecGrid);
|
||||
FieldF mmp_f(SinglePrecGrid);
|
||||
|
||||
// Check lightest mass
|
||||
for(int s=0;s<nshift;s++){
|
||||
assert( mass[s]>= mass[primary] );
|
||||
converged[s]=0;
|
||||
}
|
||||
|
||||
// Wire guess to zero
|
||||
// Residuals "r" are src
|
||||
// First search direction "p" is also src
|
||||
cp = norm2(src_d);
|
||||
|
||||
// Handle trivial case of zero src.
|
||||
if( cp == 0. ){
|
||||
for(int s=0;s<nshift;s++){
|
||||
psi_d[s] = Zero();
|
||||
psi_f[s] = Zero();
|
||||
IterationsToCompleteShift[s] = 1;
|
||||
TrueResidualShift[s] = 0.;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for(int s=0;s<nshift;s++){
|
||||
rsq[s] = cp * mresidual[s] * mresidual[s];
|
||||
rsqf[s] =rsq[s];
|
||||
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
|
||||
// ps_d[s] = src_d;
|
||||
precisionChange(ps_f[s],src_d);
|
||||
}
|
||||
// r and p for primary
|
||||
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
|
||||
r_d = p_d;
|
||||
|
||||
//MdagM+m[0]
|
||||
precisionChange(p_f,p_d);
|
||||
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
|
||||
precisionChange(tmp_d,mmp_f);
|
||||
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
|
||||
tmp_d = tmp_d - mmp_d;
|
||||
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
|
||||
// assert(norm2(tmp_d)< 1.0e-4);
|
||||
|
||||
axpy(mmp_d,mass[0],p_d,mmp_d);
|
||||
RealD rn = norm2(p_d);
|
||||
d += rn*mass[0];
|
||||
|
||||
b = -cp /d;
|
||||
|
||||
// Set up the various shift variables
|
||||
int iz=0;
|
||||
z[0][1-iz] = 1.0;
|
||||
z[0][iz] = 1.0;
|
||||
bs[0] = b;
|
||||
for(int s=1;s<nshift;s++){
|
||||
z[s][1-iz] = 1.0;
|
||||
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
|
||||
bs[s] = b*z[s][iz];
|
||||
}
|
||||
|
||||
// r += b[0] A.p[0]
|
||||
// c= norm(r)
|
||||
c=axpy_norm(r_d,b,mmp_d,r_d);
|
||||
|
||||
for(int s=0;s<nshift;s++) {
|
||||
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
|
||||
precisionChange(psi_f[s],psi_d[s]);
|
||||
}
|
||||
|
||||
///////////////////////////////////////
|
||||
// Timers
|
||||
///////////////////////////////////////
|
||||
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
|
||||
|
||||
SolverTimer.Start();
|
||||
|
||||
// Iteration loop
|
||||
int k;
|
||||
|
||||
for (k=1;k<=MaxIterationsMshift;k++){
|
||||
|
||||
a = c /cp;
|
||||
AXPYTimer.Start();
|
||||
axpy(p_d,a,p_d,r_d);
|
||||
AXPYTimer.Stop();
|
||||
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(r_f, r_d);
|
||||
PrecChangeTimer.Stop();
|
||||
|
||||
AXPYTimer.Start();
|
||||
for(int s=0;s<nshift;s++){
|
||||
if ( ! converged[s] ) {
|
||||
if (s==0){
|
||||
axpy(ps_f[s],a,ps_f[s],r_f);
|
||||
} else{
|
||||
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
|
||||
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
|
||||
}
|
||||
}
|
||||
}
|
||||
AXPYTimer.Stop();
|
||||
|
||||
cp=c;
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(p_f, p_d); //get back single prec search direction for linop
|
||||
PrecChangeTimer.Stop();
|
||||
MatrixTimer.Start();
|
||||
Linop_f.HermOp(p_f,mmp_f);
|
||||
MatrixTimer.Stop();
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(mmp_d, mmp_f); // From Float to Double
|
||||
PrecChangeTimer.Stop();
|
||||
|
||||
d=real(innerProduct(p_d,mmp_d));
|
||||
axpy(mmp_d,mass[0],p_d,mmp_d);
|
||||
RealD rn = norm2(p_d);
|
||||
d += rn*mass[0];
|
||||
|
||||
bp=b;
|
||||
b=-cp/d;
|
||||
|
||||
// Toggle the recurrence history
|
||||
bs[0] = b;
|
||||
iz = 1-iz;
|
||||
ShiftTimer.Start();
|
||||
for(int s=1;s<nshift;s++){
|
||||
if((!converged[s])){
|
||||
RealD z0 = z[s][1-iz];
|
||||
RealD z1 = z[s][iz];
|
||||
z[s][iz] = z0*z1*bp
|
||||
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
|
||||
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
|
||||
}
|
||||
}
|
||||
ShiftTimer.Stop();
|
||||
|
||||
//Update single precision solutions
|
||||
AXPYTimer.Start();
|
||||
for(int s=0;s<nshift;s++){
|
||||
int ss = s;
|
||||
if( (!converged[s]) ) {
|
||||
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
|
||||
}
|
||||
}
|
||||
c = axpy_norm(r_d,b,mmp_d,r_d);
|
||||
AXPYTimer.Stop();
|
||||
|
||||
// Convergence checks
|
||||
int all_converged = 1;
|
||||
for(int s=0;s<nshift;s++){
|
||||
|
||||
if ( (!converged[s]) ){
|
||||
IterationsToCompleteShift[s] = k;
|
||||
|
||||
RealD css = c * z[s][iz]* z[s][iz];
|
||||
|
||||
if(css<rsqf[s]){
|
||||
if ( ! converged[s] )
|
||||
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
|
||||
converged[s]=1;
|
||||
} else {
|
||||
all_converged=0;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if ( all_converged || k == MaxIterationsMshift-1){
|
||||
|
||||
SolverTimer.Stop();
|
||||
|
||||
for(int s=0;s<nshift;s++){
|
||||
precisionChange(psi_d[s],psi_f[s]);
|
||||
}
|
||||
|
||||
|
||||
if ( all_converged ){
|
||||
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
|
||||
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
|
||||
} else {
|
||||
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
|
||||
}
|
||||
|
||||
// Check answers
|
||||
for(int s=0; s < nshift; s++) {
|
||||
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
|
||||
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
|
||||
axpy(r_d,-alpha[s],src_d,tmp_d);
|
||||
RealD rn = norm2(r_d);
|
||||
RealD cn = norm2(src_d);
|
||||
TrueResidualShift[s] = std::sqrt(rn/cn);
|
||||
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
|
||||
|
||||
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
|
||||
if(rn >= rsq[s]){
|
||||
CleanupTimer.Start();
|
||||
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
|
||||
|
||||
//Setup linear operators for final cleanup
|
||||
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
|
||||
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
|
||||
|
||||
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
|
||||
cg(src_d, psi_d[s]);
|
||||
|
||||
TrueResidualShift[s] = cg.TrueResidual;
|
||||
CleanupTimer.Stop();
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
|
||||
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
|
||||
|
||||
IterationsToComplete = k;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
|
||||
assert(0);
|
||||
}
|
||||
|
||||
};
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -81,6 +81,7 @@ public:
|
||||
using OperatorFunction<FieldD>::operator();
|
||||
|
||||
RealD Tolerance;
|
||||
Integer MaxIterationsMshift;
|
||||
Integer MaxIterations;
|
||||
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
|
||||
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
|
||||
@ -95,9 +96,9 @@ public:
|
||||
|
||||
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
|
||||
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
|
||||
int _ReliableUpdateFreq
|
||||
) :
|
||||
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
|
||||
int _ReliableUpdateFreq) :
|
||||
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
|
||||
MaxIterations(20000)
|
||||
{
|
||||
verbose=1;
|
||||
IterationsToCompleteShift.resize(_shifts.order);
|
||||
@ -130,6 +131,9 @@ public:
|
||||
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
|
||||
GridBase *DoublePrecGrid = src_d.Grid();
|
||||
|
||||
precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
|
||||
precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Convenience references to the info stored in "MultiShiftFunction"
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
@ -200,14 +204,14 @@ public:
|
||||
r_d = p_d;
|
||||
|
||||
//MdagM+m[0]
|
||||
precisionChangeFast(p_f,p_d);
|
||||
precisionChange(p_f, p_d, pc_wk_d_to_s);
|
||||
|
||||
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
|
||||
precisionChangeFast(tmp_d,mmp_f);
|
||||
precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
|
||||
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
|
||||
tmp_d = tmp_d - mmp_d;
|
||||
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
|
||||
// assert(norm2(tmp_d)< 1.0e-4);
|
||||
assert(norm2(tmp_d)< 1.0);
|
||||
|
||||
axpy(mmp_d,mass[0],p_d,mmp_d);
|
||||
RealD rn = norm2(p_d);
|
||||
@ -244,7 +248,7 @@ public:
|
||||
// Iteration loop
|
||||
int k;
|
||||
|
||||
for (k=1;k<=MaxIterations;k++){
|
||||
for (k=1;k<=MaxIterationsMshift;k++){
|
||||
|
||||
a = c /cp;
|
||||
AXPYTimer.Start();
|
||||
@ -263,7 +267,7 @@ public:
|
||||
AXPYTimer.Stop();
|
||||
|
||||
PrecChangeTimer.Start();
|
||||
precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
|
||||
precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
|
||||
PrecChangeTimer.Stop();
|
||||
|
||||
cp=c;
|
||||
@ -272,7 +276,7 @@ public:
|
||||
MatrixTimer.Stop();
|
||||
|
||||
PrecChangeTimer.Start();
|
||||
precisionChangeFast(mmp_d, mmp_f); // From Float to Double
|
||||
precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
|
||||
PrecChangeTimer.Stop();
|
||||
|
||||
AXPYTimer.Start();
|
||||
@ -350,11 +354,16 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if ( all_converged ){
|
||||
if ( all_converged || k == MaxIterationsMshift-1){
|
||||
|
||||
SolverTimer.Stop();
|
||||
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
|
||||
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
|
||||
|
||||
if ( all_converged ){
|
||||
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
|
||||
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
|
||||
} else {
|
||||
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
|
||||
}
|
||||
|
||||
// Check answers
|
||||
for(int s=0; s < nshift; s++) {
|
||||
@ -397,11 +406,9 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
// ugly hack
|
||||
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
|
||||
// assert(0);
|
||||
assert(0);
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -48,7 +48,7 @@ public:
|
||||
LinearOperatorBase<FieldF> &Linop_f;
|
||||
LinearOperatorBase<FieldD> &Linop_d;
|
||||
GridBase* SinglePrecGrid;
|
||||
RealD Delta; //reliable update parameter
|
||||
RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
|
||||
|
||||
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
|
||||
LinearOperatorBase<FieldF> *Linop_fallback;
|
||||
@ -65,7 +65,9 @@ public:
|
||||
ErrorOnNoConverge(err_on_no_conv),
|
||||
DoFinalCleanup(true),
|
||||
Linop_fallback(NULL)
|
||||
{};
|
||||
{
|
||||
assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
|
||||
};
|
||||
|
||||
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
|
||||
Linop_fallback = &_Linop_fallback;
|
||||
@ -116,9 +118,12 @@ public:
|
||||
}
|
||||
|
||||
//Single prec initialization
|
||||
precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
|
||||
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
|
||||
|
||||
FieldF r_f(SinglePrecGrid);
|
||||
r_f.Checkerboard() = r.Checkerboard();
|
||||
precisionChange(r_f, r);
|
||||
precisionChange(r_f, r, pc_wk_dp_to_sp);
|
||||
|
||||
FieldF psi_f(r_f);
|
||||
psi_f = Zero();
|
||||
@ -134,6 +139,7 @@ public:
|
||||
GridStopWatch LinalgTimer;
|
||||
GridStopWatch MatrixTimer;
|
||||
GridStopWatch SolverTimer;
|
||||
GridStopWatch PrecChangeTimer;
|
||||
|
||||
SolverTimer.Start();
|
||||
int k = 0;
|
||||
@ -173,7 +179,9 @@ public:
|
||||
// Stopping condition
|
||||
if (cp <= rsq) {
|
||||
//Although not written in the paper, I assume that I have to add on the final solution
|
||||
precisionChange(mmp, psi_f);
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
|
||||
PrecChangeTimer.Stop();
|
||||
psi = psi + mmp;
|
||||
|
||||
|
||||
@ -194,6 +202,9 @@ public:
|
||||
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
|
||||
|
||||
|
||||
IterationsToComplete = k;
|
||||
ReliableUpdatesPerformed = l;
|
||||
@ -214,14 +225,21 @@ public:
|
||||
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
|
||||
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
|
||||
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
|
||||
precisionChange(mmp, psi_f);
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
|
||||
PrecChangeTimer.Stop();
|
||||
psi = psi + mmp;
|
||||
|
||||
MatrixTimer.Start();
|
||||
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
|
||||
MatrixTimer.Stop();
|
||||
|
||||
r = src - mmp;
|
||||
|
||||
psi_f = Zero();
|
||||
precisionChange(r_f, r);
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(r_f, r, pc_wk_dp_to_sp);
|
||||
PrecChangeTimer.Stop();
|
||||
cp = norm2(r);
|
||||
MaxResidSinceLastRelUp = cp;
|
||||
|
||||
|
1412
Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
Normal file
1412
Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*Allocation types, saying which pointer cache should be used*/
|
||||
#define Cpu (0)
|
||||
#define CpuSmall (1)
|
||||
#define Acc (2)
|
||||
#define AccSmall (3)
|
||||
#define Shared (4)
|
||||
#define SharedSmall (5)
|
||||
#define CpuHuge (1)
|
||||
#define CpuSmall (2)
|
||||
#define Acc (3)
|
||||
#define AccHuge (4)
|
||||
#define AccSmall (5)
|
||||
#define Shared (6)
|
||||
#define SharedHuge (7)
|
||||
#define SharedSmall (8)
|
||||
#undef GRID_MM_VERBOSE
|
||||
uint64_t total_shared;
|
||||
uint64_t total_device;
|
||||
@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)
|
||||
|
||||
}
|
||||
|
||||
uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
|
||||
uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Data tables for recently freed pointer caches
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
|
||||
int MemoryManager::Victim[MemoryManager::NallocType];
|
||||
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
|
||||
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
|
||||
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Actual allocation and deallocation utils
|
||||
@ -170,6 +176,16 @@ void MemoryManager::Init(void)
|
||||
}
|
||||
}
|
||||
|
||||
str= getenv("GRID_ALLOC_NCACHE_HUGE");
|
||||
if ( str ) {
|
||||
Nc = atoi(str);
|
||||
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
|
||||
Ncache[CpuHuge]=Nc;
|
||||
Ncache[AccHuge]=Nc;
|
||||
Ncache[SharedHuge]=Nc;
|
||||
}
|
||||
}
|
||||
|
||||
str= getenv("GRID_ALLOC_NCACHE_SMALL");
|
||||
if ( str ) {
|
||||
Nc = atoi(str);
|
||||
@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) {
|
||||
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
|
||||
#ifdef ALLOCATION_CACHE
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
|
||||
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
|
||||
#endif
|
||||
|
||||
#ifdef GRID_UVM
|
||||
@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) {
|
||||
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
|
||||
{
|
||||
#ifdef ALLOCATION_CACHE
|
||||
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
|
||||
int cache = type + small;
|
||||
int cache;
|
||||
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
|
||||
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
|
||||
else cache = type;
|
||||
|
||||
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
|
||||
#else
|
||||
return ptr;
|
||||
@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
|
||||
|
||||
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
|
||||
{
|
||||
assert(ncache>0);
|
||||
#ifdef GRID_OMP
|
||||
assert(omp_in_parallel()==0);
|
||||
#endif
|
||||
|
||||
if (ncache == 0) return ptr;
|
||||
|
||||
void * ret = NULL;
|
||||
int v = -1;
|
||||
|
||||
@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
|
||||
void *MemoryManager::Lookup(size_t bytes,int type)
|
||||
{
|
||||
#ifdef ALLOCATION_CACHE
|
||||
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
|
||||
int cache = type+small;
|
||||
int cache;
|
||||
if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
|
||||
else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
|
||||
else cache = type;
|
||||
|
||||
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
|
||||
#else
|
||||
return NULL;
|
||||
@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
|
||||
|
||||
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
|
||||
{
|
||||
assert(ncache>0);
|
||||
#ifdef GRID_OMP
|
||||
assert(omp_in_parallel()==0);
|
||||
#endif
|
||||
|
@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Move control to configure.ac and Config.h?
|
||||
|
||||
#define GRID_ALLOC_SMALL_LIMIT (4096)
|
||||
#define GRID_ALLOC_HUGE_LIMIT (2147483648)
|
||||
|
||||
#define STRINGIFY(x) #x
|
||||
#define TOSTRING(x) STRINGIFY(x)
|
||||
@ -70,6 +71,21 @@ enum ViewMode {
|
||||
CpuWriteDiscard = 0x10 // same for now
|
||||
};
|
||||
|
||||
// Snapshot of the MemoryManager's device-side accounting counters.
// Filled field-by-field from the MemoryManager static counters of the same
// names by MemoryManager::GetFootprint().
struct MemoryStatus {
|
||||
uint64_t DeviceBytes; // bytes currently allocated on the device
|
||||
uint64_t DeviceLRUBytes; // bytes on the device that are evictable (held in the LRU queue)
|
||||
uint64_t DeviceMaxBytes; // maximum number of bytes allowed on the device
|
||||
uint64_t HostToDeviceBytes; // total bytes transferred host -> device
|
||||
uint64_t DeviceToHostBytes; // total bytes transferred device -> host
|
||||
uint64_t HostToDeviceXfer; // number of host -> device transfers
|
||||
uint64_t DeviceToHostXfer; // number of device -> host transfers
|
||||
uint64_t DeviceEvictions; // number of evictions from the device (incremented in Evict)
|
||||
uint64_t DeviceDestroy; // number of device copies freed on discard (incremented in AccDiscard)
|
||||
uint64_t DeviceAllocCacheBytes; // value of MemoryManager::DeviceCacheBytes() at snapshot time
|
||||
uint64_t HostAllocCacheBytes; // value of MemoryManager::HostCacheBytes() at snapshot time
|
||||
};
|
||||
|
||||
|
||||
class MemoryManager {
|
||||
private:
|
||||
|
||||
@ -83,7 +99,7 @@ private:
|
||||
} AllocationCacheEntry;
|
||||
|
||||
static const int NallocCacheMax=128;
|
||||
static const int NallocType=6;
|
||||
static const int NallocType=9;
|
||||
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
|
||||
static int Victim[NallocType];
|
||||
static int Ncache[NallocType];
|
||||
@ -97,8 +113,8 @@ private:
|
||||
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
|
||||
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
|
||||
|
||||
static void PrintBytes(void);
|
||||
public:
|
||||
static void PrintBytes(void);
|
||||
static void Audit(std::string s);
|
||||
static void Init(void);
|
||||
static void InitMessage(void);
|
||||
@ -119,6 +135,27 @@ private:
|
||||
static uint64_t DeviceToHostBytes;
|
||||
static uint64_t HostToDeviceXfer;
|
||||
static uint64_t DeviceToHostXfer;
|
||||
static uint64_t DeviceEvictions;
|
||||
static uint64_t DeviceDestroy;
|
||||
|
||||
static uint64_t DeviceCacheBytes();
|
||||
static uint64_t HostCacheBytes();
|
||||
|
||||
static MemoryStatus GetFootprint(void) {
|
||||
MemoryStatus stat;
|
||||
stat.DeviceBytes = DeviceBytes;
|
||||
stat.DeviceLRUBytes = DeviceLRUBytes;
|
||||
stat.DeviceMaxBytes = DeviceMaxBytes;
|
||||
stat.HostToDeviceBytes = HostToDeviceBytes;
|
||||
stat.DeviceToHostBytes = DeviceToHostBytes;
|
||||
stat.HostToDeviceXfer = HostToDeviceXfer;
|
||||
stat.DeviceToHostXfer = DeviceToHostXfer;
|
||||
stat.DeviceEvictions = DeviceEvictions;
|
||||
stat.DeviceDestroy = DeviceDestroy;
|
||||
stat.DeviceAllocCacheBytes = DeviceCacheBytes();
|
||||
stat.HostAllocCacheBytes = HostCacheBytes();
|
||||
return stat;
|
||||
};
|
||||
|
||||
private:
|
||||
#ifndef GRID_UVM
|
||||
@ -176,6 +213,7 @@ private:
|
||||
|
||||
public:
|
||||
static void Print(void);
|
||||
static void PrintAll(void);
|
||||
static void PrintState( void* CpuPtr);
|
||||
static int isOpen (void* CpuPtr);
|
||||
static void ViewClose(void* CpuPtr,ViewMode mode);
|
||||
|
@ -28,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes;
|
||||
uint64_t MemoryManager::DeviceToHostBytes;
|
||||
uint64_t MemoryManager::HostToDeviceXfer;
|
||||
uint64_t MemoryManager::DeviceToHostXfer;
|
||||
uint64_t MemoryManager::DeviceEvictions;
|
||||
uint64_t MemoryManager::DeviceDestroy;
|
||||
|
||||
////////////////////////////////////
|
||||
// Priority ordering for unlocked entries
|
||||
@ -115,8 +117,10 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
if(AccCache.AccPtr) {
|
||||
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
||||
DeviceDestroy++;
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
LRUremove(AccCache);
|
||||
AccCache.AccPtr=(uint64_t) NULL;
|
||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
}
|
||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
@ -126,8 +130,14 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
{
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Make CPU consistent, remove from Accelerator, remove entry
|
||||
// Cannot be locked. If allocated must be in LRU pool.
|
||||
// Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE CPU only entry
|
||||
// Cannot be acclocked. If allocated must be in LRU pool.
|
||||
//
|
||||
// Nov 2022... Felix issue: Allocating two CpuPtrs, can have an entry in LRU-q with CPUlock.
|
||||
// and require to evict the AccPtr copy. Eviction was a mistake in CpuViewOpen
|
||||
// but there is a weakness where CpuLock entries are attempted for erase
|
||||
// Take these OUT LRU queue when CPU locked?
|
||||
// Cannot take out the table as cpuLock data is important.
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
@ -139,15 +149,17 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
if(AccCache.state==AccDirty) {
|
||||
Flush(AccCache);
|
||||
}
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
if(AccCache.AccPtr) {
|
||||
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
LRUremove(AccCache);
|
||||
AccCache.AccPtr=(uint64_t)NULL;
|
||||
AccCache.state=CpuDirty; // CPU primary now
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
}
|
||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
EntryErase(CpuPtr);
|
||||
// uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
DeviceEvictions++;
|
||||
// EntryErase(CpuPtr);
|
||||
}
|
||||
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
||||
{
|
||||
@ -221,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
||||
}
|
||||
void MemoryManager::EvictVictims(uint64_t bytes)
|
||||
{
|
||||
assert(bytes<DeviceMaxBytes);
|
||||
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
||||
if ( DeviceLRUBytes > 0){
|
||||
assert(LRU.size()>0);
|
||||
uint64_t victim = LRU.back();
|
||||
uint64_t victim = LRU.back(); // From the LRU
|
||||
auto AccCacheIterator = EntryLookup(victim);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
Evict(AccCache);
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -322,7 +337,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
assert(0);
|
||||
}
|
||||
|
||||
// If view is opened on device remove from LRU
|
||||
assert(AccCache.accLock>0);
|
||||
// If view is opened on device must remove from LRU
|
||||
if(AccCache.LRU_valid==1){
|
||||
// must possibly remove from LRU as now locked on GPU
|
||||
dprintf("AccCache entry removed from LRU \n");
|
||||
@ -388,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
|
||||
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
|
||||
if (!AccCache.AccPtr) {
|
||||
EvictVictims(bytes);
|
||||
}
|
||||
// CPU doesn't need to free space
|
||||
// if (!AccCache.AccPtr) {
|
||||
// EvictVictims(bytes);
|
||||
// }
|
||||
|
||||
assert((mode==CpuRead)||(mode==CpuWrite));
|
||||
assert(AccCache.accLock==0); // Programming error
|
||||
@ -444,20 +461,28 @@ void MemoryManager::NotifyDeletion(void *_ptr)
|
||||
void MemoryManager::Print(void)
|
||||
{
|
||||
PrintBytes();
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogDebug << "Memory Manager " << std::endl;
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
|
||||
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
|
||||
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
|
||||
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
|
||||
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
|
||||
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
|
||||
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
|
||||
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogMessage << "Memory Manager " << std::endl;
|
||||
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogMessage << DeviceBytes << " bytes allocated on device " << std::endl;
|
||||
std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
|
||||
std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device " << std::endl;
|
||||
std::cout << GridLogMessage << HostToDeviceXfer << " transfers to device " << std::endl;
|
||||
std::cout << GridLogMessage << DeviceToHostXfer << " transfers from device " << std::endl;
|
||||
std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
|
||||
std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
|
||||
std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
|
||||
std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
|
||||
std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
|
||||
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
|
||||
}
|
||||
void MemoryManager::PrintAll(void)
|
||||
{
|
||||
Print();
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
|
||||
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
|
||||
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
|
||||
auto &AccCache = it->second;
|
||||
|
||||
@ -467,13 +492,13 @@ void MemoryManager::Print(void)
|
||||
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
|
||||
if ( AccCache.state==Consistent)str = std::string("Consistent");
|
||||
|
||||
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
|
||||
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
|
||||
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
|
||||
<< "\t" << AccCache.cpuLock
|
||||
<< "\t" << AccCache.accLock
|
||||
<< "\t" << AccCache.LRU_valid<<std::endl;
|
||||
}
|
||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
|
||||
|
||||
};
|
||||
int MemoryManager::isOpen (void* _CpuPtr)
|
||||
@ -489,6 +514,24 @@ int MemoryManager::isOpen (void* _CpuPtr)
|
||||
}
|
||||
void MemoryManager::Audit(std::string s)
|
||||
{
|
||||
uint64_t CpuBytes=0;
|
||||
uint64_t AccBytes=0;
|
||||
uint64_t LruBytes1=0;
|
||||
uint64_t LruBytes2=0;
|
||||
uint64_t LruCnt=0;
|
||||
|
||||
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
|
||||
for(auto it=LRU.begin();it!=LRU.end();it++){
|
||||
uint64_t cpuPtr = *it;
|
||||
assert(EntryPresent(cpuPtr));
|
||||
auto AccCacheIterator = EntryLookup(cpuPtr);
|
||||
auto & AccCache = AccCacheIterator->second;
|
||||
LruBytes2+=AccCache.bytes;
|
||||
assert(AccCache.LRU_valid==1);
|
||||
assert(AccCache.LRU_entry==it);
|
||||
}
|
||||
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
|
||||
|
||||
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
|
||||
auto &AccCache = it->second;
|
||||
|
||||
@ -498,7 +541,14 @@ void MemoryManager::Audit(std::string s)
|
||||
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
|
||||
if ( AccCache.state==Consistent)str = std::string("Consistent");
|
||||
|
||||
CpuBytes+=AccCache.bytes;
|
||||
if( AccCache.AccPtr ) AccBytes+=AccCache.bytes;
|
||||
if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
|
||||
if( AccCache.LRU_valid ) LruCnt++;
|
||||
|
||||
if ( AccCache.cpuLock || AccCache.accLock ) {
|
||||
assert(AccCache.LRU_valid==0);
|
||||
|
||||
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
|
||||
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
|
||||
<< "\t cpuLock " << AccCache.cpuLock
|
||||
@ -509,6 +559,15 @@ void MemoryManager::Audit(std::string s)
|
||||
assert( AccCache.cpuLock== 0 ) ;
|
||||
assert( AccCache.accLock== 0 ) ;
|
||||
}
|
||||
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
|
||||
assert(LruBytes1==LruBytes2);
|
||||
assert(LruBytes1==DeviceLRUBytes);
|
||||
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
|
||||
assert(AccBytes==DeviceBytes);
|
||||
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
|
||||
assert(LruCnt == LRU.size());
|
||||
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
void MemoryManager::PrintState(void* _CpuPtr)
|
||||
@ -526,8 +585,8 @@ void MemoryManager::PrintState(void* _CpuPtr)
|
||||
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
|
||||
|
||||
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
|
||||
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
|
||||
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
|
||||
std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
|
||||
<< "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
|
||||
<< "\t" << AccCache.cpuLock
|
||||
<< "\t" << AccCache.accLock
|
||||
<< "\t" << AccCache.LRU_valid<<std::endl;
|
||||
|
@ -12,6 +12,8 @@ uint64_t MemoryManager::HostToDeviceBytes;
|
||||
uint64_t MemoryManager::DeviceToHostBytes;
|
||||
uint64_t MemoryManager::HostToDeviceXfer;
|
||||
uint64_t MemoryManager::DeviceToHostXfer;
|
||||
uint64_t MemoryManager::DeviceEvictions;
|
||||
uint64_t MemoryManager::DeviceDestroy;
|
||||
|
||||
void MemoryManager::Audit(std::string s){};
|
||||
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
|
||||
@ -22,6 +24,7 @@ void MemoryManager::PrintState(void* CpuPtr)
|
||||
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
|
||||
};
|
||||
void MemoryManager::Print(void){};
|
||||
void MemoryManager::PrintAll(void){};
|
||||
void MemoryManager::NotifyDeletion(void *ptr){};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
acceleratorCopySynchronise();
|
||||
StencilBarrier();// Synch shared memory on a single nodes
|
||||
|
||||
int nreq=list.size();
|
||||
|
||||
if (nreq==0) return;
|
||||
|
@ -128,7 +128,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
int recv_from_rank,int dor,
|
||||
int xbytes,int rbytes, int dir)
|
||||
{
|
||||
return 2.0*bytes;
|
||||
return xbytes+rbytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
|
||||
{
|
||||
|
@ -91,6 +91,59 @@ void *SharedMemory::ShmBufferSelf(void)
|
||||
//std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
|
||||
return ShmCommBufs[ShmRank];
|
||||
}
|
||||
// Returns non-zero when `a` divides `b` exactly (b is an integer multiple of a).
// A zero divisor is reported as "does not divide" instead of performing an
// integer division by zero; this matters because `a` can originate from a
// user-supplied environment variable (see GetShmDims).
static inline int divides(int a,int b)
{
  if ( a == 0 ) return 0;
  return ( (b % a) == 0 );
}
|
||||
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
|
||||
{
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Allow user to configure through environment variable
|
||||
////////////////////////////////////////////////////////////////
|
||||
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
|
||||
if ( str ) {
|
||||
std::vector<int> IntShmDims;
|
||||
GridCmdOptionIntVector(std::string(str),IntShmDims);
|
||||
assert(IntShmDims.size() == WorldDims.size());
|
||||
long ShmSize = 1;
|
||||
for (int dim=0;dim<WorldDims.size();dim++) {
|
||||
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
|
||||
assert(divides(ShmDims[dim],WorldDims[dim]));
|
||||
}
|
||||
assert(ShmSize == WorldShmSize);
|
||||
return;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Powers of 2,3,5 only in prime decomposition for now
|
||||
////////////////////////////////////////////////////////////////
|
||||
int ndimension = WorldDims.size();
|
||||
ShmDims=Coordinate(ndimension,1);
|
||||
|
||||
std::vector<int> primes({2,3,5});
|
||||
|
||||
int dim = 0;
|
||||
int last_dim = ndimension - 1;
|
||||
int AutoShmSize = 1;
|
||||
while(AutoShmSize != WorldShmSize) {
|
||||
int p;
|
||||
for(p=0;p<primes.size();p++) {
|
||||
int prime=primes[p];
|
||||
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
|
||||
&& divides(prime,WorldShmSize/AutoShmSize) ) {
|
||||
AutoShmSize*=prime;
|
||||
ShmDims[dim]*=prime;
|
||||
last_dim = dim;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (p == primes.size() && last_dim == dim) {
|
||||
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
dim=(dim+1) %ndimension;
|
||||
}
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@ -27,9 +27,10 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#define header "SharedMemoryMpi: "
|
||||
|
||||
#include <Grid/GridCore.h>
|
||||
#include <pwd.h>
|
||||
#include <syscall.h>
|
||||
|
||||
#ifdef GRID_CUDA
|
||||
#include <cuda_runtime_api.h>
|
||||
@ -37,12 +38,120 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
#ifdef GRID_HIP
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#endif
|
||||
#ifdef GRID_SYCl
|
||||
|
||||
#ifdef GRID_SYCL
|
||||
#define GRID_SYCL_LEVEL_ZERO_IPC
|
||||
#include <syscall.h>
|
||||
#define SHM_SOCKETS
|
||||
#endif
|
||||
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
#define header "SharedMemoryMpi: "
|
||||
|
||||
#ifdef SHM_SOCKETS
|
||||
|
||||
/*
|
||||
* Barbaric extra intranode communication route in case we need sockets to pass FDs
|
||||
* Forced by level_zero not being nicely designed
|
||||
*/
|
||||
// File-local state for the Unix-domain datagram side channel.
//  sock          : descriptor of this rank's bound datagram socket (set by UnixSockets::Open)
//  sock_path_fmt : printf format giving the per-rank socket path
//  sock_path     : scratch copy of the most recent destination path
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];

// Helper for passing open file descriptors between processes as SCM_RIGHTS
// ancillary data over AF_UNIX datagram sockets.
// Each rank binds one socket at the path sock_path_fmt formatted with its rank.
class UnixSockets {
public:
  // Create this rank's datagram socket and bind it to its per-rank path.
  // Any stale filesystem entry at that path is unlinked first.
  // Terminates the process if the socket cannot be created or bound.
  static void Open(int rank)
  {
    sock = socket(AF_UNIX, SOCK_DGRAM, 0);
    if (sock < 0) {            // 0 is a valid descriptor; only negative values are errors
      perror("socket failure");
      exit(EXIT_FAILURE);
    }

    struct sockaddr_un sa_un = {};
    sa_un.sun_family = AF_UNIX;
    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
    unlink(sa_un.sun_path);
    if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
      perror("bind failure");
      exit(EXIT_FAILURE);
    }
  }

  // Receive one message on this rank's socket and return the file descriptor
  // carried in its control data.
  // Returns -1 if recvmsg fails, returns zero bytes, or the message carries
  // no SCM_RIGHTS descriptor.
  static int RecvFileDescriptor(void)
  {
    char buf[1];
    struct iovec iov;
    iov.iov_base = buf;
    iov.iov_len  = sizeof(buf);

    // Control buffer must be suitably aligned for struct cmsghdr
    alignas(struct cmsghdr) char cms[CMSG_SPACE(sizeof(int))];
    memset(cms, 0, sizeof(cms));

    struct msghdr msg;
    memset(&msg, 0, sizeof(msg));
    msg.msg_name       = 0;
    msg.msg_namelen    = 0;
    msg.msg_iov        = &iov;
    msg.msg_iovlen     = 1;
    msg.msg_control    = cms;
    msg.msg_controllen = sizeof(cms);

    ssize_t n = recvmsg(sock, &msg, 0);
    if (n < 0) {
      perror("recvmsg failed");
      return -1;
    }
    if (n == 0) {
      // errno is not set in this case, so perror would print a misleading reason
      fprintf(stderr,"recvmsg returned 0\n");
      return -1;
    }

    // The control data may be absent or truncated; never dereference it blindly
    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
    if (cmsg == NULL
	|| cmsg->cmsg_level != SOL_SOCKET
	|| cmsg->cmsg_type  != SCM_RIGHTS
	|| cmsg->cmsg_len   <  CMSG_LEN(sizeof(int))) {
      fprintf(stderr,"recvmsg: message carried no file descriptor\n");
      return -1;
    }

    int fd;
    memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
    return fd;
  }

  // Send descriptor `fildes` to the socket bound for rank `xmit_to_rank`.
  // A one byte payload accompanies the control data.
  // A sendmsg failure is reported on stderr; the function still returns normally.
  static void SendFileDescriptor(int fildes,int xmit_to_rank)
  {
    char data = ' ';
    struct iovec iov;
    iov.iov_base = &data;
    iov.iov_len  = sizeof(data);

    alignas(struct cmsghdr) char ctrl[CMSG_SPACE(sizeof(int))];
    memset(ctrl, 0, sizeof(ctrl));

    // Keep the file-level scratch path up to date (bounded write)
    snprintf(sock_path,sizeof(sock_path),sock_path_fmt,xmit_to_rank);

    struct sockaddr_un sa_un = {};
    sa_un.sun_family = AF_UNIX;
    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);

    struct msghdr msg;
    memset(&msg, 0, sizeof(msg));
    msg.msg_name       = (void *)&sa_un;
    msg.msg_namelen    = sizeof(sa_un);
    msg.msg_iov        = &iov;
    msg.msg_iovlen     = 1;
    msg.msg_control    = ctrl;
    msg.msg_controllen = sizeof(ctrl);

    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type  = SCM_RIGHTS;
    cmsg->cmsg_len   = CMSG_LEN(sizeof(int));

    // memcpy avoids a type-punned store through the CMSG_DATA pointer
    memcpy(CMSG_DATA(cmsg), &fildes, sizeof(int));

    if (sendmsg(sock, &msg, 0) < 0) {
      perror("sendmsg failed");
    }
  }
};
|
||||
#endif
|
||||
|
||||
|
||||
/*Construct from an MPI communicator*/
|
||||
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||
{
|
||||
@ -169,59 +278,7 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
|
||||
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
|
||||
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
|
||||
}
|
||||
// Returns non-zero when a divides b exactly, i.e. b is an integer multiple of a.
// A zero divisor is reported as "does not divide" instead of dividing by zero.
static inline int divides(int a,int b)
{
  if ( a == 0 ) return 0;
  return ( b == ( (b/a)*a ) );
}
|
||||
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
|
||||
{
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Allow user to configure through environment variable
|
||||
////////////////////////////////////////////////////////////////
|
||||
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
|
||||
if ( str ) {
|
||||
std::vector<int> IntShmDims;
|
||||
GridCmdOptionIntVector(std::string(str),IntShmDims);
|
||||
assert(IntShmDims.size() == WorldDims.size());
|
||||
long ShmSize = 1;
|
||||
for (int dim=0;dim<WorldDims.size();dim++) {
|
||||
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
|
||||
assert(divides(ShmDims[dim],WorldDims[dim]));
|
||||
}
|
||||
assert(ShmSize == WorldShmSize);
|
||||
return;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Powers of 2,3,5 only in prime decomposition for now
|
||||
////////////////////////////////////////////////////////////////
|
||||
int ndimension = WorldDims.size();
|
||||
ShmDims=Coordinate(ndimension,1);
|
||||
|
||||
std::vector<int> primes({2,3,5});
|
||||
|
||||
int dim = 0;
|
||||
int last_dim = ndimension - 1;
|
||||
int AutoShmSize = 1;
|
||||
while(AutoShmSize != WorldShmSize) {
|
||||
int p;
|
||||
for(p=0;p<primes.size();p++) {
|
||||
int prime=primes[p];
|
||||
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
|
||||
&& divides(prime,WorldShmSize/AutoShmSize) ) {
|
||||
AutoShmSize*=prime;
|
||||
ShmDims[dim]*=prime;
|
||||
last_dim = dim;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (p == primes.size() && last_dim == dim) {
|
||||
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
dim=(dim+1) %ndimension;
|
||||
}
|
||||
}
|
||||
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
|
||||
{
|
||||
////////////////////////////////////////////////////////////////
|
||||
@ -531,8 +588,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Loop over ranks/gpu's on our node
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef SHM_SOCKETS
|
||||
UnixSockets::Open(WorldShmRank);
|
||||
#endif
|
||||
for(int r=0;r<WorldShmSize;r++){
|
||||
|
||||
MPI_Barrier(WorldShmComm);
|
||||
|
||||
#ifndef GRID_MPI3_SHM_NONE
|
||||
//////////////////////////////////////////////////
|
||||
// If it is me, pass around the IPC access key
|
||||
@ -540,7 +602,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
void * thisBuf = ShmCommBuf;
|
||||
if(!Stencil_force_mpi) {
|
||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||
typedef struct { int fd; pid_t pid ; } clone_mem_t;
|
||||
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
|
||||
|
||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
|
||||
@ -551,13 +613,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
if ( r==WorldShmRank ) {
|
||||
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
|
||||
if ( err != ZE_RESULT_SUCCESS ) {
|
||||
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
} else {
|
||||
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
}
|
||||
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
|
||||
handle.pid = getpid();
|
||||
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
|
||||
#ifdef SHM_SOCKETS
|
||||
for(int rr=0;rr<WorldShmSize;rr++){
|
||||
if(rr!=r){
|
||||
UnixSockets::SendFileDescriptor(handle.fd,rr);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
@ -585,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
// Share this IPC handle across the Shm Comm
|
||||
//////////////////////////////////////////////////
|
||||
{
|
||||
MPI_Barrier(WorldShmComm);
|
||||
int ierr=MPI_Bcast(&handle,
|
||||
sizeof(handle),
|
||||
MPI_BYTE,
|
||||
@ -600,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||
if ( r!=WorldShmRank ) {
|
||||
thisBuf = nullptr;
|
||||
int myfd;
|
||||
#ifdef SHM_SOCKETS
|
||||
myfd=UnixSockets::RecvFileDescriptor();
|
||||
#else
|
||||
std::cout<<"mapping seeking remote pid/fd "
|
||||
<<handle.pid<<"/"
|
||||
<<handle.fd<<std::endl;
|
||||
@ -607,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
|
||||
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
|
||||
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
|
||||
int myfd = syscall(438,pidfd,handle.fd,0);
|
||||
|
||||
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
|
||||
|
||||
myfd = syscall(438,pidfd,handle.fd,0);
|
||||
int err_t = errno;
|
||||
if (myfd < 0) {
|
||||
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
|
||||
perror("pidfd_getfd failed ");
|
||||
assert(0);
|
||||
}
|
||||
#endif
|
||||
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
|
||||
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
|
||||
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
|
||||
|
||||
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
|
||||
if ( err != ZE_RESULT_SUCCESS ) {
|
||||
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
|
||||
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
|
||||
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
} else {
|
||||
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
|
||||
@ -651,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#else
|
||||
WorldShmCommBufs[r] = ShmCommBuf;
|
||||
#endif
|
||||
MPI_Barrier(WorldShmComm);
|
||||
}
|
||||
|
||||
_ShmAllocBytes=bytes;
|
||||
|
@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
||||
}
|
||||
}
|
||||
|
||||
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
|
||||
|
||||
// Integer division of a by b, rounded up when there is a remainder.
template <typename T>
T iDivUp(T a, T b)
{
  if (a % b == 0) return (a / b);
  return (a / b + 1);
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
|
||||
{
|
||||
int idx = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
if (idx >= e1*e2) return;
|
||||
|
||||
int n, b, o;
|
||||
|
||||
n = idx / e2;
|
||||
b = idx % e2;
|
||||
o = n*stride + b;
|
||||
|
||||
vector[2*idx + 0] = lo + o;
|
||||
vector[2*idx + 1] = ro + o;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// local to node block strided copies
|
||||
//////////////////////////////////////////////////////
|
||||
@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
||||
int ent=0;
|
||||
|
||||
if(cbmask == 0x3 ){
|
||||
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
|
||||
ent = e1*e2;
|
||||
dim3 blockSize(acceleratorThreads());
|
||||
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
|
||||
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
|
||||
accelerator_barrier();
|
||||
#else
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o =n*stride+b;
|
||||
Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
||||
int ent=0;
|
||||
|
||||
if ( cbmask == 0x3 ) {
|
||||
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
|
||||
ent = e1*e2;
|
||||
dim3 blockSize(acceleratorThreads());
|
||||
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
|
||||
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
|
||||
accelerator_barrier();
|
||||
#else
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
int o =n*stride;
|
||||
Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
|
||||
}}
|
||||
#endif
|
||||
} else {
|
||||
for(int n=0;n<e1;n++){
|
||||
for(int b=0;b<e2;b++){
|
||||
|
@ -291,8 +291,8 @@ public:
|
||||
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
|
||||
conformable(*this,r);
|
||||
this->checkerboard = r.Checkerboard();
|
||||
auto me = View(AcceleratorWriteDiscard);
|
||||
auto him= r.View(AcceleratorRead);
|
||||
auto me = View(AcceleratorWriteDiscard);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
coalescedWrite(me[ss],him(ss));
|
||||
});
|
||||
@ -306,8 +306,8 @@ public:
|
||||
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
|
||||
this->checkerboard = r.Checkerboard();
|
||||
conformable(*this,r);
|
||||
auto me = View(AcceleratorWriteDiscard);
|
||||
auto him= r.View(AcceleratorRead);
|
||||
auto me = View(AcceleratorWriteDiscard);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
coalescedWrite(me[ss],him(ss));
|
||||
});
|
||||
|
@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
#include <Grid/lattice/Lattice_reduction_gpu.h>
|
||||
#endif
|
||||
#if defined(GRID_SYCL)
|
||||
#include <Grid/lattice/Lattice_reduction_sycl.h>
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
@ -124,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
||||
{
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||
return sum_gpu(arg,osites);
|
||||
#else
|
||||
return sum_cpu(arg,osites);
|
||||
@ -133,7 +136,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
|
||||
{
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||
return sumD_gpu(arg,osites);
|
||||
#else
|
||||
return sumD_cpu(arg,osites);
|
||||
@ -142,7 +145,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
|
||||
{
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||
return sumD_gpu_large(arg,osites);
|
||||
#else
|
||||
return sumD_cpu(arg,osites);
|
||||
@ -150,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
|
||||
{
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
autoView( arg_v, arg, AcceleratorRead);
|
||||
Integer osites = arg.Grid()->oSites();
|
||||
auto ssum= sum_gpu(&arg_v[0],osites);
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||
autoView( arg_v, arg, AcceleratorRead);
|
||||
return sum_gpu(&arg_v[0],osites);
|
||||
#else
|
||||
autoView(arg_v, arg, CpuRead);
|
||||
Integer osites = arg.Grid()->oSites();
|
||||
auto ssum= sum_cpu(&arg_v[0],osites);
|
||||
return sum_cpu(&arg_v[0],osites);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||
{
|
||||
auto ssum = rankSum(arg);
|
||||
arg.Grid()->GlobalSum(ssum);
|
||||
return ssum;
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
|
||||
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
|
||||
{
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||
autoView( arg_v, arg, AcceleratorRead);
|
||||
Integer osites = arg.Grid()->oSites();
|
||||
auto ssum= sum_gpu_large(&arg_v[0],osites);
|
||||
return sum_gpu_large(&arg_v[0],osites);
|
||||
#else
|
||||
autoView(arg_v, arg, CpuRead);
|
||||
Integer osites = arg.Grid()->oSites();
|
||||
auto ssum= sum_cpu(&arg_v[0],osites);
|
||||
return sum_cpu(&arg_v[0],osites);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class vobj>
|
||||
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
|
||||
{
|
||||
auto ssum = rankSumLarge(arg);
|
||||
arg.Grid()->GlobalSum(ssum);
|
||||
return ssum;
|
||||
}
|
||||
@ -232,11 +246,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
{
|
||||
autoView( left_v , left, AcceleratorRead);
|
||||
autoView( right_v,right, AcceleratorRead);
|
||||
|
||||
// This code could read coalesce
|
||||
// GPU - SIMT lane compliance...
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto x_l = left_v(ss);
|
||||
|
@ -211,13 +211,25 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
|
||||
assert(ok);
|
||||
|
||||
Integer smemSize = numThreads * sizeof(sobj);
|
||||
|
||||
// Move out of UVM
|
||||
// Turns out I had messed up the synchronise after move to compute stream
|
||||
// as running this on the default stream fools the synchronise
|
||||
#undef UVM_BLOCK_BUFFER
|
||||
#ifndef UVM_BLOCK_BUFFER
|
||||
commVector<sobj> buffer(numBlocks);
|
||||
sobj *buffer_v = &buffer[0];
|
||||
sobj result;
|
||||
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
||||
accelerator_barrier();
|
||||
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
|
||||
#else
|
||||
Vector<sobj> buffer(numBlocks);
|
||||
sobj *buffer_v = &buffer[0];
|
||||
|
||||
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
|
||||
sobj result;
|
||||
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
||||
accelerator_barrier();
|
||||
auto result = buffer_v[0];
|
||||
result = *buffer_v;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
|
125
Grid/lattice/Lattice_reduction_sycl.h
Normal file
125
Grid/lattice/Lattice_reduction_sycl.h
Normal file
@ -0,0 +1,125 @@
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Possibly promote to double and sum
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::scalar_objectD sobjD;
|
||||
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
|
||||
sobj identity; zeroit(identity);
|
||||
sobj ret ;
|
||||
|
||||
Integer nsimd= vobj::Nsimd();
|
||||
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
|
||||
cgh.parallel_for(cl::sycl::range<1>{osites},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> item, auto &sum) {
|
||||
auto osite = item[0];
|
||||
sum +=Reduce(lat[osite]);
|
||||
});
|
||||
});
|
||||
theGridAccelerator->wait();
|
||||
ret = mysum[0];
|
||||
free(mysum,*theGridAccelerator);
|
||||
sobjD dret; convertType(dret,ret);
|
||||
return dret;
|
||||
}
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
|
||||
{
|
||||
return sumD_gpu_tensor(lat,osites);
|
||||
}
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
|
||||
{
|
||||
return sumD_gpu_large(lat,osites);
|
||||
}
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||
{
|
||||
return sumD_gpu_large(lat,osites);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Return as same precision as input performing reduction in double precision though
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
sobj result;
|
||||
result = sumD_gpu(lat,osites);
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
sobj result;
|
||||
result = sumD_gpu_large(lat,osites);
|
||||
return result;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
/*
|
||||
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
|
||||
{
|
||||
Double sumResult; zeroit(sumResult);
|
||||
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
|
||||
Double identity; zeroit(identity);
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
|
||||
cgh.parallel_for(cl::sycl::range<1>{L},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> index, auto &sum) {
|
||||
sum +=vec[index];
|
||||
});
|
||||
});
|
||||
theGridAccelerator->wait();
|
||||
Double ret = d_sum[0];
|
||||
free(d_sum,*theGridAccelerator);
|
||||
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::vector_type vector;
|
||||
typedef typename vobj::scalar_type scalar;
|
||||
|
||||
typedef typename vobj::scalar_typeD scalarD;
|
||||
typedef typename vobj::scalar_objectD sobjD;
|
||||
|
||||
sobjD ret;
|
||||
scalarD *ret_p = (scalarD *)&ret;
|
||||
|
||||
const int nsimd = vobj::Nsimd();
|
||||
const int words = sizeof(vobj)/sizeof(vector);
|
||||
|
||||
Vector<scalar> buffer(osites*nsimd);
|
||||
scalar *buf = &buffer[0];
|
||||
vector *dat = (vector *)lat;
|
||||
|
||||
for(int w=0;w<words;w++) {
|
||||
|
||||
accelerator_for(ss,osites,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
|
||||
});
|
||||
//Precision change at this point is too late to gain precision
|
||||
ret_p[w] = svm_reduce(buf,nsimd*osites);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
*/
|
@ -440,6 +440,7 @@ public:
|
||||
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
|
||||
|
||||
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
|
||||
|
||||
assert(rank == _grid->ThisRank() );
|
||||
|
||||
int l_idx=generator_idx(o_idx,i_idx);
|
||||
|
@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
|
||||
}
|
||||
}
|
||||
template<class vobj,class CComplex,int nbasis,class VLattice>
|
||||
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
|
||||
const std::vector<Lattice<vobj>> &fineData,
|
||||
const VLattice &Basis)
|
||||
{
|
||||
int NBatch = fineData.size();
|
||||
assert(coarseData.size() == NBatch);
|
||||
|
||||
GridBase * fine = fineData[0].Grid();
|
||||
GridBase * coarse= coarseData[0].Grid();
|
||||
|
||||
Lattice<iScalar<CComplex>> ip(coarse);
|
||||
std::vector<Lattice<vobj>> fineDataCopy = fineData;
|
||||
|
||||
autoView(ip_, ip, AcceleratorWrite);
|
||||
for(int v=0;v<nbasis;v++) {
|
||||
for (int k=0; k<NBatch; k++) {
|
||||
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
|
||||
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
|
||||
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
||||
convertType(coarseData_[sc](v),ip_[sc]);
|
||||
});
|
||||
|
||||
// improve numerical stability of projection
|
||||
// |fine> = |fine> - <basis|fine> |basis>
|
||||
ip=-ip;
|
||||
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj,class vobj2,class CComplex>
|
||||
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
||||
@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class vobj,class CComplex,int nbasis,class VLattice>
|
||||
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
|
||||
std::vector<Lattice<vobj>> &fineData,
|
||||
const VLattice &Basis)
|
||||
{
|
||||
int NBatch = coarseData.size();
|
||||
assert(fineData.size() == NBatch);
|
||||
|
||||
GridBase * fine = fineData[0].Grid();
|
||||
GridBase * coarse = coarseData[0].Grid();
|
||||
for (int k=0; k<NBatch; k++)
|
||||
fineData[k]=Zero();
|
||||
for (int i=0;i<nbasis;i++) {
|
||||
for (int k=0; k<NBatch; k++) {
|
||||
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
|
||||
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
|
||||
// Simd layouts need not match since we use peek/poke Local
|
||||
template<class vobj,class vvobj>
|
||||
@ -1080,6 +1129,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
||||
});
|
||||
}
|
||||
|
||||
//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
|
||||
template<class VobjOut, class VobjIn>
|
||||
void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
||||
{
|
||||
@ -1097,9 +1147,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
||||
precisionChange(vout,vin,N);
|
||||
});
|
||||
}
|
||||
//Convert a Lattice from one precision to another
|
||||
//Convert a Lattice from one precision to another (original, slow implementation)
|
||||
template<class VobjOut, class VobjIn>
|
||||
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
||||
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
||||
{
|
||||
assert(out.Grid()->Nd() == in.Grid()->Nd());
|
||||
for(int d=0;d<out.Grid()->Nd();d++){
|
||||
@ -1145,6 +1195,128 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
||||
});
|
||||
}
|
||||
|
||||
//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
|
||||
class precisionChangeWorkspace{
|
||||
std::pair<Integer,Integer>* fmap_device; //device pointer
|
||||
//maintain grids for checking
|
||||
GridBase* _out_grid;
|
||||
GridBase* _in_grid;
|
||||
public:
|
||||
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
|
||||
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
|
||||
assert(out_grid->Nd() == in_grid->Nd());
|
||||
for(int d=0;d<out_grid->Nd();d++){
|
||||
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
|
||||
}
|
||||
int Nsimd_out = out_grid->Nsimd();
|
||||
|
||||
std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
|
||||
for(int lane=0; lane < out_grid->Nsimd(); lane++)
|
||||
out_grid->iCoorFromIindex(out_icorrs[lane], lane);
|
||||
|
||||
std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
|
||||
thread_for(out_oidx,out_grid->oSites(),{
|
||||
Coordinate out_ocorr;
|
||||
out_grid->oCoorFromOindex(out_ocorr, out_oidx);
|
||||
|
||||
Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
|
||||
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
|
||||
out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
|
||||
|
||||
//int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
|
||||
//Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
|
||||
//Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
|
||||
int in_oidx = 0, in_lane = 0;
|
||||
for(int d=0;d<in_grid->_ndimension;d++){
|
||||
in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
|
||||
in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
|
||||
}
|
||||
fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
|
||||
}
|
||||
});
|
||||
|
||||
//Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
|
||||
size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
|
||||
fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
|
||||
acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
|
||||
}
|
||||
|
||||
//Prevent moving or copying
|
||||
precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
|
||||
precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
|
||||
precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
|
||||
precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
|
||||
|
||||
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
|
||||
|
||||
void checkGrids(GridBase* out, GridBase* in) const{
|
||||
conformable(out, _out_grid);
|
||||
conformable(in, _in_grid);
|
||||
}
|
||||
|
||||
~precisionChangeWorkspace(){
|
||||
acceleratorFreeDevice(fmap_device);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
|
||||
//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
|
||||
template<class VobjOut, class VobjIn>
|
||||
auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
|
||||
if(out.Grid() == in.Grid()){
|
||||
precisionChangeFast(out,in);
|
||||
return 1;
|
||||
}else{
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
template<class VobjOut, class VobjIn>
|
||||
int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
|
||||
//which contains the mapping data.
|
||||
template<class VobjOut, class VobjIn>
|
||||
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
|
||||
if(_precisionChangeFastWrap(out,in,0)) return;
|
||||
|
||||
static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
constexpr int Nsimd_out = VobjOut::Nsimd();
|
||||
|
||||
workspace.checkGrids(out.Grid(),in.Grid());
|
||||
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
|
||||
|
||||
//Do the copy/precision change
|
||||
autoView( out_v , out, AcceleratorWrite);
|
||||
autoView( in_v , in, AcceleratorRead);
|
||||
|
||||
accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
|
||||
std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
|
||||
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
|
||||
int in_oidx = fmap_osite[out_lane].first;
|
||||
int in_lane = fmap_osite[out_lane].second;
|
||||
copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
|
||||
//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
|
||||
template<class VobjOut, class VobjIn>
|
||||
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
|
||||
if(_precisionChangeFastWrap(out,in,0)) return;
|
||||
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
|
||||
precisionChange(out, in, workspace);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Communicate between grids
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#ifndef GRID_PERFCOUNT_H
|
||||
#define GRID_PERFCOUNT_H
|
||||
|
||||
|
||||
#ifndef __SSC_START
|
||||
#define __SSC_START
|
||||
#define __SSC_STOP
|
||||
#endif
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <ctime>
|
||||
#include <chrono>
|
||||
|
@ -16,7 +16,7 @@
|
||||
|
||||
#ifdef __NVCC__
|
||||
#pragma push
|
||||
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
|
||||
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
|
||||
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
|
||||
#else
|
||||
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
|
||||
|
@ -507,9 +507,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
|
||||
// Fermion <-> propagator assignments
|
||||
//////////////////////////////////////////////
|
||||
//template <class Prop, class Ferm>
|
||||
#define FAST_FERM_TO_PROP
|
||||
template <class Fimpl>
|
||||
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
|
||||
{
|
||||
#ifdef FAST_FERM_TO_PROP
|
||||
autoView(p_v,p,CpuWrite);
|
||||
autoView(f_v,f,CpuRead);
|
||||
thread_for(idx,p_v.oSites(),{
|
||||
for(int ss = 0; ss < Ns; ++ss) {
|
||||
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
|
||||
p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
|
||||
}}
|
||||
});
|
||||
#else
|
||||
for(int j = 0; j < Ns; ++j)
|
||||
{
|
||||
auto pjs = peekSpin(p, j, s);
|
||||
@ -521,12 +532,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
|
||||
}
|
||||
pokeSpin(p, pjs, j, s);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//template <class Prop, class Ferm>
|
||||
template <class Fimpl>
|
||||
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
|
||||
{
|
||||
#ifdef FAST_FERM_TO_PROP
|
||||
autoView(p_v,p,CpuRead);
|
||||
autoView(f_v,f,CpuWrite);
|
||||
thread_for(idx,p_v.oSites(),{
|
||||
for(int ss = 0; ss < Ns; ++ss) {
|
||||
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
|
||||
f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
|
||||
}}
|
||||
});
|
||||
#else
|
||||
for(int j = 0; j < Ns; ++j)
|
||||
{
|
||||
auto pjs = peekSpin(p, j, s);
|
||||
@ -538,6 +560,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
|
||||
}
|
||||
pokeSpin(f, fj, j);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////
|
||||
|
@ -205,15 +205,18 @@ public:
|
||||
typedef WilsonCloverHelpers<Impl> Helpers;
|
||||
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
||||
|
||||
static void MassTerm(CloverField& Clover, RealD diag_mass) {
|
||||
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
|
||||
Clover += diag_mass;
|
||||
}
|
||||
|
||||
static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
|
||||
CloverTriangleField& Triangle,
|
||||
RealD csw_t, RealD diag_mass) {
|
||||
static void InvertClover(CloverField& InvClover,
|
||||
const CloverDiagonalField& diagonal,
|
||||
const CloverTriangleField& triangle,
|
||||
CloverDiagonalField& diagonalInv,
|
||||
CloverTriangleField& triangleInv,
|
||||
bool fixedBoundaries) {
|
||||
|
||||
// Do nothing
|
||||
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
|
||||
}
|
||||
|
||||
// TODO: implement Cmunu for better performances with compact layout, but don't do it
|
||||
@ -238,9 +241,17 @@ public:
|
||||
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
|
||||
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
||||
|
||||
static void MassTerm(CloverField& Clover, RealD diag_mass) {
|
||||
// do nothing!
|
||||
// mass term is multiplied to exp(Clover) below
|
||||
// Can this be avoided?
|
||||
static void IdentityTimesC(const CloverField& in, RealD c) {
|
||||
int DimRep = Impl::Dimension;
|
||||
|
||||
autoView(in_v, in, AcceleratorWrite);
|
||||
|
||||
accelerator_for(ss, in.Grid()->oSites(), 1, {
|
||||
for (int sa=0; sa<Ns; sa++)
|
||||
for (int ca=0; ca<DimRep; ca++)
|
||||
in_v[ss]()(sa,sa)(ca,ca) = c;
|
||||
});
|
||||
}
|
||||
|
||||
static int getNMAX(RealD prec, RealD R) {
|
||||
@ -255,175 +266,62 @@ public:
|
||||
return NMAX;
|
||||
}
|
||||
|
||||
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
|
||||
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
|
||||
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
|
||||
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
|
||||
|
||||
static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
|
||||
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
|
||||
|
||||
typedef iMatrix<ComplexD,6> mat;
|
||||
GridBase* grid = Clover.Grid();
|
||||
CloverField ExpClover(grid);
|
||||
|
||||
RealD qn[6];
|
||||
RealD qnold[6];
|
||||
RealD p[5];
|
||||
RealD trA2, trA3, trA4;
|
||||
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
|
||||
|
||||
mat A2, A3, A4, A5;
|
||||
A2 = alpha * alpha * arg * arg;
|
||||
A3 = alpha * arg * A2;
|
||||
A4 = A2 * A2;
|
||||
A5 = A2 * A3;
|
||||
Clover *= (1.0/diag_mass);
|
||||
|
||||
trA2 = toReal( trace(A2) );
|
||||
trA3 = toReal( trace(A3) );
|
||||
trA4 = toReal( trace(A4));
|
||||
|
||||
p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
|
||||
p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
|
||||
p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
|
||||
p[3] = trA3 / 3.0;
|
||||
p[4] = 0.5 * trA2;
|
||||
|
||||
qnold[0] = cN[Niter];
|
||||
qnold[1] = 0.0;
|
||||
qnold[2] = 0.0;
|
||||
qnold[3] = 0.0;
|
||||
qnold[4] = 0.0;
|
||||
qnold[5] = 0.0;
|
||||
|
||||
for(int i = Niter-1; i >= 0; i--)
|
||||
{
|
||||
qn[0] = p[0] * qnold[5] + cN[i];
|
||||
qn[1] = p[1] * qnold[5] + qnold[0];
|
||||
qn[2] = p[2] * qnold[5] + qnold[1];
|
||||
qn[3] = p[3] * qnold[5] + qnold[2];
|
||||
qn[4] = p[4] * qnold[5] + qnold[3];
|
||||
qn[5] = qnold[4];
|
||||
|
||||
qnold[0] = qn[0];
|
||||
qnold[1] = qn[1];
|
||||
qnold[2] = qn[2];
|
||||
qnold[3] = qn[3];
|
||||
qnold[4] = qn[4];
|
||||
qnold[5] = qn[5];
|
||||
}
|
||||
|
||||
mat unit(1.0);
|
||||
|
||||
dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
|
||||
|
||||
}
|
||||
|
||||
static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
|
||||
|
||||
GridBase* grid = Diagonal.Grid();
|
||||
int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
|
||||
|
||||
//
|
||||
// Implementation completely in Daniel's layout
|
||||
//
|
||||
|
||||
// Taylor expansion with Cayley-Hamilton recursion
|
||||
// underlying Horner scheme as above
|
||||
// Taylor expansion, slow but generic
|
||||
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
|
||||
// qN = cN
|
||||
// qn = cn + qn+1 X
|
||||
std::vector<RealD> cn(NMAX+1);
|
||||
cn[0] = 1.0;
|
||||
for (int i=1; i<=NMAX; i++){
|
||||
for (int i=1; i<=NMAX; i++)
|
||||
cn[i] = cn[i-1] / RealD(i);
|
||||
}
|
||||
|
||||
// Taken over from Daniel's implementation
|
||||
conformable(Diagonal, Triangle);
|
||||
ExpClover = Zero();
|
||||
IdentityTimesC(ExpClover, cn[NMAX]);
|
||||
for (int i=NMAX-1; i>=0; i--)
|
||||
ExpClover = ExpClover * Clover + cn[i];
|
||||
|
||||
long lsites = grid->lSites();
|
||||
{
|
||||
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
|
||||
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
|
||||
typedef iMatrix<ComplexD,6> mat;
|
||||
// prepare inverse
|
||||
CloverInv = (-1.0)*Clover;
|
||||
|
||||
autoView(diagonal_v, Diagonal, CpuRead);
|
||||
autoView(triangle_v, Triangle, CpuRead);
|
||||
autoView(diagonalExp_v, Diagonal, CpuWrite);
|
||||
autoView(triangleExp_v, Triangle, CpuWrite);
|
||||
Clover = ExpClover * diag_mass;
|
||||
|
||||
thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
|
||||
ExpClover = Zero();
|
||||
IdentityTimesC(ExpClover, cn[NMAX]);
|
||||
for (int i=NMAX-1; i>=0; i--)
|
||||
ExpClover = ExpClover * CloverInv + cn[i];
|
||||
|
||||
mat srcCloverOpUL(0.0); // upper left block
|
||||
mat srcCloverOpLR(0.0); // lower right block
|
||||
mat ExpCloverOp;
|
||||
CloverInv = ExpClover * (1.0/diag_mass);
|
||||
|
||||
scalar_object_diagonal diagonal_tmp = Zero();
|
||||
scalar_object_diagonal diagonal_exp_tmp = Zero();
|
||||
scalar_object_triangle triangle_tmp = Zero();
|
||||
scalar_object_triangle triangle_exp_tmp = Zero();
|
||||
|
||||
Coordinate lcoor;
|
||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||
|
||||
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
|
||||
peekLocalSite(triangle_tmp, triangle_v, lcoor);
|
||||
|
||||
int block;
|
||||
block = 0;
|
||||
for(int i = 0; i < 6; i++){
|
||||
for(int j = 0; j < 6; j++){
|
||||
if (i == j){
|
||||
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
|
||||
}
|
||||
else{
|
||||
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
|
||||
}
|
||||
}
|
||||
}
|
||||
block = 1;
|
||||
for(int i = 0; i < 6; i++){
|
||||
for(int j = 0; j < 6; j++){
|
||||
if (i == j){
|
||||
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
|
||||
}
|
||||
else{
|
||||
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// exp(Clover)
|
||||
|
||||
ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
|
||||
|
||||
block = 0;
|
||||
for(int i = 0; i < 6; i++){
|
||||
for(int j = 0; j < 6; j++){
|
||||
if (i == j){
|
||||
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
|
||||
}
|
||||
else if(i < j){
|
||||
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
|
||||
|
||||
block = 1;
|
||||
for(int i = 0; i < 6; i++){
|
||||
for(int j = 0; j < 6; j++){
|
||||
if (i == j){
|
||||
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
|
||||
}
|
||||
else if(i < j){
|
||||
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
|
||||
pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
|
||||
});
|
||||
}
|
||||
|
||||
Diagonal *= diag_mass;
|
||||
Triangle *= diag_mass;
|
||||
}
|
||||
|
||||
// Fill (diagonalInv, triangleInv) with the inverse clover term in the compact layout.
//  - fixedBoundaries == true : (diagonal, triangle) are inverted explicitly via CompactHelpers::Invert.
//  - fixedBoundaries == false: InvClover is converted to the compact layout via
//    CompactHelpers::ConvertLayout; diagonal and triangle are not used on this path.
//    (presumably InvClover already holds the inverse computed by the caller -- TODO confirm)
static void InvertClover(CloverField& InvClover,
                         const CloverDiagonalField& diagonal,
                         const CloverTriangleField& triangle,
                         CloverDiagonalField&       diagonalInv,
                         CloverTriangleField&       triangleInv,
                         bool fixedBoundaries) {
  if (!fixedBoundaries) {
    CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
    return;
  }
  CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
|
||||
|
||||
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
|
||||
assert(0);
|
||||
|
@ -225,7 +225,7 @@ public:
|
||||
RealD csw_t;
|
||||
RealD cF;
|
||||
|
||||
bool open_boundaries;
|
||||
bool fixedBoundaries;
|
||||
|
||||
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
|
||||
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
|
||||
|
@ -36,7 +36,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Wilson compressor will need FaceGather policies for:
|
||||
// Periodic, Dirichlet, and partial Dirichlet for DWF
|
||||
///////////////////////////////////////////////////////////////
|
||||
const int dwf_compressor_depth=1;
|
||||
const int dwf_compressor_depth=2;
|
||||
#define DWF_COMPRESS
|
||||
class FaceGatherPartialDWF
|
||||
{
|
||||
@ -110,7 +110,7 @@ public:
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
GridBase *Grid = rhs.Grid();
|
||||
@ -209,7 +209,7 @@ public:
|
||||
}
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
|
||||
@ -320,7 +320,7 @@ public:
|
||||
typedef decltype(coalescedRead(in0)) sobj;
|
||||
typedef decltype(coalescedRead(out0)) hsobj;
|
||||
|
||||
unsigned int Nsimd = vobj::Nsimd();
|
||||
constexpr unsigned int Nsimd = vobj::Nsimd();
|
||||
unsigned int mask = Nsimd >> (type + 1);
|
||||
int lane = acceleratorSIMTlane(Nsimd);
|
||||
int j0 = lane &(~mask); // inner coor zero
|
||||
@ -507,6 +507,7 @@ public:
|
||||
}
|
||||
this->face_table_computed=1;
|
||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||
accelerator_barrier();
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -48,7 +48,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
|
||||
, csw_r(_csw_r)
|
||||
, csw_t(_csw_t)
|
||||
, cF(_cF)
|
||||
, open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
|
||||
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
|
||||
, Diagonal(&Fgrid), Triangle(&Fgrid)
|
||||
, DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
|
||||
, DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
|
||||
@ -67,7 +67,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
|
||||
csw_r /= clover_anisotropy.xi_0;
|
||||
|
||||
ImportGauge(_Umu);
|
||||
if (open_boundaries) {
|
||||
if (fixedBoundaries) {
|
||||
this->BoundaryMaskEven.Checkerboard() = Even;
|
||||
this->BoundaryMaskOdd.Checkerboard() = Odd;
|
||||
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
|
||||
@ -77,31 +77,31 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::Dhop(in, out, dag);
|
||||
if(open_boundaries) ApplyBoundaryMask(out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::DhopOE(in, out, dag);
|
||||
if(open_boundaries) ApplyBoundaryMask(out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::DhopEO(in, out, dag);
|
||||
if(open_boundaries) ApplyBoundaryMask(out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||
WilsonBase::DhopDir(in, out, dir, disp);
|
||||
if(this->open_boundaries) ApplyBoundaryMask(out);
|
||||
if(this->fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||
WilsonBase::DhopDirAll(in, out);
|
||||
if(this->open_boundaries) {
|
||||
if(this->fixedBoundaries) {
|
||||
for(auto& o : out) ApplyBoundaryMask(o);
|
||||
}
|
||||
}
|
||||
@ -112,7 +112,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
|
||||
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
|
||||
Mooee(in, Tmp);
|
||||
axpy(out, 1.0, out, Tmp);
|
||||
if(open_boundaries) ApplyBoundaryMask(out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
@ -121,19 +121,19 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i
|
||||
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
|
||||
MooeeDag(in, Tmp);
|
||||
axpy(out, 1.0, out, Tmp);
|
||||
if(open_boundaries) ApplyBoundaryMask(out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
|
||||
WilsonBase::Meooe(in, out);
|
||||
if(open_boundaries) ApplyBoundaryMask(out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
|
||||
WilsonBase::MeooeDag(in, out);
|
||||
if(open_boundaries) ApplyBoundaryMask(out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
@ -147,7 +147,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
|
||||
} else {
|
||||
MooeeInternal(in, out, Diagonal, Triangle);
|
||||
}
|
||||
if(open_boundaries) ApplyBoundaryMask(out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
@ -166,7 +166,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalInv, TriangleInv);
|
||||
}
|
||||
if(open_boundaries) ApplyBoundaryMask(out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
|
||||
assert(!open_boundaries); // TODO check for changes required for open bc
|
||||
assert(!fixedBoundaries); // TODO check for changes required for open bc
|
||||
|
||||
// NOTE: code copied from original clover term
|
||||
conformable(X.Grid(), Y.Grid());
|
||||
@ -305,6 +305,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
|
||||
GridBase* grid = _Umu.Grid();
|
||||
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
||||
CloverField TmpOriginal(grid);
|
||||
CloverField TmpInverse(grid);
|
||||
|
||||
// Compute the field strength terms mu>nu
|
||||
double t2 = usecond();
|
||||
@ -324,24 +325,27 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
|
||||
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
|
||||
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
|
||||
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
|
||||
// Handle mass term based on clover policy
|
||||
CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
|
||||
|
||||
// Instantiate the clover term
|
||||
// - In case of the standard clover the mass term is added
|
||||
// - In case of the exponential clover the clover term is exponentiated
|
||||
double t4 = usecond();
|
||||
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
|
||||
|
||||
// Convert the data layout of the clover term
|
||||
double t4 = usecond();
|
||||
double t5 = usecond();
|
||||
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
|
||||
|
||||
// Exponentiate the clover (nothing happens in case of the standard clover)
|
||||
double t5 = usecond();
|
||||
CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
|
||||
|
||||
// Possible modify the boundary values
|
||||
// Modify the clover term at the temporal boundaries in case of open boundary conditions
|
||||
double t6 = usecond();
|
||||
if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
|
||||
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
|
||||
|
||||
// Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
|
||||
// Invert the Clover term
|
||||
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
|
||||
// in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
|
||||
// TODO: For now this inversion is explicitly done on the CPU
|
||||
double t7 = usecond();
|
||||
CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
|
||||
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
|
||||
|
||||
// Fill the remaining clover fields
|
||||
double t8 = usecond();
|
||||
@ -362,10 +366,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
|
||||
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "convert = " << (t5 - t4) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "exponentiation = " << (t6 - t5) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
|
||||
}
|
||||
|
@ -196,7 +196,6 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
|
||||
|
||||
uint64_t Nsite = Umu.Grid()->oSites();
|
||||
Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
|
||||
|
||||
};
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
|
||||
@ -247,10 +246,14 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
|
||||
|
||||
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
|
||||
|
||||
std::cout << " InsertForce Btilde "<< norm2(Btilde)<<std::endl;
|
||||
|
||||
////////////////////////////
|
||||
// spin trace outer product
|
||||
////////////////////////////
|
||||
Impl::InsertForce5D(mat, Btilde, Atilde, mu);
|
||||
|
||||
std::cout << " InsertForce "<< norm2(mat)<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@ -332,8 +335,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
/////////////////////////////
|
||||
{
|
||||
GRID_TRACE("Gather");
|
||||
st.HaloExchangeOptGather(in,compressor);
|
||||
accelerator_barrier();
|
||||
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
||||
}
|
||||
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
|
@ -428,9 +428,10 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
auto ptr = &st.surface_list[0]; \
|
||||
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
|
||||
int sF = ptr[ss]; \
|
||||
int sU = ss/Ls; \
|
||||
int sU = sF/Ls; \
|
||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||
});
|
||||
}); \
|
||||
accelerator_barrier();
|
||||
|
||||
#define ASM_CALL(A) \
|
||||
thread_for( sss, Nsite, { \
|
||||
@ -463,11 +464,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
|
||||
if( interior && exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
||||
#ifdef SYCL_HACK
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
|
||||
#else
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
||||
#endif
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
||||
#endif
|
||||
@ -478,8 +475,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
||||
// dependent on result of merge
|
||||
acceleratorFenceComputeStream();
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
||||
#endif
|
||||
@ -502,21 +501,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
||||
#endif
|
||||
acceleratorFenceComputeStream();
|
||||
} else if( interior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
// Dependent on result of merge
|
||||
acceleratorFenceComputeStream();
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
|
||||
#endif
|
||||
acceleratorFenceComputeStream();
|
||||
}
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
|
||||
|
@ -1 +0,0 @@
|
||||
../CayleyFermion5DInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../ContinuedFractionFermion5DInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../DomainWallEOFAFermionInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../MobiusEOFAFermionInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../PartialFractionFermion5DInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../WilsonCloverFermionInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../WilsonFermion5DInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../WilsonFermionInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../WilsonKernelsInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../WilsonTMFermionInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
#define IMPLEMENTATION WilsonImplD2
|
@ -1 +0,0 @@
|
||||
../CayleyFermion5DInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../ContinuedFractionFermion5DInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../DomainWallEOFAFermionInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../MobiusEOFAFermionInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../PartialFractionFermion5DInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../WilsonFermion5DInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
../WilsonKernelsInstantiation.cc.master
|
@ -1 +0,0 @@
|
||||
#define IMPLEMENTATION ZWilsonImplD2
|
@ -119,13 +119,19 @@ public:
|
||||
// X^dag Der_oe MeeInv Meo Y
|
||||
// Use Mooee as nontrivial but gauge field indept
|
||||
this->_Mat.MeooeDag (V,tmp1); // odd->even -- implicit -0.5 factor to be applied
|
||||
std::cout << " tmp 1" << norm2(tmp1)<<std::endl;
|
||||
this->_Mat.MooeeInvDag(tmp1,tmp2); // even->even
|
||||
std::cout << " tmp 1" << norm2(tmp2)<<std::endl;
|
||||
this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerYes);
|
||||
std::cout << " ForceO " << norm2(ForceO)<<std::endl;
|
||||
|
||||
// Accumulate X^dag M_oe MeeInv Der_eo Y
|
||||
this->_Mat.Meooe (U,tmp1); // even->odd -- implicit -0.5 factor to be applied
|
||||
std::cout << " tmp 1" << norm2(tmp1)<<std::endl;
|
||||
this->_Mat.MooeeInv(tmp1,tmp2); // even->even
|
||||
std::cout << " tmp 2" << norm2(tmp2)<<std::endl;
|
||||
this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerYes);
|
||||
std::cout << " ForceE " << norm2(ForceE)<<std::endl;
|
||||
|
||||
assert(ForceE.Checkerboard()==Even);
|
||||
assert(ForceO.Checkerboard()==Odd);
|
||||
|
@ -127,6 +127,8 @@ NAMESPACE_BEGIN(Grid);
|
||||
ApproxNegPowerAction.tolerances[i] = action_tolerance[i];
|
||||
ApproxHalfPowerAction.tolerances[i] = action_tolerance[i];
|
||||
ApproxNegHalfPowerAction.tolerances[i]= action_tolerance[i];
|
||||
}
|
||||
for(int i=0;i<ApproxPowerMD.tolerances.size();i++){
|
||||
ApproxPowerMD.tolerances[i] = md_tolerance[i];
|
||||
ApproxNegPowerMD.tolerances[i] = md_tolerance[i];
|
||||
ApproxHalfPowerMD.tolerances[i] = md_tolerance[i];
|
||||
|
@ -29,6 +29,8 @@
|
||||
#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
|
||||
#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
|
||||
|
||||
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -36,90 +38,73 @@ NAMESPACE_BEGIN(Grid);
|
||||
// cf. GeneralEvenOddRational.h for details
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class ImplD, class ImplF, class ImplD2>
|
||||
template<class ImplD, class ImplF>
|
||||
class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
|
||||
private:
|
||||
typedef typename ImplD2::FermionField FermionFieldD2;
|
||||
typedef typename ImplD::FermionField FermionFieldD;
|
||||
typedef typename ImplF::FermionField FermionFieldF;
|
||||
|
||||
FermionOperator<ImplD> & NumOpD;
|
||||
FermionOperator<ImplD> & DenOpD;
|
||||
|
||||
FermionOperator<ImplD2> & NumOpD2;
|
||||
FermionOperator<ImplD2> & DenOpD2;
|
||||
|
||||
FermionOperator<ImplF> & NumOpF;
|
||||
FermionOperator<ImplF> & DenOpF;
|
||||
|
||||
Integer ReliableUpdateFreq;
|
||||
protected:
|
||||
|
||||
//Action evaluation
|
||||
//Allow derived classes to override the multishift CG
|
||||
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
|
||||
#if 0
|
||||
SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOp : DenOp);
|
||||
#if 1
|
||||
SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
|
||||
ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
|
||||
msCG(schurOp,in, out);
|
||||
#else
|
||||
SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
|
||||
SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
|
||||
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
|
||||
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
|
||||
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
|
||||
FermionFieldD inD(NumOpD.FermionRedBlackGrid());
|
||||
FermionFieldD outD(NumOpD.FermionRedBlackGrid());
|
||||
|
||||
ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
|
||||
precisionChange(inD2,in);
|
||||
std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
|
||||
msCG(schurOpD2, inD2, outD2);
|
||||
precisionChange(out,outD2);
|
||||
// Action better with higher precision?
|
||||
ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
|
||||
msCG(schurOpD, in, out);
|
||||
#endif
|
||||
}
|
||||
//Force evaluation
|
||||
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
|
||||
SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
|
||||
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
|
||||
SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
|
||||
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
|
||||
|
||||
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
|
||||
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
|
||||
std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
|
||||
ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
|
||||
precisionChange(inD2,in);
|
||||
std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
|
||||
msCG(schurOpD2, inD2, out_elemsD2, outD2);
|
||||
precisionChange(out,outD2);
|
||||
for(int i=0;i<out_elems.size();i++){
|
||||
precisionChange(out_elems[i],out_elemsD2[i]);
|
||||
}
|
||||
FermionFieldD inD(NumOpD.FermionRedBlackGrid());
|
||||
FermionFieldD outD(NumOpD.FermionRedBlackGrid());
|
||||
std::vector<FermionFieldD> out_elemsD(out_elems.size(),NumOpD.FermionRedBlackGrid());
|
||||
ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
|
||||
msCG(schurOpD, in, out_elems, out);
|
||||
}
|
||||
//Allow derived classes to override the gauge import
|
||||
virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
|
||||
|
||||
typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
|
||||
typename ImplD2::GaugeField Ud2(NumOpD2.GaugeGrid());
|
||||
precisionChange(Uf, Ud);
|
||||
precisionChange(Ud2, Ud);
|
||||
|
||||
std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl;
|
||||
std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " <<std::endl;
|
||||
|
||||
NumOpD.ImportGauge(Ud);
|
||||
DenOpD.ImportGauge(Ud);
|
||||
|
||||
NumOpF.ImportGauge(Uf);
|
||||
DenOpF.ImportGauge(Uf);
|
||||
|
||||
NumOpD2.ImportGauge(Ud2);
|
||||
DenOpD2.ImportGauge(Ud2);
|
||||
}
|
||||
|
||||
public:
|
||||
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD> &_NumOpD, FermionOperator<ImplD> &_DenOpD,
|
||||
FermionOperator<ImplF> &_NumOpF, FermionOperator<ImplF> &_DenOpF,
|
||||
FermionOperator<ImplD2> &_NumOpD2, FermionOperator<ImplD2> &_DenOpD2,
|
||||
const RationalActionParams & p, Integer _ReliableUpdateFreq
|
||||
) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
|
||||
ReliableUpdateFreq(_ReliableUpdateFreq),
|
||||
NumOpD(_NumOpD), DenOpD(_DenOpD),
|
||||
NumOpF(_NumOpF), DenOpF(_DenOpF),
|
||||
NumOpD2(_NumOpD2), DenOpD2(_DenOpD2)
|
||||
NumOpF(_NumOpF), DenOpF(_DenOpF)
|
||||
{}
|
||||
|
||||
virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
|
||||
|
@ -67,9 +67,9 @@ NAMESPACE_BEGIN(Grid);
|
||||
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
|
||||
};
|
||||
|
||||
template<class Impl,class ImplF,class ImplD2>
|
||||
template<class Impl,class ImplF>
|
||||
class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction
|
||||
: public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2> {
|
||||
: public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
|
||||
public:
|
||||
typedef OneFlavourRationalParams Params;
|
||||
private:
|
||||
@ -91,11 +91,9 @@ NAMESPACE_BEGIN(Grid);
|
||||
FermionOperator<Impl> &_DenOp,
|
||||
FermionOperator<ImplF> &_NumOpF,
|
||||
FermionOperator<ImplF> &_DenOpF,
|
||||
FermionOperator<ImplD2> &_NumOpD2,
|
||||
FermionOperator<ImplD2> &_DenOpD2,
|
||||
const Params & p, Integer ReliableUpdateFreq
|
||||
) :
|
||||
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2>(_NumOp, _DenOp,_NumOpF, _DenOpF,_NumOpD2, _DenOpD2, transcribe(p),ReliableUpdateFreq){}
|
||||
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){}
|
||||
|
||||
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
|
||||
};
|
||||
|
@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid);
|
||||
// NumOp == V
|
||||
// DenOp == M
|
||||
//
|
||||
AUDIT();
|
||||
FermionField etaOdd (NumOp.FermionRedBlackGrid());
|
||||
FermionField etaEven(NumOp.FermionRedBlackGrid());
|
||||
FermionField tmp (NumOp.FermionRedBlackGrid());
|
||||
|
||||
AUDIT();
|
||||
pickCheckerboard(Even,etaEven,eta);
|
||||
AUDIT();
|
||||
pickCheckerboard(Odd,etaOdd,eta);
|
||||
|
||||
AUDIT();
|
||||
NumOp.ImportGauge(U);
|
||||
AUDIT();
|
||||
DenOp.ImportGauge(U);
|
||||
std::cout << " TwoFlavourRefresh: Imported gauge "<<std::endl;
|
||||
AUDIT();
|
||||
|
||||
SchurDifferentiableOperator<Impl> Mpc(DenOp);
|
||||
AUDIT();
|
||||
SchurDifferentiableOperator<Impl> Vpc(NumOp);
|
||||
AUDIT();
|
||||
|
||||
std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl;
|
||||
AUDIT();
|
||||
// Odd det factors
|
||||
Mpc.MpcDag(etaOdd,PhiOdd);
|
||||
AUDIT();
|
||||
std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl;
|
||||
tmp=Zero();
|
||||
AUDIT();
|
||||
std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl;
|
||||
AUDIT();
|
||||
HeatbathSolver(Vpc,PhiOdd,tmp);
|
||||
AUDIT();
|
||||
std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl;
|
||||
Vpc.Mpc(tmp,PhiOdd);
|
||||
std::cout << " TwoFlavourRefresh: Mpc "<<std::endl;
|
||||
@ -220,20 +207,27 @@ NAMESPACE_BEGIN(Grid);
|
||||
//X = (Mdag M)^-1 V^dag phi
|
||||
//Y = (Mdag)^-1 V^dag phi
|
||||
Vpc.MpcDag(PhiOdd,Y); // Y= Vdag phi
|
||||
std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
|
||||
X=Zero();
|
||||
DerivativeSolver(Mpc,Y,X); // X= (MdagM)^-1 Vdag phi
|
||||
std::cout << GridLogMessage <<" X "<<norm2(X)<<std::endl;
|
||||
Mpc.Mpc(X,Y); // Y= Mdag^-1 Vdag phi
|
||||
std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
|
||||
|
||||
// phi^dag V (Mdag M)^-1 dV^dag phi
|
||||
Vpc.MpcDagDeriv(force , X, PhiOdd ); dSdU = force;
|
||||
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
|
||||
|
||||
// phi^dag dV (Mdag M)^-1 V^dag phi
|
||||
Vpc.MpcDeriv(force , PhiOdd, X ); dSdU = dSdU+force;
|
||||
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
|
||||
|
||||
// - phi^dag V (Mdag M)^-1 Mdag dM (Mdag M)^-1 V^dag phi
|
||||
// - phi^dag V (Mdag M)^-1 dMdag M (Mdag M)^-1 V^dag phi
|
||||
Mpc.MpcDeriv(force,Y,X); dSdU = dSdU-force;
|
||||
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
|
||||
Mpc.MpcDagDeriv(force,X,Y); dSdU = dSdU-force;
|
||||
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
|
||||
|
||||
// FIXME No force contribution from EvenEven assumed here
|
||||
// Needs a fix for clover.
|
||||
|
@ -134,14 +134,12 @@ protected:
|
||||
double start_force = usecond();
|
||||
|
||||
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
|
||||
AUDIT();
|
||||
|
||||
as[level].actions.at(a)->deriv_timer_start();
|
||||
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
|
||||
as[level].actions.at(a)->deriv_timer_stop();
|
||||
|
||||
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
|
||||
AUDIT();
|
||||
|
||||
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
|
||||
auto name = as[level].actions.at(a)->action_name();
|
||||
@ -284,6 +282,15 @@ public:
|
||||
<< as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl;
|
||||
}
|
||||
}
|
||||
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
|
||||
std::cout << GridLogMessage << " Dslash counts "<<std::endl;
|
||||
std::cout << GridLogMessage << "------------------------- "<<std::endl;
|
||||
uint64_t full, partial, dirichlet;
|
||||
DslashGetCounts(dirichlet,partial,full);
|
||||
std::cout << GridLogMessage << " Full BCs : "<<full<<std::endl;
|
||||
std::cout << GridLogMessage << " Partial dirichlet BCs : "<<partial<<std::endl;
|
||||
std::cout << GridLogMessage << " Dirichlet BCs : "<<dirichlet<<std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
|
||||
std::cout << GridLogMessage << " Force average size "<<std::endl;
|
||||
std::cout << GridLogMessage << "------------------------- "<<std::endl;
|
||||
@ -373,12 +380,12 @@ public:
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||
|
||||
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
|
||||
AUDIT();
|
||||
|
||||
as[level].actions.at(actionID)->refresh_timer_start();
|
||||
as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
|
||||
as[level].actions.at(actionID)->refresh_timer_stop();
|
||||
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
|
||||
AUDIT();
|
||||
|
||||
}
|
||||
|
||||
// Refresh the higher representation actions
|
||||
@ -415,7 +422,7 @@ public:
|
||||
// Actions
|
||||
for (int level = 0; level < as.size(); ++level) {
|
||||
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
|
||||
AUDIT();
|
||||
|
||||
// get gauge field from the SmearingPolicy and
|
||||
// based on the boolean is_smeared in actionID
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||
@ -425,7 +432,7 @@ public:
|
||||
as[level].actions.at(actionID)->S_timer_stop();
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
|
||||
H += Hterm;
|
||||
AUDIT();
|
||||
|
||||
}
|
||||
as[level].apply(S_hireps, Representations, level, H);
|
||||
}
|
||||
@ -438,9 +445,9 @@ public:
|
||||
void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) {
|
||||
|
||||
for (int a = 0; a < repr_set.size(); ++a) {
|
||||
AUDIT();
|
||||
|
||||
RealD Hterm = repr_set.at(a)->Sinitial(Rep.U);
|
||||
AUDIT();
|
||||
|
||||
std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl;
|
||||
H += Hterm;
|
||||
|
||||
@ -465,10 +472,10 @@ public:
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
|
||||
as[level].actions.at(actionID)->S_timer_start();
|
||||
AUDIT();
|
||||
|
||||
Hterm = as[level].actions.at(actionID)->Sinitial(Us);
|
||||
as[level].actions.at(actionID)->S_timer_stop();
|
||||
AUDIT();
|
||||
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
|
||||
H += Hterm;
|
||||
}
|
||||
@ -481,7 +488,6 @@ public:
|
||||
|
||||
void integrate(Field& U)
|
||||
{
|
||||
AUDIT();
|
||||
// reset the clocks
|
||||
t_U = 0;
|
||||
for (int level = 0; level < as.size(); ++level) {
|
||||
@ -499,10 +505,8 @@ public:
|
||||
assert(fabs(t_U - t_P[level]) < 1.0e-6); // must be the same
|
||||
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
|
||||
}
|
||||
AUDIT();
|
||||
|
||||
FieldImplementation::Project(U);
|
||||
AUDIT();
|
||||
|
||||
// and that we indeed got to the end of the trajectory
|
||||
assert(fabs(t_U - Params.trajL) < 1.0e-6);
|
||||
|
@ -320,7 +320,7 @@ struct Conj{
|
||||
|
||||
struct TimesMinusI{
|
||||
//Complex single
|
||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||
inline float32x4_t operator()(float32x4_t in){
|
||||
// ar ai br bi -> ai -ar bi -br
|
||||
float32x4_t r0, r1;
|
||||
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||
@ -328,7 +328,7 @@ struct TimesMinusI{
|
||||
return vtrn1q_f32(r1, r0); // ar -ai br -bi
|
||||
}
|
||||
//Complex double
|
||||
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||
inline float64x2_t operator()(float64x2_t in){
|
||||
// a ib -> b -ia
|
||||
float64x2_t tmp;
|
||||
tmp = vnegq_f64(in);
|
||||
@ -338,7 +338,7 @@ struct TimesMinusI{
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||
inline float32x4_t operator()(float32x4_t in){
|
||||
// ar ai br bi -> -ai ar -bi br
|
||||
float32x4_t r0, r1;
|
||||
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||
@ -346,7 +346,7 @@ struct TimesI{
|
||||
return vtrn1q_f32(r1, in); // -ai ar -bi br
|
||||
}
|
||||
//Complex double
|
||||
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||
inline float64x2_t operator()(float64x2_t in){
|
||||
// a ib -> -b ia
|
||||
float64x2_t tmp;
|
||||
tmp = vnegq_f64(in);
|
||||
|
@ -36,7 +36,7 @@ public:
|
||||
}
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
assert( (table.size()&0x1)==0);
|
||||
|
@ -29,6 +29,27 @@
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// Running event counters, one per boundary-condition category.
// Each is bumped by the matching DslashLog*() function, zeroed by
// DslashResetCounts() and read back by DslashGetCounts().
// They are plain uint64_t (not std::atomic).
// NOTE(review): presumably only updated from a single host thread -- confirm.
uint64_t DslashFullCount;
|
||||
uint64_t DslashPartialCount;
|
||||
uint64_t DslashDirichletCount;
|
||||
|
||||
// Set all three Dslash event counters (full, partial, Dirichlet) back to zero.
void DslashResetCounts(void)
|
||||
{
|
||||
DslashFullCount=0;
|
||||
DslashPartialCount=0;
|
||||
DslashDirichletCount=0;
|
||||
}
|
||||
// Copy the current counter values into the caller's variables.
// Output order is (dirichlet, partial, full) -- note this is the reverse of
// the order in which the counters are declared; the counters are not modified.
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
|
||||
{
|
||||
dirichlet = DslashDirichletCount;
|
||||
partial = DslashPartialCount;
|
||||
full = DslashFullCount;
|
||||
}
|
||||
// Record one event in the corresponding category.
// Each call is a single non-atomic post-increment of the matching global counter.
void DslashLogFull(void) { DslashFullCount++;}
|
||||
void DslashLogPartial(void) { DslashPartialCount++;}
|
||||
void DslashLogDirichlet(void){ DslashDirichletCount++;}
|
||||
|
||||
|
||||
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
||||
int off,std::vector<std::pair<int,int> > & table)
|
||||
{
|
||||
|
@ -91,11 +91,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
|
||||
///////////////////////////////////////////////////////////////////
|
||||
template<class cobj,class vobj,class compressor>
|
||||
void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
|
||||
commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
|
||||
commVector<cobj *> pointers,
|
||||
int dimension,int plane,
|
||||
int cbmask,compressor &compress,int type) __attribute__((noinline));
|
||||
|
||||
template<class cobj,class vobj,class compressor>
|
||||
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
|
||||
const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type)
|
||||
{
|
||||
assert( (table.size()&0x1)==0);
|
||||
@ -103,19 +106,26 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
|
||||
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||
|
||||
auto rhs_v = rhs.View(AcceleratorRead);
|
||||
auto rhs_p = &rhs_v[0];
|
||||
auto p0=&pointers[0][0];
|
||||
auto p1=&pointers[1][0];
|
||||
auto tp=&table[0];
|
||||
accelerator_forNB(j, num, vobj::Nsimd(), {
|
||||
compress.CompressExchange(p0,p1, &rhs_v[0], j,
|
||||
so+tp[2*j ].second,
|
||||
so+tp[2*j+1].second,
|
||||
type);
|
||||
compress.CompressExchange(p0,p1, rhs_p, j,
|
||||
so+tp[2*j ].second,
|
||||
so+tp[2*j+1].second,
|
||||
type);
|
||||
});
|
||||
rhs_v.ViewClose();
|
||||
}
|
||||
*/
|
||||
|
||||
// Dslash event-counter interface.
//   DslashResetCounts : zero all three counters.
//   DslashGetCounts   : read the counters back; outputs are ordered
//                       (dirichlet, partial, full).
//   DslashLog*        : add one to the named counter.
void DslashResetCounts(void);
|
||||
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
|
||||
void DslashLogFull(void);
|
||||
void DslashLogPartial(void);
|
||||
void DslashLogDirichlet(void);
|
||||
|
||||
struct StencilEntry {
|
||||
#ifdef GRID_CUDA
|
||||
uint64_t _byte_offset; // 8 bytes
|
||||
@ -257,8 +267,8 @@ public:
|
||||
struct Merge {
|
||||
static constexpr int Nsimd = vobj::Nsimd();
|
||||
cobj * mpointer;
|
||||
Vector<scalar_object *> rpointers;
|
||||
Vector<cobj *> vpointers;
|
||||
// std::vector<scalar_object *> rpointers;
|
||||
std::vector<cobj *> vpointers;
|
||||
Integer buffer_size;
|
||||
Integer type;
|
||||
Integer partial; // partial dirichlet BCs
|
||||
@ -308,6 +318,7 @@ public:
|
||||
|
||||
int face_table_computed;
|
||||
int partialDirichlet;
|
||||
int fullDirichlet;
|
||||
std::vector<commVector<std::pair<int,int> > > face_table ;
|
||||
Vector<int> surface_list;
|
||||
|
||||
@ -328,8 +339,8 @@ public:
|
||||
// Vectors that live on the symmetric heap in case of SHMEM
|
||||
// These are used; either SHM objects or refs to the above symmetric heap vectors
|
||||
// depending on comms target
|
||||
Vector<cobj *> u_simd_send_buf;
|
||||
Vector<cobj *> u_simd_recv_buf;
|
||||
std::vector<cobj *> u_simd_send_buf;
|
||||
std::vector<cobj *> u_simd_recv_buf;
|
||||
|
||||
int u_comm_offset;
|
||||
int _unified_buffer_size;
|
||||
@ -337,7 +348,7 @@ public:
|
||||
////////////////////////////////////////
|
||||
// Stencil query
|
||||
////////////////////////////////////////
|
||||
#ifdef SHM_FAST_PATH
|
||||
#if 1
|
||||
inline int SameNode(int point) {
|
||||
|
||||
int dimension = this->_directions[point];
|
||||
@ -423,7 +434,6 @@ public:
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
accelerator_barrier();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
@ -437,6 +447,12 @@ public:
|
||||
// Finish the communication started earlier: complete the outstanding
// send/recv requests, log which boundary-condition category this exchange
// belongs to, synchronise accelerator copies, then barrier.
// The 'reqs' argument is not referenced in this body; the member MpiReqs
// is what gets passed to the completion call.
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0);
|
||||
// Exactly one counter is bumped per call; partialDirichlet is tested first,
// so it takes precedence when both flags are set.
if ( this->partialDirichlet ) DslashLogPartial();
|
||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
else DslashLogFull();
|
||||
acceleratorCopySynchronise();
|
||||
// Everyone agrees we are all done
|
||||
_grid->StencilBarrier();
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Blocking send and receive. Either sequential or parallel.
|
||||
@ -514,7 +530,6 @@ public:
|
||||
{
|
||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||
|
||||
// conformable(source.Grid(),_grid);
|
||||
assert(source.Grid()==_grid);
|
||||
|
||||
u_comm_offset=0;
|
||||
@ -625,7 +640,7 @@ public:
|
||||
d.buffer_size = buffer_size;
|
||||
dv.push_back(d);
|
||||
}
|
||||
void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
|
||||
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
|
||||
Merge m;
|
||||
m.partial = this->partialDirichlet;
|
||||
m.dims = _grid->_fdimensions;
|
||||
@ -640,8 +655,8 @@ public:
|
||||
CommsMerge(decompress,Mergers,Decompressions);
|
||||
}
|
||||
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
|
||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
|
||||
assert(MergersSHM.size()==0);
|
||||
assert(DecompressionsSHM.size()==0);
|
||||
}
|
||||
|
||||
template<class decompressor>
|
||||
@ -690,6 +705,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||
}
|
||||
/// Introduce a block structure and switch off comms on boundaries
|
||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||
@ -765,6 +781,10 @@ public:
|
||||
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
|
||||
partialDirichlet = p.partialDirichlet;
|
||||
DirichletBlock(p.dirichlet); // comms send/recv set up
|
||||
fullDirichlet=0;
|
||||
for(int d=0;d<p.dirichlet.size();d++){
|
||||
if (p.dirichlet[d]) fullDirichlet=1;
|
||||
}
|
||||
|
||||
_unified_buffer_size=0;
|
||||
surface_list.resize(0);
|
||||
@ -1268,8 +1288,8 @@ public:
|
||||
|
||||
assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
|
||||
|
||||
Vector<cobj *> rpointers(maxl);
|
||||
Vector<cobj *> spointers(maxl);
|
||||
std::vector<cobj *> rpointers(maxl);
|
||||
std::vector<cobj *> spointers(maxl);
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
@ -1347,10 +1367,11 @@ public:
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
int shm_send=0;
|
||||
int shm_recv=0;
|
||||
|
||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
#ifdef SHM_FAST_PATH
|
||||
#warning STENCIL SHM FAST PATH SELECTED
|
||||
int shm_recv=0;
|
||||
// shm == receive pointer if offnode
|
||||
// shm == Translate[send pointer] if on node -- my view of his send pointer
|
||||
cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
|
||||
@ -1383,7 +1404,6 @@ public:
|
||||
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
|
||||
}
|
||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
|
||||
AddPacket((void *)sp,(void *)rp,
|
||||
xmit_to_rank,do_send,
|
||||
recv_from_rank,do_send,
|
||||
|
@ -133,7 +133,6 @@ typename vobj::scalar_object extractLane(int lane, const vobj & __restrict__ vec
|
||||
typedef scalar_type * pointer;
|
||||
|
||||
constexpr int words=sizeof(vobj)/sizeof(vector_type);
|
||||
constexpr int Nsimd=vector_type::Nsimd();
|
||||
|
||||
scalar_object extracted;
|
||||
pointer __restrict__ sp = (pointer)&extracted; // Type pun
|
||||
@ -153,7 +152,6 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob
|
||||
typedef scalar_type * pointer;
|
||||
|
||||
constexpr int words=sizeof(vobj)/sizeof(vector_type);
|
||||
constexpr int Nsimd=vector_type::Nsimd();
|
||||
|
||||
pointer __restrict__ sp = (pointer)&extracted;
|
||||
vector_type *vp = (vector_type *)&vec;
|
||||
@ -178,8 +176,6 @@ void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int off
|
||||
const int s = Nsimd/Nextr;
|
||||
|
||||
vector_type * vp = (vector_type *)&vec;
|
||||
scalar_type vtmp;
|
||||
sobj_scalar_type stmp;
|
||||
for(int w=0;w<words;w++){
|
||||
for(int i=0;i<Nextr;i++){
|
||||
sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
|
||||
@ -205,7 +201,6 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
|
||||
|
||||
vector_type * vp = (vector_type *)&vec;
|
||||
scalar_type vtmp;
|
||||
sobj_scalar_type stmp;
|
||||
for(int w=0;w<words;w++){
|
||||
for(int i=0;i<Nextr;i++){
|
||||
sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
|
||||
@ -226,7 +221,7 @@ template<class vobjOut, class vobjIn>
|
||||
accelerator_inline
|
||||
void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
|
||||
{
|
||||
static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
|
||||
static_assert( std::is_same<typename vobjOut::scalar_typeD, typename vobjIn::scalar_typeD>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
|
||||
|
||||
typedef typename vobjOut::vector_type ovector_type;
|
||||
typedef typename vobjIn::vector_type ivector_type;
|
||||
@ -242,18 +237,15 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
|
||||
typedef oextract_type * opointer;
|
||||
typedef iextract_type * ipointer;
|
||||
|
||||
constexpr int oNsimd=ovector_type::Nsimd();
|
||||
constexpr int iNsimd=ivector_type::Nsimd();
|
||||
|
||||
iscalar_type itmp;
|
||||
oscalar_type otmp;
|
||||
|
||||
ovector_type * __restrict__ op = (ovector_type *)&vecOut;
|
||||
ivector_type * __restrict__ ip = (ivector_type *)&vecIn;
|
||||
for(int w=0;w<owords;w++){
|
||||
itmp = ip[iNsimd*w].getlane(lane_in);
|
||||
itmp = ip[w].getlane(lane_in);
|
||||
otmp = itmp; //potential precision change
|
||||
op[oNsimd*w].putlane(otmp,lane_out);
|
||||
op[w].putlane(otmp,lane_out);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -458,6 +458,7 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
|
||||
// Common on all GPU targets
|
||||
//////////////////////////////////////////////
|
||||
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
|
||||
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
|
||||
|
||||
#define accelerator_for( iter, num, nsimd, ... ) \
|
||||
@ -525,7 +526,7 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);};
|
||||
//////////////////////////////////////////////
|
||||
|
||||
#ifdef GRID_SYCL
|
||||
inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
|
||||
inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); };
|
||||
#else
|
||||
// Ordering within a stream guaranteed on Nvidia & AMD
|
||||
inline void acceleratorFenceComputeStream(void){ };
|
||||
|
@ -167,14 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
|
||||
return;
|
||||
}
|
||||
|
||||
void GridCmdOptionFloat(std::string &str,float & val)
|
||||
void GridCmdOptionFloat(std::string &str,double & val)
|
||||
{
|
||||
std::stringstream ss(str);
|
||||
ss>>val;
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
void GridParseLayout(char **argv,int argc,
|
||||
Coordinate &latt_c,
|
||||
Coordinate &mpi_c)
|
||||
|
@ -57,7 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
|
||||
template<class VectorInt>
|
||||
void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
|
||||
void GridCmdOptionInt(std::string &str,int & val);
|
||||
void GridCmdOptionFloat(std::string &str,float & val);
|
||||
void GridCmdOptionFloat(std::string &str,double & val);
|
||||
|
||||
|
||||
void GridParseLayout(char **argv,int argc,
|
||||
|
@ -164,11 +164,6 @@ int main(int argc, char **argv) {
|
||||
typedef MobiusEOFAFermionF FermionEOFAActionF;
|
||||
typedef typename FermionActionF::FermionField FermionFieldF;
|
||||
|
||||
typedef WilsonImplD2 FermionImplPolicyD2;
|
||||
typedef MobiusFermionD2 FermionActionD2;
|
||||
typedef MobiusEOFAFermionD2 FermionEOFAActionD2;
|
||||
typedef typename FermionActionD2::FermionField FermionFieldD2;
|
||||
|
||||
typedef Grid::XmlReader Serialiser;
|
||||
|
||||
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
||||
@ -232,31 +227,34 @@ int main(int argc, char **argv) {
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
|
||||
|
||||
OneFlavourRationalParams OFRp; // Up/down
|
||||
OFRp.lo = 4.0e-5;
|
||||
int SP_iters=9000;
|
||||
|
||||
RationalActionParams OFRp; // Up/down
|
||||
OFRp.lo = 6.0e-5;
|
||||
OFRp.hi = 90.0;
|
||||
OFRp.MaxIter = 60000;
|
||||
OFRp.tolerance= 1.0e-5;
|
||||
OFRp.mdtolerance= 1.0e-3;
|
||||
OFRp.inv_pow = 2;
|
||||
OFRp.MaxIter = SP_iters; // get most shifts by 2000, stop sharing space
|
||||
OFRp.action_tolerance= 1.0e-8;
|
||||
OFRp.action_degree = 18;
|
||||
OFRp.md_tolerance= 1.0e-5;
|
||||
OFRp.md_degree = 14;
|
||||
// OFRp.degree = 20; converges
|
||||
// OFRp.degree = 16;
|
||||
OFRp.degree = 18;
|
||||
OFRp.precision= 80;
|
||||
OFRp.BoundsCheckFreq=0;
|
||||
std::vector<RealD> ActionTolByPole({
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-7,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8
|
||||
});
|
||||
std::vector<RealD> MDTolByPole({
|
||||
1.0e-5,5.0e-6,1.0e-6,1.0e-7, // soften convergence more more
|
||||
1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
|
||||
// 1.0e-6,3.0e-7,1.0e-7,1.0e-7,
|
||||
// 3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8
|
||||
});
|
||||
|
||||
@ -265,10 +263,8 @@ int main(int argc, char **argv) {
|
||||
|
||||
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
|
||||
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
|
||||
typedef SchurDiagMooeeOperator<FermionActionD2,FermionFieldD2 > LinearOperatorD2;
|
||||
typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
|
||||
typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
|
||||
typedef SchurDiagMooeeOperator<FermionEOFAActionD2,FermionFieldD2 > LinearOperatorEOFAD2;
|
||||
typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
|
||||
typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
|
||||
|
||||
@ -321,7 +317,6 @@ int main(int argc, char **argv) {
|
||||
// temporarily need a gauge field
|
||||
LatticeGaugeFieldD U(GridPtr); U=Zero();
|
||||
LatticeGaugeFieldF UF(GridPtrF); UF=Zero();
|
||||
LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero();
|
||||
|
||||
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
|
||||
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
|
||||
@ -340,6 +335,7 @@ int main(int argc, char **argv) {
|
||||
ParamsDirF.dirichlet=Dirichlet;
|
||||
ParamsDir.partialDirichlet=1;
|
||||
ParamsDirF.partialDirichlet=1;
|
||||
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
|
||||
|
||||
// double StoppingCondition = 1e-14;
|
||||
// double MDStoppingCondition = 1e-9;
|
||||
@ -366,12 +362,12 @@ int main(int argc, char **argv) {
|
||||
|
||||
// Probably dominates the force - back to EOFA.
|
||||
OneFlavourRationalParams SFRp;
|
||||
SFRp.lo = 0.25;
|
||||
SFRp.lo = 0.1;
|
||||
SFRp.hi = 25.0;
|
||||
SFRp.MaxIter = 10000;
|
||||
SFRp.tolerance= 1.0e-5;
|
||||
SFRp.tolerance= 1.0e-8;
|
||||
SFRp.mdtolerance= 2.0e-4;
|
||||
SFRp.degree = 8;
|
||||
SFRp.degree = 12;
|
||||
SFRp.precision= 50;
|
||||
|
||||
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
|
||||
@ -424,7 +420,7 @@ int main(int argc, char **argv) {
|
||||
ActionCGL, ActionCGR,
|
||||
DerivativeCGL, DerivativeCGR,
|
||||
SFRp, true);
|
||||
// Level2.push_back(&EOFA);
|
||||
Level2.push_back(&EOFA);
|
||||
|
||||
////////////////////////////////////
|
||||
// up down action
|
||||
@ -449,17 +445,15 @@ int main(int argc, char **argv) {
|
||||
std::vector<FermionAction *> Denominators;
|
||||
std::vector<FermionActionF *> NumeratorsF;
|
||||
std::vector<FermionActionF *> DenominatorsF;
|
||||
std::vector<FermionActionD2 *> NumeratorsD2;
|
||||
std::vector<FermionActionD2 *> DenominatorsD2;
|
||||
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
|
||||
std::vector<MxPCG *> ActionMPCG;
|
||||
std::vector<MxPCG *> MPCG;
|
||||
|
||||
#define MIXED_PRECISION
|
||||
#ifdef MIXED_PRECISION
|
||||
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
|
||||
std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys;
|
||||
#else
|
||||
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
|
||||
std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
|
||||
#endif
|
||||
|
||||
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
|
||||
@ -532,31 +526,17 @@ int main(int argc, char **argv) {
|
||||
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
|
||||
} else {
|
||||
#ifdef MIXED_PRECISION
|
||||
// Use the D2 data types and make them use same grid as single
|
||||
FermionActionD2::ImplParams ParamsDenD2(boundary);
|
||||
FermionActionD2::ImplParams ParamsNumD2(boundary);
|
||||
|
||||
ParamsDenD2.dirichlet = ParamsDen.dirichlet;
|
||||
ParamsDenD2.partialDirichlet = ParamsDen.partialDirichlet;
|
||||
DenominatorsD2.push_back(new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenD2));
|
||||
|
||||
ParamsNumD2.dirichlet = ParamsNum.dirichlet;
|
||||
ParamsNumD2.partialDirichlet = ParamsNum.partialDirichlet;
|
||||
NumeratorsD2.push_back (new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumD2));
|
||||
|
||||
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
|
||||
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
|
||||
*Numerators[h],*Denominators[h],
|
||||
*NumeratorsF[h],*DenominatorsF[h],
|
||||
*NumeratorsD2[h],*DenominatorsD2[h],
|
||||
OFRp, 400) );
|
||||
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
|
||||
OFRp, SP_iters) );
|
||||
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
|
||||
*Numerators[h],*Denominators[h],
|
||||
*NumeratorsF[h],*DenominatorsF[h],
|
||||
*NumeratorsD2[h],*DenominatorsD2[h],
|
||||
OFRp, 400) );
|
||||
OFRp, SP_iters) );
|
||||
#else
|
||||
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
|
||||
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
|
||||
Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
|
||||
Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
@ -329,7 +329,6 @@ int main(int argc, char **argv) {
|
||||
|
||||
|
||||
auto grid4= GridPtr;
|
||||
auto rbgrid4= GridRBPtr;
|
||||
auto rbgrid = StrangeOp.FermionRedBlackGrid();
|
||||
auto grid = StrangeOp.FermionGrid();
|
||||
if(1){
|
||||
|
@ -164,11 +164,6 @@ int main(int argc, char **argv) {
|
||||
typedef MobiusEOFAFermionF FermionEOFAActionF;
|
||||
typedef typename FermionActionF::FermionField FermionFieldF;
|
||||
|
||||
typedef WilsonImplD2 FermionImplPolicyD2;
|
||||
typedef MobiusFermionD2 FermionActionD2;
|
||||
typedef MobiusEOFAFermionD2 FermionEOFAActionD2;
|
||||
typedef typename FermionActionD2::FermionField FermionFieldD2;
|
||||
|
||||
typedef Grid::XmlReader Serialiser;
|
||||
|
||||
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
||||
@ -183,7 +178,7 @@ int main(int argc, char **argv) {
|
||||
// 4/2 => 0.6 dH
|
||||
// 3/3 => 0.8 dH .. depth 3, slower
|
||||
//MD.MDsteps = 4;
|
||||
MD.MDsteps = 3;
|
||||
MD.MDsteps = 12;
|
||||
MD.trajL = 0.5;
|
||||
|
||||
HMCparameters HMCparams;
|
||||
@ -200,8 +195,8 @@ int main(int argc, char **argv) {
|
||||
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
|
||||
|
||||
CheckpointerParameters CPparams;
|
||||
CPparams.config_prefix = "ckpoint_DDHMC_lat";
|
||||
CPparams.rng_prefix = "ckpoint_DDHMC_rng";
|
||||
CPparams.config_prefix = "ckpoint_HMC_lat";
|
||||
CPparams.rng_prefix = "ckpoint_HMC_rng";
|
||||
CPparams.saveInterval = 1;
|
||||
CPparams.format = "IEEE64BIG";
|
||||
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
|
||||
@ -228,7 +223,7 @@ int main(int argc, char **argv) {
|
||||
Real pv_mass = 1.0;
|
||||
// std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
|
||||
std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
|
||||
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
|
||||
|
||||
auto GridPtr = TheHMC.Resources.GetCartesian();
|
||||
@ -250,11 +245,6 @@ int main(int argc, char **argv) {
|
||||
|
||||
GlobalSharedMemory::GetShmDims(mpi,shm);
|
||||
|
||||
Coordinate CommDim(Nd);
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
|
||||
|
||||
Coordinate NonDirichlet(Nd+1,0);
|
||||
|
||||
//////////////////////////
|
||||
// Fermion Grids
|
||||
//////////////////////////
|
||||
@ -272,7 +262,6 @@ int main(int argc, char **argv) {
|
||||
// temporarily need a gauge field
|
||||
LatticeGaugeFieldD U(GridPtr); U=Zero();
|
||||
LatticeGaugeFieldF UF(GridPtrF); UF=Zero();
|
||||
LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero();
|
||||
|
||||
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
|
||||
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
|
||||
@ -283,8 +272,6 @@ int main(int argc, char **argv) {
|
||||
std::vector<Complex> boundary = {1,1,1,-1};
|
||||
FermionAction::ImplParams Params(boundary);
|
||||
FermionActionF::ImplParams ParamsF(boundary);
|
||||
Params.dirichlet=NonDirichlet;
|
||||
ParamsF.dirichlet=NonDirichlet;
|
||||
|
||||
// double StoppingCondition = 1e-14;
|
||||
// double MDStoppingCondition = 1e-9;
|
||||
@ -299,8 +286,8 @@ int main(int argc, char **argv) {
|
||||
////////////////////////////////////
|
||||
// Collect actions
|
||||
////////////////////////////////////
|
||||
ActionLevel<HMCWrapper::Field> Level1(1);
|
||||
ActionLevel<HMCWrapper::Field> Level2(3);
|
||||
// ActionLevel<HMCWrapper::Field> Level1(1);
|
||||
ActionLevel<HMCWrapper::Field> Level2(1);
|
||||
ActionLevel<HMCWrapper::Field> Level3(15);
|
||||
|
||||
////////////////////////////////////
|
||||
@ -311,12 +298,12 @@ int main(int argc, char **argv) {
|
||||
|
||||
// Probably dominates the force - back to EOFA.
|
||||
OneFlavourRationalParams SFRp;
|
||||
SFRp.lo = 0.25;
|
||||
SFRp.hi = 25.0;
|
||||
SFRp.lo = 0.1;
|
||||
SFRp.hi = 30.0;
|
||||
SFRp.MaxIter = 10000;
|
||||
SFRp.tolerance= 1.0e-5;
|
||||
SFRp.mdtolerance= 2.0e-4;
|
||||
SFRp.degree = 8;
|
||||
SFRp.tolerance= 1.0e-8;
|
||||
SFRp.mdtolerance= 2.0e-6;
|
||||
SFRp.degree = 10;
|
||||
SFRp.precision= 50;
|
||||
|
||||
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
|
||||
@ -369,40 +356,36 @@ int main(int argc, char **argv) {
|
||||
ActionCGL, ActionCGR,
|
||||
DerivativeCGL, DerivativeCGR,
|
||||
SFRp, true);
|
||||
// Level2.push_back(&EOFA);
|
||||
Level2.push_back(&EOFA);
|
||||
|
||||
////////////////////////////////////
|
||||
// up down action
|
||||
////////////////////////////////////
|
||||
std::vector<Real> light_den;
|
||||
std::vector<Real> light_num;
|
||||
std::vector<int> dirichlet_den;
|
||||
std::vector<int> dirichlet_num;
|
||||
|
||||
int n_hasenbusch = hasenbusch.size();
|
||||
light_den.push_back(light_mass); dirichlet_den.push_back(0);
|
||||
light_den.push_back(light_mass);
|
||||
for(int h=0;h<n_hasenbusch;h++){
|
||||
light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(0);
|
||||
light_den.push_back(hasenbusch[h]);
|
||||
}
|
||||
|
||||
for(int h=0;h<n_hasenbusch;h++){
|
||||
light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(0);
|
||||
light_num.push_back(hasenbusch[h]);
|
||||
}
|
||||
light_num.push_back(pv_mass); dirichlet_num.push_back(0);
|
||||
light_num.push_back(pv_mass);
|
||||
|
||||
std::vector<FermionAction *> Numerators;
|
||||
std::vector<FermionAction *> Denominators;
|
||||
std::vector<FermionActionF *> NumeratorsF;
|
||||
std::vector<FermionActionF *> DenominatorsF;
|
||||
std::vector<FermionActionD2 *> NumeratorsD2;
|
||||
std::vector<FermionActionD2 *> DenominatorsD2;
|
||||
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
|
||||
std::vector<MxPCG *> ActionMPCG;
|
||||
std::vector<MxPCG *> MPCG;
|
||||
|
||||
#define MIXED_PRECISION
|
||||
#ifdef MIXED_PRECISION
|
||||
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
|
||||
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys;
|
||||
#else
|
||||
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
|
||||
#endif
|
||||
@ -416,9 +399,7 @@ int main(int argc, char **argv) {
|
||||
std::cout << GridLogMessage
|
||||
<< " 2f quotient Action ";
|
||||
std::cout << "det D("<<light_den[h]<<")";
|
||||
if ( dirichlet_den[h] ) std::cout << "^dirichlet ";
|
||||
std::cout << "/ det D("<<light_num[h]<<")";
|
||||
if ( dirichlet_num[h] ) std::cout << "^dirichlet ";
|
||||
std::cout << std::endl;
|
||||
|
||||
FermionAction::ImplParams ParamsNum(boundary);
|
||||
@ -426,21 +407,11 @@ int main(int argc, char **argv) {
|
||||
FermionActionF::ImplParams ParamsDenF(boundary);
|
||||
FermionActionF::ImplParams ParamsNumF(boundary);
|
||||
|
||||
ParamsNum.dirichlet = NonDirichlet;
|
||||
ParamsDen.dirichlet = NonDirichlet;
|
||||
|
||||
ParamsNum.partialDirichlet = 0;
|
||||
ParamsDen.partialDirichlet = 0;
|
||||
|
||||
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
|
||||
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
|
||||
|
||||
ParamsDenF.dirichlet = ParamsDen.dirichlet;
|
||||
ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
|
||||
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
|
||||
|
||||
ParamsNumF.dirichlet = ParamsNum.dirichlet;
|
||||
ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
|
||||
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
|
||||
|
||||
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
|
||||
@ -477,7 +448,6 @@ int main(int argc, char **argv) {
|
||||
// Gauge action
|
||||
/////////////////////////////////////////////////////////////
|
||||
Level3.push_back(&GaugeAction);
|
||||
TheHMC.TheAction.push_back(Level1);
|
||||
TheHMC.TheAction.push_back(Level2);
|
||||
TheHMC.TheAction.push_back(Level3);
|
||||
std::cout << GridLogMessage << " Action complete "<< std::endl;
|
||||
|
@ -1,7 +1,8 @@
|
||||
# Grid [),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview)
|
||||
|
||||
# Grid
|
||||
**Data parallel C++ mathematical object library.**
|
||||
|
||||
[),branch:default:true)/statusIcon.svg)](https://ci.dev.dirac.ed.ac.uk/project/GridBasedSoftware_Grid?mode=builds)
|
||||
|
||||
License: GPL v2.
|
||||
|
||||
Last update June 2017.
|
||||
|
10
TODO
10
TODO
@ -1,3 +1,12 @@
|
||||
- - Slice sum optimisation & A2A - atomic addition
|
||||
- - Also faster non-atomic reduction
|
||||
- - Remaining PRs
|
||||
- - DDHMC
|
||||
- - MixedPrec is the action eval, high precision
|
||||
- - MixedPrecCleanup is the force eval, low precision
|
||||
|
||||
=================
|
||||
=================
|
||||
Lattice_basis.h -- > HIP and SYCL GPU code
|
||||
|
||||
|
||||
@ -8,6 +17,7 @@ DDHMC
|
||||
-- Multishift Mixed Precision - DONE
|
||||
-- Pole dependent residual - DONE
|
||||
|
||||
|
||||
=======
|
||||
-- comms threads issue??
|
||||
-- Part done: Staggered kernel performance on GPU
|
||||
|
@ -425,7 +425,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
|
||||
|
||||
err = r_eo-result;
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
|
||||
assert(n2e<1.0e-4);
|
||||
|
||||
pickCheckerboard(Even,src_e,err);
|
||||
|
387
benchmarks/Benchmark_dwf_fp32_paranoid.cc
Normal file
387
benchmarks/Benchmark_dwf_fp32_paranoid.cc
Normal file
@ -0,0 +1,387 @@
|
||||
/*************************************************************************************
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
Source file: ./benchmarks/Benchmark_dwf_fp32_paranoid.cc
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#ifdef GRID_CUDA
|
||||
#define CUDA_PROFILE
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_PROFILE
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
// Minimal aggregate that wraps a single value of type T in a member
// named `internal`.
template <class T>
struct scal
{
  T internal;
};
|
||||
|
||||
// Gamma-matrix selector for each of the four directions, indexed by mu
// (0 = X, 1 = Y, 2 = Z, 3 = T). Used by the Cshift-based reference
// implementations in main().
Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX,   // mu = 0
  Gamma::Algebra::GammaY,   // mu = 1
  Gamma::Algebra::GammaZ,   // mu = 2
  Gamma::Algebra::GammaT    // mu = 3
};
|
||||
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
int Ls=16;
|
||||
for(int i=0;i<argc;i++)
|
||||
if(std::string(argv[i]) == "-Ls"){
|
||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||
}
|
||||
|
||||
GridLogLayout();
|
||||
|
||||
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
|
||||
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
|
||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
LatticeFermionF src (FGrid); random(RNG5,src);
|
||||
LatticeFermionF src1 (FGrid); random(RNG5,src1);
|
||||
#if 0
|
||||
src = Zero();
|
||||
{
|
||||
Coordinate origin({0,0,0,latt4[2]-1,0});
|
||||
SpinColourVectorF tmp;
|
||||
tmp=Zero();
|
||||
tmp()(0)(0)=Complex(-2.0,0.0);
|
||||
std::cout << " source site 0 " << tmp<<std::endl;
|
||||
pokeSite(tmp,src,origin);
|
||||
}
|
||||
#else
|
||||
RealD N2 = 1.0/::sqrt(norm2(src));
|
||||
src = src*N2;
|
||||
#endif
|
||||
|
||||
|
||||
LatticeFermionF result(FGrid); result=Zero();
|
||||
LatticeFermionF ref(FGrid); ref=Zero();
|
||||
LatticeFermionF tmp(FGrid);
|
||||
LatticeFermionF err(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||
LatticeGaugeFieldF Umu(UGrid);
|
||||
SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||
#if 0
|
||||
Umu=1.0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
LatticeColourMatrixF ttmp(UGrid);
|
||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
// if (mu !=2 ) ttmp = 0;
|
||||
// ttmp = ttmp* pow(10.0,mu);
|
||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
||||
#endif
|
||||
|
||||
////////////////////////////////////
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
// replicate across fifth dimension
|
||||
// LatticeGaugeFieldF Umu5d(FGrid);
|
||||
std::vector<LatticeColourMatrixF> U(4,UGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||
|
||||
if (1)
|
||||
{
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =1.8;
|
||||
|
||||
RealD NP = UGrid->_Nprocessors;
|
||||
RealD NN = UGrid->NodeCount();
|
||||
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
|
||||
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
|
||||
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
#endif
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
|
||||
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||
int ncall =100;
|
||||
|
||||
if (1) {
|
||||
FGrid->Barrier();
|
||||
Dw.Dhop(src,result,0);
|
||||
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.Dhop(src1,result,0);
|
||||
Dw.Dhop(src,result,0);
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
assert (norm2(err)< 1.0e-4 );
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=single_site_flops*volume*ncall;
|
||||
|
||||
auto nsimd = vComplex::Nsimd();
|
||||
auto simdwidth = sizeof(vComplex);
|
||||
|
||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
||||
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
|
||||
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
|
||||
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
//exit(0);
|
||||
|
||||
if(( norm2(err)>1.0e-4) ) {
|
||||
|
||||
/*
|
||||
std::cout << "RESULT\n " << result<<std::endl;
|
||||
std::cout << "REF \n " << ref <<std::endl;
|
||||
std::cout << "ERR \n " << err <<std::endl;
|
||||
*/
|
||||
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
|
||||
FGrid->Barrier();
|
||||
exit(-1);
|
||||
}
|
||||
assert (norm2(err)< 1.0e-4 );
|
||||
}
|
||||
|
||||
if (1)
|
||||
{ // Naive wilson dag implementation
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
autoView( tmp_v, tmp, CpuRead);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
int i=s+Ls*ss;
|
||||
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
// tmp =adj(U[mu])*src;
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
autoView( tmp_v, tmp, CpuRead);
|
||||
for(int i=0;i<ref_v.size();i++){
|
||||
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||
}
|
||||
}
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
// dump=1;
|
||||
Dw.Dhop(src,result,1);
|
||||
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
|
||||
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
|
||||
if((norm2(err)>1.0e-4)){
|
||||
/*
|
||||
std::cout<< "DAG RESULT\n " <<ref << std::endl;
|
||||
std::cout<< "DAG sRESULT\n " <<result << std::endl;
|
||||
std::cout<< "DAG ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
LatticeFermionF src_e (FrbGrid);
|
||||
LatticeFermionF src_o (FrbGrid);
|
||||
LatticeFermionF r_e (FrbGrid);
|
||||
LatticeFermionF r_o (FrbGrid);
|
||||
LatticeFermionF r_eo (FGrid);
|
||||
|
||||
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
|
||||
pickCheckerboard(Even,src_e,src);
|
||||
pickCheckerboard(Odd,src_o,src);
|
||||
|
||||
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
|
||||
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
|
||||
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
#endif
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
{
|
||||
FGrid->Barrier();
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
#ifdef CUDA_PROFILE
|
||||
if(i==10) cudaProfilerStart();
|
||||
#endif
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
#ifdef CUDA_PROFILE
|
||||
if(i==20) cudaProfilerStop();
|
||||
#endif
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=(single_site_flops*volume*ncall)/2.0;
|
||||
|
||||
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
|
||||
}
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
Dw.DhopOE(src_e,r_o,DaggerNo);
|
||||
Dw.Dhop (src ,result,DaggerNo);
|
||||
|
||||
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
|
||||
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
|
||||
|
||||
setCheckerboard(r_eo,r_o);
|
||||
setCheckerboard(r_eo,r_e);
|
||||
|
||||
err = r_eo-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
if((norm2(err)>1.0e-4)){
|
||||
/*
|
||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
|
||||
pickCheckerboard(Even,src_e,err);
|
||||
pickCheckerboard(Odd,src_o,err);
|
||||
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
|
||||
|
||||
assert(norm2(src_e)<1.0e-4);
|
||||
assert(norm2(src_o)<1.0e-4);
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
131
benchmarks/Benchmark_halo.cc
Normal file
131
benchmarks/Benchmark_halo.cc
Normal file
@ -0,0 +1,131 @@
|
||||
/*************************************************************************************
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
Source file: ./benchmarks/Benchmark_halo.cc
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#ifdef GRID_CUDA
|
||||
#define CUDA_PROFILE
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_PROFILE
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
// Aggregate wrapper that stores exactly one value of the wrapped type
// in its public member `internal`.
template<typename Wrapped>
struct scal {
  Wrapped internal;
};
|
||||
|
||||
// Lookup table of the Gamma::Algebra identifiers GammaX, GammaY, GammaZ, GammaT,
// stored in that order (index 0..3).
Gamma::Algebra Gmu [] = {
  Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
  Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT
};
|
||||
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
Coordinate latt4= GridDefaultLatt();
|
||||
Coordinate mpi = GridDefaultMpi();
|
||||
Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd());
|
||||
|
||||
GridLogLayout();
|
||||
|
||||
int Ls=16;
|
||||
for(int i=0;i<argc;i++)
|
||||
if(std::string(argv[i]) == "-Ls"){
|
||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||
}
|
||||
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4,simd ,mpi);
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
|
||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
LatticeFermionF src (FGrid); random(RNG5,src);
|
||||
RealD N2 = 1.0/::sqrt(norm2(src));
|
||||
src = src*N2;
|
||||
|
||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||
LatticeGaugeFieldF Umu(UGrid);
|
||||
SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =1.8;
|
||||
|
||||
RealD NP = UGrid->_Nprocessors;
|
||||
RealD NN = UGrid->NodeCount();
|
||||
|
||||
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||
|
||||
const int ncall = 500;
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::HaloGatherOpt "<<std::endl;
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
{
|
||||
typename DomainWallFermionF::Compressor compressor(0);
|
||||
FGrid->Barrier();
|
||||
Dw.Stencil.HaloExchangeOptGather(src,compressor);
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.Stencil.HaloExchangeOptGather(src,compressor);
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double bytes=0.0;
|
||||
if(mpi[0]) bytes+=latt4[1]*latt4[2]*latt4[3];
|
||||
if(mpi[1]) bytes+=latt4[0]*latt4[2]*latt4[3];
|
||||
if(mpi[2]) bytes+=latt4[0]*latt4[1]*latt4[3];
|
||||
if(mpi[3]) bytes+=latt4[0]*latt4[1]*latt4[2];
|
||||
bytes = bytes * Ls * 8.* (24.+12.)* 2.0;
|
||||
|
||||
std::cout<<GridLogMessage << "Gather us /call = "<< (t1-t0)/ncall<<std::endl;
|
||||
std::cout<<GridLogMessage << "Gather MBs /call = "<< bytes*ncall/(t1-t0)<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
189
benchmarks/Benchmark_prec_change.cc
Normal file
189
benchmarks/Benchmark_prec_change.cc
Normal file
@ -0,0 +1,189 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./benchmarks/Benchmark_prec_change.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Christopher Kelly <ckelly@bnl.gov>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
int Ls = 12;
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
|
||||
GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
|
||||
GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
|
||||
GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
|
||||
|
||||
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
|
||||
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
|
||||
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
|
||||
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4);
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5);
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
LatticeFermionD field_d(FGridD), tmp_d(FGridD);
|
||||
random(RNG5,field_d); tmp_d = field_d;
|
||||
|
||||
LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
|
||||
precisionChange(field_d2, field_d); tmp_d2 = field_d2;
|
||||
|
||||
LatticeFermionF field_f(FGridF), tmp_f(FGridF);
|
||||
precisionChange(field_f, field_d); tmp_f = field_f;
|
||||
|
||||
int N = 500;
|
||||
|
||||
double time_ds = 0, time_sd = 0;
|
||||
|
||||
std::cout<<GridLogMessage << "Benchmarking single<->double original implementation (fields initially device-resident)" << std::endl;
|
||||
for(int i=0;i<N;i++){
|
||||
//We want to benchmark the typical scenario of both fields being device resident
|
||||
//To do this, invoke an operation that will open a device view and touch all sites
|
||||
//with a write operation that invalidates the CPU copy
|
||||
field_d = tmp_d;
|
||||
field_f = tmp_f;
|
||||
|
||||
double start=usecond();
|
||||
precisionChangeOrig(field_d,field_f);
|
||||
double stop=usecond();
|
||||
time_sd += stop - start;
|
||||
|
||||
field_d = tmp_d;
|
||||
field_f = tmp_f;
|
||||
|
||||
start=usecond();
|
||||
precisionChangeOrig(field_f,field_d);
|
||||
stop=usecond();
|
||||
time_ds += stop - start;
|
||||
}
|
||||
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
|
||||
|
||||
|
||||
precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
|
||||
precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
|
||||
|
||||
std::cout<<GridLogMessage << "Benchmarking single<->double with pregenerated workspace(fields initially device-resident)" << std::endl;
|
||||
time_sd = time_ds = 0;
|
||||
for(int i=0;i<N;i++){
|
||||
field_d = tmp_d;
|
||||
field_f = tmp_f;
|
||||
|
||||
double start=usecond();
|
||||
precisionChange(field_d,field_f, wk_sp_to_dp);
|
||||
double stop=usecond();
|
||||
time_sd += stop - start;
|
||||
|
||||
field_d = tmp_d;
|
||||
field_f = tmp_f;
|
||||
|
||||
start=usecond();
|
||||
precisionChange(field_f,field_d, wk_dp_to_sp);
|
||||
stop=usecond();
|
||||
time_ds += stop - start;
|
||||
}
|
||||
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
|
||||
|
||||
std::cout<<GridLogMessage << "Benchmarking single<->double with workspace generated on-the-fly (fields initially device-resident)" << std::endl;
|
||||
time_sd = time_ds = 0;
|
||||
for(int i=0;i<N;i++){
|
||||
field_d = tmp_d;
|
||||
field_f = tmp_f;
|
||||
|
||||
double start=usecond();
|
||||
precisionChange(field_d,field_f);
|
||||
double stop=usecond();
|
||||
time_sd += stop - start;
|
||||
|
||||
field_d = tmp_d;
|
||||
field_f = tmp_f;
|
||||
|
||||
start=usecond();
|
||||
precisionChange(field_f,field_d);
|
||||
stop=usecond();
|
||||
time_ds += stop - start;
|
||||
|
||||
}
|
||||
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "Benchmarking single<->double2 (fields initially device-resident)" << std::endl;
|
||||
time_sd = time_ds = 0;
|
||||
for(int i=0;i<N;i++){
|
||||
field_d2 = tmp_d2;
|
||||
field_f = tmp_f;
|
||||
|
||||
double start=usecond();
|
||||
precisionChangeFast(field_d2,field_f);
|
||||
double stop=usecond();
|
||||
time_sd += stop - start;
|
||||
|
||||
field_d2 = tmp_d2;
|
||||
field_f = tmp_f;
|
||||
|
||||
start=usecond();
|
||||
precisionChangeFast(field_f,field_d2);
|
||||
stop=usecond();
|
||||
time_ds += stop - start;
|
||||
}
|
||||
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "Benchmarking single<->double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl;
|
||||
time_sd = time_ds = 0;
|
||||
for(int i=0;i<N;i++){
|
||||
field_d2 = tmp_d2;
|
||||
field_f = tmp_f;
|
||||
|
||||
double start=usecond();
|
||||
precisionChange(field_d2,field_f);
|
||||
double stop=usecond();
|
||||
time_sd += stop - start;
|
||||
|
||||
field_d2 = tmp_d2;
|
||||
field_f = tmp_f;
|
||||
|
||||
start=usecond();
|
||||
precisionChange(field_f,field_d2);
|
||||
stop=usecond();
|
||||
time_ds += stop - start;
|
||||
}
|
||||
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
}
|
133
examples/socket_grid.cc
Normal file
133
examples/socket_grid.cc
Normal file
@ -0,0 +1,133 @@
|
||||
#include <sys/socket.h>
|
||||
#include <sys/un.h>
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <err.h>
|
||||
#include <fcntl.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
static int sock;
|
||||
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
|
||||
static char sock_path[256];
|
||||
|
||||
class UnixSockets {
|
||||
public:
|
||||
static void Open(int rank)
|
||||
{
|
||||
int errnum;
|
||||
|
||||
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
|
||||
printf("allocated socket %d\n",sock);
|
||||
|
||||
struct sockaddr_un sa_un = { 0 };
|
||||
sa_un.sun_family = AF_UNIX;
|
||||
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
|
||||
unlink(sa_un.sun_path);
|
||||
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
|
||||
perror("bind failure");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
printf("bound socket %d to %s\n",sock,sa_un.sun_path);
|
||||
}
|
||||
|
||||
static int RecvFileDescriptor(void)
|
||||
{
|
||||
int n;
|
||||
int fd;
|
||||
char buf[1];
|
||||
struct iovec iov;
|
||||
struct msghdr msg;
|
||||
struct cmsghdr *cmsg;
|
||||
char cms[CMSG_SPACE(sizeof(int))];
|
||||
|
||||
iov.iov_base = buf;
|
||||
iov.iov_len = 1;
|
||||
|
||||
memset(&msg, 0, sizeof msg);
|
||||
msg.msg_name = 0;
|
||||
msg.msg_namelen = 0;
|
||||
msg.msg_iov = &iov;
|
||||
msg.msg_iovlen = 1;
|
||||
|
||||
msg.msg_control = (caddr_t)cms;
|
||||
msg.msg_controllen = sizeof cms;
|
||||
|
||||
if((n=recvmsg(sock, &msg, 0)) < 0) {
|
||||
perror("recvmsg failed");
|
||||
return -1;
|
||||
}
|
||||
if(n == 0){
|
||||
perror("recvmsg returned 0");
|
||||
return -1;
|
||||
}
|
||||
cmsg = CMSG_FIRSTHDR(&msg);
|
||||
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
|
||||
printf("received fd %d from socket %d\n",fd,sock);
|
||||
return fd;
|
||||
}
|
||||
|
||||
static void SendFileDescriptor(int fildes,int xmit_to_rank)
|
||||
{
|
||||
struct msghdr msg;
|
||||
struct iovec iov;
|
||||
struct cmsghdr *cmsg = NULL;
|
||||
char ctrl[CMSG_SPACE(sizeof(int))];
|
||||
char data = ' ';
|
||||
|
||||
memset(&msg, 0, sizeof(struct msghdr));
|
||||
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
|
||||
iov.iov_base = &data;
|
||||
iov.iov_len = sizeof(data);
|
||||
|
||||
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
|
||||
printf("sending FD %d over socket %d to rank %d AF_UNIX path %s\n",fildes,sock,xmit_to_rank,sock_path);fflush(stdout);
|
||||
|
||||
struct sockaddr_un sa_un = { 0 };
|
||||
sa_un.sun_family = AF_UNIX;
|
||||
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
|
||||
|
||||
msg.msg_name = (void *)&sa_un;
|
||||
msg.msg_namelen = sizeof(sa_un);
|
||||
msg.msg_iov = &iov;
|
||||
msg.msg_iovlen = 1;
|
||||
msg.msg_controllen = CMSG_SPACE(sizeof(int));
|
||||
msg.msg_control = ctrl;
|
||||
|
||||
cmsg = CMSG_FIRSTHDR(&msg);
|
||||
cmsg->cmsg_level = SOL_SOCKET;
|
||||
cmsg->cmsg_type = SCM_RIGHTS;
|
||||
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
|
||||
|
||||
*((int *) CMSG_DATA(cmsg)) = fildes;
|
||||
|
||||
if ( sendmsg(sock, &msg, 0) == -1 ) perror("sendmsg failed");
|
||||
};
|
||||
};
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int me = fork()?0:1;
|
||||
|
||||
UnixSockets::Open(me);
|
||||
|
||||
// need MPI barrier
|
||||
sleep(10);
|
||||
const char * message = "Hello, World\n";
|
||||
if( me ) {
|
||||
int fd = open("foo",O_RDWR|O_CREAT,0666);
|
||||
if ( fd < 0 ) {
|
||||
perror("failed to open file");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
// rank 1 sends ot rank 0
|
||||
UnixSockets::SendFileDescriptor(fd,0);
|
||||
close(fd);
|
||||
} else {
|
||||
// rank 0 sends receives frmo rank 1
|
||||
int fd = UnixSockets::RecvFileDescriptor();
|
||||
write(fd,(const void *)message,strlen(message));
|
||||
close(fd);
|
||||
}
|
||||
}
|
@ -7,6 +7,7 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
|
||||
--enable-accelerator=hip \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-simd=GPU \
|
||||
--disable-accelerator-cshift \
|
||||
--with-gmp=$OLCF_GMP_ROOT \
|
||||
--with-fftw=$FFTW_DIR/.. \
|
||||
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
|
||||
|
@ -5,8 +5,8 @@ module load emacs
|
||||
#module load gperftools
|
||||
module load PrgEnv-gnu
|
||||
module load rocm/5.3.0
|
||||
module load cray-mpich/8.1.16
|
||||
#module load cray-mpich/8.1.17
|
||||
#module load cray-mpich/8.1.16
|
||||
module load cray-mpich/8.1.17
|
||||
module load gmp
|
||||
module load cray-fftw
|
||||
module load craype-accel-amd-gfx90a
|
||||
|
6
systems/PVC/benchmarks/run-1tile.sh
Normal file → Executable file
6
systems/PVC/benchmarks/run-1tile.sh
Normal file → Executable file
@ -4,9 +4,9 @@
|
||||
#SBATCH -p QZ1J-ICX-PVC
|
||||
##SBATCH -p QZ1J-SPR-PVC-2C
|
||||
|
||||
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
||||
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
||||
|
||||
export NT=16
|
||||
export NT=8
|
||||
|
||||
export I_MPI_OFFLOAD=1
|
||||
export I_MPI_OFFLOAD_TOPOLIB=level_zero
|
||||
@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile
|
||||
export EnableImplicitScaling=0
|
||||
export EnableWalkerPartition=0
|
||||
export ZE_AFFINITY_MASK=0.0
|
||||
mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
|
||||
mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768
|
||||
|
||||
export ZE_AFFINITY_MASK=0
|
||||
export I_MPI_OFFLOAD_CELL=device
|
||||
|
@ -1,13 +1,13 @@
|
||||
#!/bin/bash
|
||||
##SBATCH -p PVC-SPR-QZEH
|
||||
##SBATCH -p PVC-ICX-QZNW
|
||||
|
||||
#SBATCH -p QZ1J-ICX-PVC
|
||||
|
||||
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
||||
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
||||
|
||||
export NT=16
|
||||
|
||||
|
||||
# export IGC_EnableLSCFenceUGMBeforeEOT=0
|
||||
# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
|
||||
#export IGC_ShaderDumpEnable=1
|
||||
@ -19,8 +19,15 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
|
||||
export I_MPI_OFFLOAD_CELL=tile
|
||||
export EnableImplicitScaling=0
|
||||
export EnableWalkerPartition=0
|
||||
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
|
||||
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
|
||||
|
||||
#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log
|
||||
for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
|
||||
do
|
||||
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i
|
||||
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i
|
||||
done
|
||||
|
||||
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
|
||||
|
||||
|
@ -5,10 +5,5 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
|
||||
echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
|
||||
|
||||
|
||||
#if [ $MPI_LOCALRANKID = "0" ]
|
||||
#then
|
||||
# ~psteinbr/build_pti/ze_tracer -c $@
|
||||
# onetrace --chrome-kernel-timeline $@
|
||||
#else
|
||||
$@
|
||||
#fi
|
||||
|
||||
|
@ -2,14 +2,15 @@ INSTALL=/nfs/site/home/paboylx/prereqs/
|
||||
../../configure \
|
||||
--enable-simd=GPU \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-comms=mpi \
|
||||
--enable-comms=mpi-auto \
|
||||
--disable-accelerator-cshift \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=sycl \
|
||||
--enable-unified=no \
|
||||
CXX=mpicxx \
|
||||
MPICXX=mpicxx \
|
||||
CXX=dpcpp \
|
||||
LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
|
||||
CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -Wno-tautological-constant-compare -I$INSTALL/include"
|
||||
CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"
|
||||
|
||||
|
@ -3,8 +3,14 @@ export https_proxy=http://proxy-chain.intel.com:911
|
||||
export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH
|
||||
|
||||
module load intel-release
|
||||
source /opt/intel/oneapi/PVC_setup.sh
|
||||
module load intel-comp-rt/embargo-ci-neo
|
||||
|
||||
#source /opt/intel/oneapi/PVC_setup.sh
|
||||
#source /opt/intel/oneapi/ATS_setup.sh
|
||||
#module load intel-nightly/20230331
|
||||
#module load intel-comp-rt/ci-neo-master/026093
|
||||
|
||||
#module load intel/mpich
|
||||
module load intel/mpich/pvc45.3
|
||||
export PATH=~/ATS/pti-gpu/tools/onetrace/:$PATH
|
||||
|
||||
|
@ -1,3 +1,2 @@
|
||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi
|
||||
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU-RRII --enable-comms=mpi
|
||||
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU --enable-debug --enable-comms=mpi
|
||||
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=yes
|
||||
|
||||
|
@ -73,12 +73,12 @@ int main (int argc, char ** argv)
|
||||
RealD M5 =1.8;
|
||||
|
||||
std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"DomainWallFermion vectorised test"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
|
||||
std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
|
||||
std::vector<Complex> boundary = {1,1,1,-1};
|
||||
DomainWallFermionD::ImplParams Params(boundary);
|
||||
Coordinate Dirichlet({0,8,8,16,32});
|
||||
Params.dirichlet=Dirichlet;
|
||||
// Coordinate Dirichlet({0,8,8,16,32});
|
||||
// Params.dirichlet=Dirichlet;
|
||||
|
||||
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params);
|
||||
TestWhat<DomainWallFermionD>(Ddwf,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);
|
||||
|
@ -101,7 +101,7 @@ int main (int argc, char ** argv)
|
||||
std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
|
||||
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
|
||||
int iters;
|
||||
for(int i=0;i<200;i++){
|
||||
for(int i=0;i<10;i++){
|
||||
result_o = Zero();
|
||||
t1=usecond();
|
||||
mCG(src_o,result_o);
|
||||
|
@ -53,7 +53,7 @@ static int readInt(int* argc, char*** argv, std::string&& option, int defaultVal
|
||||
|
||||
static float readFloat(int* argc, char*** argv, std::string&& option, float defaultValue) {
|
||||
std::string arg;
|
||||
float ret = defaultValue;
|
||||
double ret = defaultValue;
|
||||
if(checkPresent(argc, argv, option)) {
|
||||
arg = getContent(argc, argv, option);
|
||||
GridCmdOptionFloat(arg, ret);
|
||||
|
@ -1,160 +0,0 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/Test_cshift.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace Grid;
|
||||
;
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||
|
||||
Coordinate latt_size = GridDefaultLatt();
|
||||
Coordinate simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
|
||||
Coordinate mpi_layout = GridDefaultMpi();
|
||||
|
||||
int vol = 1;
|
||||
for(int d=0;d<latt_size.size();d++){
|
||||
vol = vol * latt_size[d];
|
||||
}
|
||||
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
|
||||
GridRedBlackCartesian RBGRID(&GRID);
|
||||
|
||||
LatticeComplexD one(&GRID);
|
||||
LatticeComplexD zz(&GRID);
|
||||
LatticeComplexD C(&GRID);
|
||||
LatticeComplexD Ctilde(&GRID);
|
||||
LatticeComplexD Cref (&GRID);
|
||||
LatticeComplexD Csav (&GRID);
|
||||
LatticeComplexD coor(&GRID);
|
||||
|
||||
LatticeSpinMatrixD S(&GRID);
|
||||
LatticeSpinMatrixD Stilde(&GRID);
|
||||
|
||||
Coordinate p({1,3,2,3});
|
||||
|
||||
one = ComplexD(1.0,0.0);
|
||||
zz = ComplexD(0.0,0.0);
|
||||
|
||||
ComplexD ci(0.0,1.0);
|
||||
|
||||
std::vector<int> seeds({1,2,3,4});
|
||||
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds); // naughty seeding
|
||||
GridParallelRNG pRNG(&GRID);
|
||||
pRNG.SeedFixedIntegers(seeds);
|
||||
|
||||
LatticeGaugeFieldD Umu(&GRID);
|
||||
|
||||
SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
// Wilson test
|
||||
////////////////////////////////////////////////////
|
||||
{
|
||||
LatticeFermionD src(&GRID); gaussian(pRNG,src);
|
||||
LatticeFermionD tmp(&GRID);
|
||||
LatticeFermionD ref(&GRID);
|
||||
|
||||
RealD mass=0.01;
|
||||
WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
|
||||
|
||||
Dw.M(src,tmp);
|
||||
|
||||
std::cout << "Dw src = " <<norm2(src)<<std::endl;
|
||||
std::cout << "Dw tmp = " <<norm2(tmp)<<std::endl;
|
||||
|
||||
Dw.FreePropagator(tmp,ref,mass);
|
||||
|
||||
std::cout << "Dw ref = " <<norm2(ref)<<std::endl;
|
||||
|
||||
ref = ref - src;
|
||||
|
||||
std::cout << "Dw ref-src = " <<norm2(ref)<<std::endl;
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
// Wilson prop
|
||||
////////////////////////////////////////////////////
|
||||
{
|
||||
std::cout<<"****************************************"<<std::endl;
|
||||
std::cout << "Wilson Mom space 4d propagator \n";
|
||||
std::cout<<"****************************************"<<std::endl;
|
||||
|
||||
LatticeFermionD src(&GRID); gaussian(pRNG,src);
|
||||
LatticeFermionD tmp(&GRID);
|
||||
LatticeFermionD ref(&GRID);
|
||||
LatticeFermionD diff(&GRID);
|
||||
|
||||
src=Zero();
|
||||
Coordinate point(4,0); // 0,0,0,0
|
||||
SpinColourVectorD ferm;
|
||||
ferm=Zero();
|
||||
ferm()(0)(0) = ComplexD(1.0);
|
||||
pokeSite(ferm,src,point);
|
||||
|
||||
RealD mass=0.01;
|
||||
WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
|
||||
|
||||
// Momentum space prop
|
||||
std::cout << " Solving by FFT and Feynman rules" <<std::endl;
|
||||
Dw.FreePropagator(src,ref,mass) ;
|
||||
|
||||
Gamma G5(Gamma::Algebra::Gamma5);
|
||||
|
||||
LatticeFermionD result(&GRID);
|
||||
const int sdir=0;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Conjugate gradient on normal equations system
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
|
||||
Dw.Mdag(src,tmp);
|
||||
src=tmp;
|
||||
MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
|
||||
ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
|
||||
CG(HermOp,src,result);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
std::cout << " Taking difference" <<std::endl;
|
||||
std::cout << "Dw result "<<norm2(result)<<std::endl;
|
||||
std::cout << "Dw ref "<<norm2(ref)<<std::endl;
|
||||
|
||||
diff = ref - result;
|
||||
std::cout << "result - ref "<<norm2(diff)<<std::endl;
|
||||
|
||||
DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
|
||||
}
|
||||
|
||||
|
||||
Grid_finalize();
|
||||
}
|
110
tests/core/Test_memory_manager.cc
Normal file
110
tests/core/Test_memory_manager.cc
Normal file
@ -0,0 +1,110 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/core/Test_memory_manager.cc
|
||||
|
||||
Copyright (C) 2022
|
||||
|
||||
Author: Peter Boyle <pboyle@bnl.gov>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
void MemoryTest(GridCartesian * FGrid,int N);
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
|
||||
int N=100;
|
||||
for(int i=0;i<N;i++){
|
||||
std::cout << "============================"<<std::endl;
|
||||
std::cout << "Epoch "<<i<<"/"<<N<<std::endl;
|
||||
std::cout << "============================"<<std::endl;
|
||||
MemoryTest(UGrid,256);
|
||||
MemoryManager::Print();
|
||||
AUDIT();
|
||||
}
|
||||
Grid_finalize();
|
||||
}
|
||||
|
||||
/*
 * Randomised exercise of host / accelerator view handling.
 *
 * Builds N lattice fields A[] (all zero) together with a host-side shadow B[]
 * that records the value every site of the matching field is expected to hold.
 * Each of the 10000 steps picks a random field and randomly chooses
 *   - write or read,
 *   - a whole-lattice expression (e == 0) or an explicit site loop (e == 1,2),
 *   - for site loops, an accelerator view or a CPU view.
 * A write adds the running counter to both A[v] and B[v]; a read through a
 * site loop asserts that lane 0 of every outer site of A[v] equals B[v].
 *
 * FGrid : grid on which the lattice fields live.
 * N     : number of lattice fields to cycle through.
 */
void MemoryTest(GridCartesian * FGrid, int N)
{
  LatticeComplexD zero(FGrid); zero=Zero();
  std::vector<LatticeComplexD> A(N,zero);

  std::vector<ComplexD> B(N,ComplexD(0.0)); // Update sequentially on host

  for(int v=0;v<N;v++) A[v] = Zero();

  uint64_t counter = 0;
  for(int epoch = 0;epoch<10000;epoch++){

    // The order of these four draws fixes the pseudo-random sequence; keep it.
    int v  = random() %N; // Which vec
    int w  = random() %2; // Write or read
    int e  = random() %3; // 0: lattice expression, otherwise explicit site loop
    int dev= random() %2; // On device?

    ComplexD zc = counter++;

    if ( w ) {
      // Write: keep the host shadow in step with the lattice field.
      B[v] = B[v] + zc;
      if ( e == 0 ) {
        A[v] = A[v] + zc - A[v] + A[v];
      } else {
        if ( dev ) {
          autoView(A_v,A[v],AcceleratorWrite);
          accelerator_for(ss,FGrid->oSites(),1,{
              A_v[ss] = A_v[ss] + zc;
          });
        } else {
          autoView(A_v,A[v],CpuWrite);
          thread_for(ss,FGrid->oSites(),{
              A_v[ss] = A_v[ss] + zc;
          });
        }
      }
    } else {
      if ( e == 0 ) {
        // Read through an expression that leaves the stored value unchanged.
        A[v] = A[v] + A[v] - A[v];
      } else {
        // Snapshot the expected value into a plain scalar before entering the
        // site loop.  The loop body must not index the host std::vector B:
        // an accelerator_for body may execute on the device, where the
        // vector's host storage is not guaranteed to be reachable.  The write
        // path above hands the loop a scalar (zc) in the same way.
        ComplexD expected = B[v];
        if ( dev ) {
          autoView(A_v,A[v],AcceleratorRead);
          accelerator_for(ss,FGrid->oSites(),1,{
              assert(expected==A_v[ss]()()().getlane(0));
          });
        } else {
          autoView(A_v,A[v],CpuRead);
          thread_for(ss,FGrid->oSites(),{
              assert(expected==A_v[ss]()()().getlane(0));
          });
        }
      }
    }
  }

}
|
124
tests/core/Test_prec_change.cc
Normal file
124
tests/core/Test_prec_change.cc
Normal file
@ -0,0 +1,124 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/core/Test_prec_change.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Christopher Kelly <ckelly@bnl.gov>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
int Ls = 12;
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
|
||||
GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
|
||||
GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
|
||||
GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
|
||||
|
||||
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
|
||||
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
|
||||
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
|
||||
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5);
|
||||
GridParallelRNG RNG5F(FGridF); RNG5F.SeedFixedIntegers(seeds5);
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
LatticeFermionD field_d(FGridD), tmp_d(FGridD);
|
||||
random(RNG5,field_d);
|
||||
RealD norm2_field_d = norm2(field_d);
|
||||
|
||||
LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
|
||||
random(RNG5F,field_d2);
|
||||
RealD norm2_field_d2 = norm2(field_d2);
|
||||
|
||||
LatticeFermionF field_f(FGridF);
|
||||
|
||||
//Test original implementation
|
||||
{
|
||||
std::cout << GridLogMessage << "Testing original implementation" << std::endl;
|
||||
field_f = Zero();
|
||||
precisionChangeOrig(field_f,field_d);
|
||||
RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
|
||||
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
|
||||
tmp_d = Zero();
|
||||
precisionChangeOrig(tmp_d, field_f);
|
||||
Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
|
||||
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
|
||||
}
|
||||
//Test new implementation with pregenerated workspace
|
||||
{
|
||||
std::cout << GridLogMessage << "Testing new implementation with pregenerated workspace" << std::endl;
|
||||
precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
|
||||
precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
|
||||
|
||||
field_f = Zero();
|
||||
precisionChange(field_f,field_d,wk_dp_to_sp);
|
||||
RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
|
||||
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
|
||||
tmp_d = Zero();
|
||||
precisionChange(tmp_d, field_f,wk_sp_to_dp);
|
||||
Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
|
||||
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
|
||||
}
|
||||
//Test new implementation without pregenerated workspace
|
||||
{
|
||||
std::cout << GridLogMessage << "Testing new implementation without pregenerated workspace" << std::endl;
|
||||
field_f = Zero();
|
||||
precisionChange(field_f,field_d);
|
||||
RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
|
||||
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
|
||||
tmp_d = Zero();
|
||||
precisionChange(tmp_d, field_f);
|
||||
Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
|
||||
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
|
||||
}
|
||||
//Test fast implementation
|
||||
{
|
||||
std::cout << GridLogMessage << "Testing fast (double2) implementation" << std::endl;
|
||||
field_f = Zero();
|
||||
precisionChangeFast(field_f,field_d2);
|
||||
RealD Ndiff = (norm2_field_d2 - norm2(field_f))/norm2_field_d2;
|
||||
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
|
||||
tmp_d2 = Zero();
|
||||
precisionChangeFast(tmp_d2, field_f);
|
||||
Ndiff = norm2( LatticeFermionD2(tmp_d2-field_d2) ) / norm2_field_d2;
|
||||
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
|
||||
}
|
||||
std::cout << "Done" << std::endl;
|
||||
|
||||
Grid_finalize();
|
||||
}
|
305
tests/forces/Test_bdy.cc
Normal file
305
tests/forces/Test_bdy.cc
Normal file
@ -0,0 +1,305 @@
|
||||
/*
|
||||
|
||||
2f Full det MdagM 10^6 force ~ 1.3e7
|
||||
Grid : Message : 1767.283471 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
Grid : Message : 1767.283476 s : S1 : 1.52885e+09
|
||||
Grid : Message : 1767.283480 s : S2 : 1.52886e+09
|
||||
Grid : Message : 1767.283482 s : dS : 8877.34
|
||||
Grid : Message : 1767.283483 s : dSpred : 8877.7
|
||||
Grid : Message : 1767.283484 s : diff : -0.360484
|
||||
Grid : Message : 1767.283485 s : *********************************************************
|
||||
|
||||
2f Full det MpcdagMpc 10^6 force ~ 1.8e6
|
||||
Grid : Message : 2399.576962 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
Grid : Message : 2399.576968 s : S1 : 1.52885e+09
|
||||
Grid : Message : 2399.576972 s : S2 : 1.52886e+09
|
||||
Grid : Message : 2399.576974 s : dS : 9728.49
|
||||
Grid : Message : 2399.576975 s : dSpred : 9726.58
|
||||
Grid : Message : 2399.576976 s : diff : 1.90683
|
||||
Grid : Message : 2399.576977 s : *********************************************************
|
||||
|
||||
2f bdy MdagM 1500 force Force ~ 2800
|
||||
Grid : Message : 4622.385061 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
Grid : Message : 4622.385067 s : S1 : 1.52885e+09
|
||||
Grid : Message : 4622.385071 s : S2 : 1.52885e+09
|
||||
Grid : Message : 4622.385072 s : dS : 25.4944
|
||||
Grid : Message : 4622.385073 s : dSpred : 25.4672
|
||||
Grid : Message : 4622.385074 s : diff : 0.0271414
|
||||
Grid : Message : 4622.385075 s : *********************************************************
|
||||
|
||||
2f bdy MpcdagMpc 10^6 force Force ~ 2200
|
||||
Grid : Message : 4622.385061 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
Grid : Message : 4622.385067 s : S1 : 1.52885e+09
|
||||
Grid : Message : 4622.385071 s : S2 : 1.52885e+09
|
||||
Grid : Message : 4622.385072 s : dS : 25.4944
|
||||
Grid : Message : 4622.385073 s : dSpred : 25.4672
|
||||
Grid : Message : 4622.385074 s : diff : 0.0271414
|
||||
Grid : Message : 4622.385075 s : *********************************************************
|
||||
|
||||
1f Bdy Det
|
||||
Optimisation log: looser rational AND MD tolerances sloppy
|
||||
MobiusForce.221179 -- same as HMC. dS is mispredicted Force ~2.8
|
||||
Grid : Message : 6582.258991 s : dS : 0.024478
|
||||
Grid : Message : 6582.258992 s : dSpred : 0.00791876
|
||||
Grid : Message : 6582.258994 s : diff : 0.0165592
|
||||
|
||||
MobiusForce.221193 -- tight rational AND MD tolerances to 1e-8 ~ 2.8 same
|
||||
Grid : Message : 1964.939209 s : S1 : 7.64404e+08
|
||||
Grid : Message : 1964.939213 s : S2 : 7.64404e+08
|
||||
Grid : Message : 1964.939215 s : dS : -0.00775838 <--- too loose even on action
|
||||
Grid : Message : 1964.939216 s : dSpred : -0.00416793
|
||||
Grid : Message : 1964.939217 s : diff : -0.00359045
|
||||
|
||||
MobiusForce.221394 -- looser rational, MD tol 1e-8 ~ 2.8 same
|
||||
Grid : Message : 1198.346720 s : S1 : 764404649.48886
|
||||
Grid : Message : 1198.346760 s : S2 : 764404649.5133
|
||||
Grid : Message : 1198.346780 s : dS : 0.024440884590149
|
||||
Grid : Message : 1198.346800 s : dSpred : 0.0079145154465184
|
||||
Grid : Message : 1198.346810 s : diff : 0.016526369143631
|
||||
|
||||
MobiusForce.221394 -- tight rational, MD tol sloppy Force ~ 2.8
|
||||
Grid : Message : 2376.921950 s : S1 : 764404436.44069
|
||||
Grid : Message : 2376.921954 s : S2 : 764404436.43299
|
||||
Grid : Message : 2376.921956 s : dS : -0.0076971054077148
|
||||
Grid : Message : 2376.921958 s : dSpred : -0.0041610472282526
|
||||
Grid : Message : 2376.921959 s : diff : -0.0035360581794623
|
||||
|
||||
*/
|
||||
|
||||
//
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/forces/Test_bdy.cc
|
||||
|
||||
Copyright (C) 2022
|
||||
|
||||
Author: Peter Boyle <pboyle@bnl.gov>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
typedef MobiusFermionD FermionAction;
|
||||
typedef WilsonImplD FimplD;
|
||||
typedef WilsonImplD FermionImplPolicy;
|
||||
|
||||
template<class Gimpl>
|
||||
void ForceTest(Action<LatticeGaugeField> &action,LatticeGaugeField & U,MomentumFilterBase<LatticeGaugeField> &Filter)
|
||||
{
|
||||
GridBase *UGrid = U.Grid();
|
||||
|
||||
std::vector<int> seeds({1,2,3,5});
|
||||
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds);
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds);
|
||||
|
||||
LatticeColourMatrix Pmu(UGrid);
|
||||
LatticeGaugeField P(UGrid);
|
||||
LatticeGaugeField UdSdU(UGrid);
|
||||
|
||||
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
|
||||
std::cout << GridLogMessage << " Force test for "<<action.action_name()<<std::endl;
|
||||
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
|
||||
|
||||
RealD eps=0.005;
|
||||
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
|
||||
std::cout << GridLogMessage << " Refresh "<<action.action_name()<<std::endl;
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
|
||||
|
||||
Gimpl::generate_momenta(P,sRNG,RNG4);
|
||||
Filter.applyFilter(P);
|
||||
|
||||
#if 0
|
||||
FieldMetaData header;
|
||||
std::string file("./ckpoint_lat.2000");
|
||||
NerscIO::readConfiguration(U,header,file);
|
||||
#else
|
||||
U = 1.0;
|
||||
#endif
|
||||
action.refresh(U,sRNG,RNG4);
|
||||
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
|
||||
std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
|
||||
|
||||
RealD S1 = action.S(U);
|
||||
|
||||
Gimpl::update_field(P,U,eps);
|
||||
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
|
||||
std::cout << GridLogMessage << " Derivative "<<action.action_name()<<std::endl;
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
|
||||
action.deriv(U,UdSdU);
|
||||
UdSdU = Ta(UdSdU);
|
||||
Filter.applyFilter(UdSdU);
|
||||
|
||||
DumpSliceNorm("Force",UdSdU,Nd-1);
|
||||
|
||||
Gimpl::update_field(P,U,eps);
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
|
||||
std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
|
||||
|
||||
RealD S2 = action.S(U);
|
||||
|
||||
// Use the derivative
|
||||
LatticeComplex dS(UGrid); dS = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
auto UdSdUmu = PeekIndex<LorentzIndex>(UdSdU,mu);
|
||||
Pmu= PeekIndex<LorentzIndex>(P,mu);
|
||||
dS = dS - trace(Pmu*UdSdUmu)*eps*2.0*2.0;
|
||||
}
|
||||
ComplexD dSpred = sum(dS);
|
||||
RealD diff = S2-S1-dSpred.real();
|
||||
|
||||
std::cout<< GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
|
||||
std::cout<< GridLogMessage << "S1 : "<< S1 <<std::endl;
|
||||
std::cout<< GridLogMessage << "S2 : "<< S2 <<std::endl;
|
||||
std::cout<< GridLogMessage << "dS : "<< S2-S1 <<std::endl;
|
||||
std::cout<< GridLogMessage << "dSpred : "<< dSpred.real() <<std::endl;
|
||||
std::cout<< GridLogMessage << "diff : "<< diff<<std::endl;
|
||||
std::cout<< GridLogMessage << "*********************************************************"<<std::endl;
|
||||
// assert(diff<1.0);
|
||||
std::cout<< GridLogMessage << "Done" <<std::endl;
|
||||
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
|
||||
}
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
std::cout << std::setprecision(14);
|
||||
Coordinate latt_size = GridDefaultLatt();
|
||||
Coordinate mpi_layout = GridDefaultMpi();
|
||||
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||
Coordinate shm;
|
||||
GlobalSharedMemory::GetShmDims(mpi_layout,shm);
|
||||
|
||||
const int Ls=12;
|
||||
const int Nt = latt_size[3];
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Domain decomposed operator
|
||||
////////////////////////////////////////////////////////////////
|
||||
Coordinate CommDim(Nd);
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= (mpi_layout[d]/shm[d])>1 ? 1 : 0;
|
||||
|
||||
Coordinate NonDirichlet(Nd+1,0);
|
||||
Coordinate Dirichlet(Nd+1,0);
|
||||
Dirichlet[1] = CommDim[0]*latt_size[0]/mpi_layout[0] * shm[0];
|
||||
Dirichlet[2] = CommDim[1]*latt_size[1]/mpi_layout[1] * shm[1];
|
||||
Dirichlet[3] = CommDim[2]*latt_size[2]/mpi_layout[2] * shm[2];
|
||||
Dirichlet[4] = CommDim[3]*latt_size[3]/mpi_layout[3] * shm[3];
|
||||
|
||||
Coordinate Block4(Nd);
|
||||
Block4[0] = Dirichlet[1];
|
||||
Block4[1] = Dirichlet[2];
|
||||
Block4[2] = Dirichlet[3];
|
||||
Block4[3] = Dirichlet[4];
|
||||
|
||||
std::vector<Complex> boundary = {1,1,1,-1};
|
||||
FermionAction::ImplParams Params(boundary);
|
||||
FermionAction::ImplParams ParamsDir(boundary);
|
||||
Params.dirichlet=NonDirichlet;
|
||||
ParamsDir.dirichlet=Dirichlet;
|
||||
ParamsDir.partialDirichlet=1;
|
||||
|
||||
///////////////////// Gauge Field and Gauge Forces ////////////////////////////
|
||||
LatticeGaugeField U(UGrid);
|
||||
|
||||
RealD beta=6.0;
|
||||
WilsonGaugeActionR PlaqAction(beta);
|
||||
IwasakiGaugeActionR RectAction(beta);
|
||||
|
||||
MomentumFilterNone<LatticeGaugeField> FilterNone;
|
||||
ForceTest<GimplTypesR>(PlaqAction,U,FilterNone);
|
||||
ForceTest<GimplTypesR>(RectAction,U,FilterNone);
|
||||
|
||||
////////////////////////////////////
|
||||
// Action
|
||||
////////////////////////////////////
|
||||
RealD mass=0.00078;
|
||||
RealD pvmass=1.0;
|
||||
RealD M5=1.8;
|
||||
RealD b=1.5;
|
||||
RealD c=0.5;
|
||||
|
||||
// Double versions
|
||||
FermionAction DdwfPeriodic(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,Params);
|
||||
FermionAction PVPeriodic (U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pvmass,M5,b,c,Params);
|
||||
FermionAction DdwfDirichlet(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,ParamsDir);
|
||||
|
||||
double StoppingCondition = 1.0e-8;
|
||||
double MaxCGIterations = 50000;
|
||||
ConjugateGradient<LatticeFermion> CG(StoppingCondition,MaxCGIterations);
|
||||
|
||||
//////////////////// Two Flavour Determinant Ratio ///////////////////////////////
|
||||
TwoFlavourRatioPseudoFermionAction<FimplD> Nf2(PVPeriodic, DdwfPeriodic,CG,CG);
|
||||
// ForceTest<GimplTypesR>(Nf2,U,FilterNone);
|
||||
|
||||
//////////////////// Two Flavour Determinant force test Even Odd ///////////////////////////////
|
||||
TwoFlavourEvenOddRatioPseudoFermionAction<FimplD> Nf2eo(PVPeriodic, DdwfPeriodic,CG,CG);
|
||||
// ForceTest<GimplTypesR>(Nf2eo,U,FilterNone);
|
||||
|
||||
//////////////////// Domain forces ////////////////////
|
||||
int Width=4;
|
||||
DDHMCFilter<WilsonImplD::Field> DDHMCFilter(Block4,Width);
|
||||
|
||||
//////////////////// Two flavour boundary det ////////////////////
|
||||
TwoFlavourRatioPseudoFermionAction<FimplD> BdyNf2(DdwfDirichlet, DdwfPeriodic,CG,CG);
|
||||
// ForceTest<GimplTypesR>(BdyNf2,U,DDHMCFilter);
|
||||
|
||||
//////////////////// Two flavour eo boundary det ////////////////////
|
||||
TwoFlavourEvenOddRatioPseudoFermionAction<FimplD> BdyNf2eo(DdwfDirichlet, DdwfPeriodic,CG,CG);
|
||||
// ForceTest<GimplTypesR>(BdyNf2eo,U,DDHMCFilter);
|
||||
|
||||
//////////////////// One flavour boundary det ////////////////////
|
||||
OneFlavourRationalParams OFRp; // Up/down
|
||||
OFRp.lo = 4.0e-5;
|
||||
OFRp.hi = 90.0;
|
||||
OFRp.MaxIter = 60000;
|
||||
OFRp.tolerance= 1.0e-8;
|
||||
OFRp.mdtolerance= 1.0e-6;
|
||||
OFRp.degree = 18;
|
||||
OFRp.precision= 80;
|
||||
OFRp.BoundsCheckFreq=0;
|
||||
std::vector<RealD> ActionTolByPole({
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8
|
||||
});
|
||||
std::vector<RealD> MDTolByPole({
|
||||
1.0e-6,3.0e-7,1.0e-7,1.0e-7, // Orig sloppy
|
||||
// 1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8
|
||||
});
|
||||
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> BdySqrt(DdwfDirichlet,DdwfPeriodic,OFRp);
|
||||
ForceTest<GimplTypesR>(BdySqrt,U,DDHMCFilter);
|
||||
|
||||
Grid_finalize();
|
||||
}
|
@ -476,6 +476,22 @@ int main (int argc, char ** argv)
|
||||
// ForceTest<GimplTypesR>(BdyNf2eo,U,DDHMCFilter);
|
||||
|
||||
//////////////////// One flavour boundary det ////////////////////
|
||||
/*
|
||||
RationalActionParams OFRp; // Up/down
|
||||
int SP_iters = 3000;
|
||||
OFRp.lo = 6.0e-5;
|
||||
OFRp.hi = 90.0;
|
||||
OFRp.inv_pow = 2;
|
||||
OFRp.MaxIter = SP_iters; // get most shifts by 2000, stop sharing space
|
||||
OFRp.action_tolerance= 1.0e-8;
|
||||
OFRp.action_degree = 18;
|
||||
OFRp.md_tolerance= 1.0e-5;
|
||||
OFRp.md_degree = 14;
|
||||
// OFRp.degree = 20; converges
|
||||
// OFRp.degree = 16;
|
||||
OFRp.precision= 80;
|
||||
OFRp.BoundsCheckFreq=0;
|
||||
*/
|
||||
OneFlavourRationalParams OFRp; // Up/down
|
||||
OFRp.lo = 4.0e-5;
|
||||
OFRp.hi = 90.0;
|
||||
@ -485,6 +501,22 @@ int main (int argc, char ** argv)
|
||||
OFRp.degree = 18;
|
||||
OFRp.precision= 80;
|
||||
OFRp.BoundsCheckFreq=0;
|
||||
std::vector<RealD> ActionTolByPole({
|
||||
1.0e-7,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8
|
||||
});
|
||||
std::vector<RealD> MDTolByPole({
|
||||
1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
|
||||
// 1.0e-6,3.0e-7,1.0e-7,1.0e-7,
|
||||
// 3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8
|
||||
});
|
||||
/*
|
||||
std::vector<RealD> ActionTolByPole({
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
@ -499,9 +531,9 @@ int main (int argc, char ** argv)
|
||||
// 1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
|
||||
1.0e-8,1.0e-8
|
||||
});
|
||||
*/
|
||||
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> BdySqrt(DdwfDirichlet,DdwfPeriodic,OFRp);
|
||||
BdySqrt.SetTolerances(ActionTolByPole,MDTolByPole);
|
||||
ForceTest<GimplTypesR>(BdySqrt,U,DDHMCFilter);
|
||||
|
@ -85,7 +85,7 @@ int main(int argc, char **argv) {
|
||||
TheHMC.Resources.AddObservable<PlaqObs>();
|
||||
//////////////////////////////////////////////
|
||||
|
||||
const int Ls = 4;
|
||||
const int Ls = 8;
|
||||
Real beta = 2.13;
|
||||
Real light_mass = 0.01;
|
||||
Real strange_mass = 0.04;
|
||||
|
73
tests/lanczos/Test_dwf_block_lanczos.README
Normal file
73
tests/lanczos/Test_dwf_block_lanczos.README
Normal file
@ -0,0 +1,73 @@
|
||||
#Example script
|
||||
DIR=/gpfs/alpine/phy157/proj-shared/phy157dwf/chulwoo/Grid/BL/build/tests/lanczos
|
||||
BIN=${DIR}/Test_dwf_block_lanczos
|
||||
|
||||
VOL='--grid 16.16.16.32 '
|
||||
GRID='--mpi 1.1.1.4 '
|
||||
CONF='--gconf ckpoint_lat.IEEE64BIG.2000 '
|
||||
OPT='--mass 0.01 --M5 1.8 --phase in.params --omega in.params --shm 4096'
|
||||
#BL='--rbl 16.1024.128.1000.10 --split 1.1.4.4 --check_int 100 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51'
|
||||
BL='--rbl 4.128.16.100.10 --split 1.1.1.4 --check_int 25 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51'
|
||||
|
||||
ARGS=${CONF}" "${OPT}" "${BL}" "${VOL}" "${GRID}
|
||||
export APP="${BIN} ${ARGS}"
|
||||
echo APP=${APP}
|
||||
#export JS="jsrun --nrs 32 -a4 -g4 -c42 -dpacked -b packed:7 --smpiargs="-gpu" "
|
||||
export JS="jsrun --nrs 1 -a4 -g4 -c42 -dpacked -b packed:10 --smpiargs="-gpu" "
|
||||
$JS $APP
|
||||
|
||||
#sample in.params
|
||||
|
||||
boundary_phase 0 1 0
|
||||
boundary_phase 1 1 0
|
||||
boundary_phase 2 1 0
|
||||
boundary_phase 3 -1 0
|
||||
|
||||
omega 0 0.5 0
|
||||
omega 1 0.5 0
|
||||
omega 2 0.5 0
|
||||
omega 3 0.5 0
|
||||
omega 4 0.5 0
|
||||
omega 5 0.5 0
|
||||
omega 6 0.5 0
|
||||
omega 7 0.5 0
|
||||
omega 8 0.5 0
|
||||
omega 9 0.5 0
|
||||
omega 10 0.5 0
|
||||
omega 11 0.5 0
|
||||
|
||||
|
||||
#output
|
||||
|
||||
Grid : Message : 1.717474 s : Gauge Configuration ckpoint_lat.IEEE64BIG.2000
|
||||
Grid : Message : 1.717478 s : boundary_phase[0] = (1,0)
|
||||
Grid : Message : 1.717497 s : boundary_phase[1] = (1,0)
|
||||
Grid : Message : 1.717500 s : boundary_phase[2] = (1,0)
|
||||
Grid : Message : 1.717503 s : boundary_phase[3] = (-1,0)
|
||||
Grid : Message : 1.717506 s : Ls 12
|
||||
Grid : Message : 1.717507 s : mass 0.01
|
||||
Grid : Message : 1.717510 s : M5 1.8
|
||||
Grid : Message : 1.717512 s : mob_b 1.5
|
||||
Grid : Message : 1.717514 s : omega[0] = (0.5,0)
|
||||
Grid : Message : 1.717517 s : omega[1] = (0.5,0)
|
||||
Grid : Message : 1.717520 s : omega[2] = (0.5,0)
|
||||
Grid : Message : 1.717523 s : omega[3] = (0.5,0)
|
||||
Grid : Message : 1.717526 s : omega[4] = (0.5,0)
|
||||
Grid : Message : 1.717529 s : omega[5] = (0.5,0)
|
||||
Grid : Message : 1.717532 s : omega[6] = (0.5,0)
|
||||
Grid : Message : 1.717535 s : omega[7] = (0.5,0)
|
||||
Grid : Message : 1.717538 s : omega[8] = (0.5,0)
|
||||
Grid : Message : 1.717541 s : omega[9] = (0.5,0)
|
||||
Grid : Message : 1.717544 s : omega[10] = (0.5,0)
|
||||
Grid : Message : 1.717547 s : omega[11] = (0.5,0)
|
||||
Grid : Message : 1.717550 s : Nu 4
|
||||
Grid : Message : 1.717551 s : Nk 128
|
||||
Grid : Message : 1.717552 s : Np 16
|
||||
Grid : Message : 1.717553 s : Nm 288
|
||||
Grid : Message : 1.717554 s : Nstop 100
|
||||
Grid : Message : 1.717555 s : Ntest 25
|
||||
Grid : Message : 1.717557 s : MaxIter 10
|
||||
Grid : Message : 1.717558 s : resid 1e-05
|
||||
Grid : Message : 1.717560 s : Cheby Poly 0.007,7,51
|
||||
|
||||
|
410
tests/lanczos/Test_dwf_block_lanczos.cc
Normal file
410
tests/lanczos/Test_dwf_block_lanczos.cc
Normal file
@ -0,0 +1,410 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/lanczos/Test_dwf_block_lanczos.cc
|
||||
|
||||
Copyright (C) 2022
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Yong-Chull Jang <ypj@quark.phy.bnl.gov>
|
||||
Author: Chulwoo Jung <chulwoo@bnl.gov>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/util/Init.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
//using namespace Grid::QCD;
|
||||
|
||||
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
|
||||
typedef typename ZMobiusFermionF::FermionField FermionField;
|
||||
|
||||
// Constant function: ignores its argument and always yields zero.
RealD AllZero(RealD /*x*/)
{
  return 0.0;
}
|
||||
|
||||
class CmdJobParams
|
||||
{
|
||||
public:
|
||||
std::string gaugefile;
|
||||
|
||||
int Ls;
|
||||
double mass;
|
||||
double M5;
|
||||
double mob_b;
|
||||
std::vector<ComplexD> omega;
|
||||
std::vector<Complex> boundary_phase;
|
||||
std::vector<int> mpi_split;
|
||||
|
||||
LanczosType Impl;
|
||||
int Nu;
|
||||
int Nk;
|
||||
int Np;
|
||||
int Nm;
|
||||
int Nstop;
|
||||
int Ntest;
|
||||
int MaxIter;
|
||||
double resid;
|
||||
|
||||
double low;
|
||||
double high;
|
||||
int order;
|
||||
|
||||
CmdJobParams()
|
||||
: gaugefile("Hot"),
|
||||
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
|
||||
Impl(LanczosType::irbl),mpi_split(4,1),
|
||||
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
|
||||
low(0.2), high(5.5), order(11)
|
||||
{Nm=Nk+Np;};
|
||||
|
||||
void Parse(char **argv, int argc);
|
||||
};
|
||||
|
||||
|
||||
void CmdJobParams::Parse(char **argv,int argc)
|
||||
{
|
||||
std::string arg;
|
||||
std::vector<int> vi;
|
||||
double re,im;
|
||||
int expect, idx;
|
||||
std::string vstr;
|
||||
std::ifstream pfile;
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
|
||||
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
|
||||
pfile.open(arg);
|
||||
assert(pfile);
|
||||
expect = 0;
|
||||
while( pfile >> vstr ) {
|
||||
if ( vstr.compare("boundary_phase") == 0 ) {
|
||||
pfile >> vstr;
|
||||
GridCmdOptionInt(vstr,idx);
|
||||
assert(expect==idx);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,re);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,im);
|
||||
boundary_phase.push_back({re,im});
|
||||
expect++;
|
||||
}
|
||||
}
|
||||
pfile.close();
|
||||
} else {
|
||||
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
|
||||
pfile.open(arg);
|
||||
assert(pfile);
|
||||
Ls = 0;
|
||||
while( pfile >> vstr ) {
|
||||
if ( vstr.compare("omega") == 0 ) {
|
||||
pfile >> vstr;
|
||||
GridCmdOptionInt(vstr,idx);
|
||||
assert(Ls==idx);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,re);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,im);
|
||||
omega.push_back({re,im});
|
||||
Ls++;
|
||||
}
|
||||
}
|
||||
pfile.close();
|
||||
} else {
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
|
||||
GridCmdOptionInt(arg,Ls);
|
||||
}
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
|
||||
GridCmdOptionFloat(arg,mass);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
|
||||
GridCmdOptionFloat(arg,M5);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
|
||||
GridCmdOptionFloat(arg,mob_b);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
|
||||
GridCmdOptionIntVector(arg,vi);
|
||||
Nu = vi[0];
|
||||
Nk = vi[1];
|
||||
Np = vi[2];
|
||||
Nstop = vi[3];
|
||||
MaxIter = vi[4];
|
||||
// ypj[fixme] mode overriding message is needed.
|
||||
Impl = LanczosType::irbl;
|
||||
Nm = Nk+Np;
|
||||
}
|
||||
|
||||
// block Lanczos with explicit extension of its dimensions
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
|
||||
GridCmdOptionIntVector(arg,vi);
|
||||
Nu = vi[0];
|
||||
Nk = vi[1];
|
||||
Np = vi[2]; // vector space is enlarged by adding Np vectors
|
||||
Nstop = vi[3];
|
||||
MaxIter = vi[4];
|
||||
// ypj[fixme] mode overriding message is needed.
|
||||
Impl = LanczosType::rbl;
|
||||
Nm = Nk+Np*MaxIter;
|
||||
}
|
||||
|
||||
#if 1
|
||||
// block Lanczos with explicit extension of its dimensions
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
|
||||
GridCmdOptionIntVector(arg,vi);
|
||||
for(int i=0;i<mpi_split.size();i++)
|
||||
mpi_split[i] = vi[i];
|
||||
}
|
||||
#endif
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
|
||||
GridCmdOptionInt(arg,Ntest);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
|
||||
GridCmdOptionFloat(arg,resid);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
|
||||
GridCmdOptionFloat(arg,low);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
|
||||
GridCmdOptionFloat(arg,high);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
|
||||
GridCmdOptionInt(arg,order);
|
||||
}
|
||||
|
||||
if ( CartesianCommunicator::RankWorld() == 0 ) {
|
||||
std::streamsize ss = std::cout.precision();
|
||||
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
|
||||
std::cout.precision(15);
|
||||
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
|
||||
std::cout.precision(ss);
|
||||
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
|
||||
std::cout << GridLogMessage <<" mass "<< mass << '\n';
|
||||
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
|
||||
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
|
||||
std::cout.precision(15);
|
||||
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
|
||||
std::cout.precision(ss);
|
||||
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
|
||||
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
|
||||
std::cout << GridLogMessage <<" Np "<< Np << '\n';
|
||||
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
|
||||
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
|
||||
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
|
||||
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
|
||||
std::cout << GridLogMessage <<" resid "<< resid << '\n';
|
||||
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
CmdJobParams JP;
|
||||
JP.Parse(argv,argc);
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
|
||||
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
|
||||
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
|
||||
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGridF);
|
||||
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGridF);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5);
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
|
||||
GridParallelRNG RNG5rb(FrbGridF); RNG5rb.SeedFixedIntegers(seeds5);
|
||||
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
std::vector<LatticeColourMatrix> U(4,UGrid);
|
||||
LatticeGaugeFieldF UmuF(UGridF);
|
||||
std::vector<LatticeColourMatrix> UF(4,UGridF);
|
||||
|
||||
if ( JP.gaugefile.compare("Hot") == 0 ) {
|
||||
SU3::HotConfiguration(RNG4, Umu);
|
||||
} else {
|
||||
FieldMetaData header;
|
||||
NerscIO::readConfiguration(Umu,header,JP.gaugefile);
|
||||
// ypj [fixme] additional checks for the loaded configuration?
|
||||
}
|
||||
precisionChange (UmuF,Umu);
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
}
|
||||
|
||||
RealD mass = JP.mass;
|
||||
RealD M5 = JP.M5;
|
||||
|
||||
// ypj [fixme] flexible support for a various Fermions
|
||||
// RealD mob_b = JP.mob_b; // Gparity
|
||||
// std::vector<ComplexD> omega; // ZMobius
|
||||
|
||||
// GparityMobiusFermionD ::ImplParams params;
|
||||
// std::vector<int> twists({1,1,1,0});
|
||||
// params.twists = twists;
|
||||
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
|
||||
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
|
||||
|
||||
|
||||
// int mrhs = JP.Nu;
|
||||
int Ndir=4;
|
||||
auto mpi_layout = GridDefaultMpi();
|
||||
std::vector<int> mpi_split (Ndir,1);
|
||||
#if 0
|
||||
int tmp=mrhs, dir=0;
|
||||
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
|
||||
while ( tmp> 1) {
|
||||
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
|
||||
mpi_split[dir] *=2;
|
||||
tmp = tmp/2;
|
||||
}
|
||||
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
|
||||
dir = (dir+1)%Ndir;
|
||||
}
|
||||
#endif
|
||||
int mrhs=1;
|
||||
for(int i =0;i<Ndir;i++){
|
||||
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
|
||||
mrhs *= JP.mpi_split[i];
|
||||
}
|
||||
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
|
||||
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
|
||||
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
|
||||
// assert(JP.Nu==tmp);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Split into 1^4 mpi communicators, keeping it explicitly single
|
||||
/////////////////////////////////////////////
|
||||
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
|
||||
GridDefaultSimd(Nd,vComplexF::Nsimd()),
|
||||
mpi_split,
|
||||
*UGrid);
|
||||
|
||||
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
|
||||
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
|
||||
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
|
||||
|
||||
LatticeGaugeFieldF s_Umu(SGrid);
|
||||
Grid_split (UmuF,s_Umu);
|
||||
|
||||
//WilsonFermionR::ImplParams params;
|
||||
ZMobiusFermionF::ImplParams params;
|
||||
params.overlapCommsCompute = true;
|
||||
params.boundary_phases = JP.boundary_phase;
|
||||
ZMobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,JP.omega,1.,0.,params);
|
||||
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> HermOp(Ddwf);
|
||||
SchurDiagOneOperator<ZMobiusFermionF,FermionField> HermOp(DdwfF);
|
||||
ZMobiusFermionF Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
|
||||
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
|
||||
SchurDiagOneOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
|
||||
|
||||
//std::vector<double> Coeffs { 0.,-1.};
|
||||
// ypj [note] this may not be supported by some compilers
|
||||
std::vector<double> Coeffs({ 0.,-1.});
|
||||
Polynomial<FermionField> PolyX(Coeffs);
|
||||
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
|
||||
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
|
||||
// Cheb.csv(std::cout);
|
||||
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
|
||||
FrbGridF,SFrbGrid,mrhs,
|
||||
Cheb,
|
||||
JP.Nstop, JP.Ntest,
|
||||
JP.Nu, JP.Nk, JP.Nm,
|
||||
JP.resid,
|
||||
JP.MaxIter,
|
||||
IRBLdiagonaliseWithEigen);
|
||||
// IRBLdiagonaliseWithLAPACK);
|
||||
IRBL.split_test=1;
|
||||
|
||||
std::vector<RealD> eval(JP.Nm);
|
||||
|
||||
std::vector<FermionField> src(JP.Nu,FrbGridF);
|
||||
if (0)
|
||||
{
|
||||
// in case RNG is too slow
|
||||
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
|
||||
FermionField src_tmp(FGrid);
|
||||
for ( int i=0; i<JP.Nu; ++i ){
|
||||
// gaussian(RNG5,src_tmp);
|
||||
ComplexD rnd;
|
||||
RealD re;
|
||||
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
|
||||
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
|
||||
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
|
||||
src_tmp=re;
|
||||
pickCheckerboard(Odd,src[i],src_tmp);
|
||||
}
|
||||
RNG5.Report();
|
||||
} else {
|
||||
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
|
||||
for ( int i=0; i<JP.Nu; ++i )
|
||||
gaussian(RNG5rb,src[i]);
|
||||
RNG5rb.Report();
|
||||
|
||||
}
|
||||
|
||||
std::vector<FermionField> evec(JP.Nm,FrbGridF);
|
||||
for(int i=0;i<1;++i){
|
||||
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
|
||||
};
|
||||
|
||||
int Nconv;
|
||||
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
|
||||
|
||||
|
||||
Grid_finalize();
|
||||
}
|
401
tests/lanczos/Test_dwf_block_lanczos.cc.double
Normal file
401
tests/lanczos/Test_dwf_block_lanczos.cc.double
Normal file
@ -0,0 +1,401 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/Test_dwf_block_lanczos.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/util/Init.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
//using namespace Grid::QCD;
|
||||
|
||||
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
|
||||
typedef typename ZMobiusFermionR::FermionField FermionField;
|
||||
|
||||
// Constant function: ignores its argument and always yields zero.
RealD AllZero(RealD /*x*/)
{
  return 0.0;
}
|
||||
|
||||
class CmdJobParams
|
||||
{
|
||||
public:
|
||||
std::string gaugefile;
|
||||
|
||||
int Ls;
|
||||
double mass;
|
||||
double M5;
|
||||
double mob_b;
|
||||
std::vector<ComplexD> omega;
|
||||
std::vector<Complex> boundary_phase;
|
||||
std::vector<int> mpi_split;
|
||||
|
||||
LanczosType Impl;
|
||||
int Nu;
|
||||
int Nk;
|
||||
int Np;
|
||||
int Nm;
|
||||
int Nstop;
|
||||
int Ntest;
|
||||
int MaxIter;
|
||||
double resid;
|
||||
|
||||
double low;
|
||||
double high;
|
||||
int order;
|
||||
|
||||
CmdJobParams()
|
||||
: gaugefile("Hot"),
|
||||
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
|
||||
Impl(LanczosType::irbl),mpi_split(4,1),
|
||||
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
|
||||
low(0.2), high(5.5), order(11)
|
||||
{Nm=Nk+Np;};
|
||||
|
||||
void Parse(char **argv, int argc);
|
||||
};
|
||||
|
||||
|
||||
void CmdJobParams::Parse(char **argv,int argc)
|
||||
{
|
||||
std::string arg;
|
||||
std::vector<int> vi;
|
||||
double re,im;
|
||||
int expect, idx;
|
||||
std::string vstr;
|
||||
std::ifstream pfile;
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
|
||||
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
|
||||
pfile.open(arg);
|
||||
assert(pfile);
|
||||
expect = 0;
|
||||
while( pfile >> vstr ) {
|
||||
if ( vstr.compare("boundary_phase") == 0 ) {
|
||||
pfile >> vstr;
|
||||
GridCmdOptionInt(vstr,idx);
|
||||
assert(expect==idx);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,re);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,im);
|
||||
boundary_phase.push_back({re,im});
|
||||
expect++;
|
||||
}
|
||||
}
|
||||
pfile.close();
|
||||
} else {
|
||||
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
|
||||
pfile.open(arg);
|
||||
assert(pfile);
|
||||
Ls = 0;
|
||||
while( pfile >> vstr ) {
|
||||
if ( vstr.compare("omega") == 0 ) {
|
||||
pfile >> vstr;
|
||||
GridCmdOptionInt(vstr,idx);
|
||||
assert(Ls==idx);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,re);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,im);
|
||||
omega.push_back({re,im});
|
||||
Ls++;
|
||||
}
|
||||
}
|
||||
pfile.close();
|
||||
} else {
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
|
||||
GridCmdOptionInt(arg,Ls);
|
||||
}
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
|
||||
GridCmdOptionFloat(arg,mass);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
|
||||
GridCmdOptionFloat(arg,M5);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
|
||||
GridCmdOptionFloat(arg,mob_b);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
|
||||
GridCmdOptionIntVector(arg,vi);
|
||||
Nu = vi[0];
|
||||
Nk = vi[1];
|
||||
Np = vi[2];
|
||||
Nstop = vi[3];
|
||||
MaxIter = vi[4];
|
||||
// ypj[fixme] mode overriding message is needed.
|
||||
Impl = LanczosType::irbl;
|
||||
Nm = Nk+Np;
|
||||
}
|
||||
|
||||
// block Lanczos with explicit extension of its dimensions
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
|
||||
GridCmdOptionIntVector(arg,vi);
|
||||
Nu = vi[0];
|
||||
Nk = vi[1];
|
||||
Np = vi[2]; // vector space is enlarged by adding Np vectors
|
||||
Nstop = vi[3];
|
||||
MaxIter = vi[4];
|
||||
// ypj[fixme] mode overriding message is needed.
|
||||
Impl = LanczosType::rbl;
|
||||
Nm = Nk+Np*MaxIter;
|
||||
}
|
||||
|
||||
#if 1
|
||||
// block Lanczos with explicit extension of its dimensions
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
|
||||
GridCmdOptionIntVector(arg,vi);
|
||||
for(int i=0;i<mpi_split.size();i++)
|
||||
mpi_split[i] = vi[i];
|
||||
}
|
||||
#endif
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
|
||||
GridCmdOptionInt(arg,Ntest);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
|
||||
GridCmdOptionFloat(arg,resid);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
|
||||
GridCmdOptionFloat(arg,low);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
|
||||
GridCmdOptionFloat(arg,high);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
|
||||
GridCmdOptionInt(arg,order);
|
||||
}
|
||||
|
||||
if ( CartesianCommunicator::RankWorld() == 0 ) {
|
||||
std::streamsize ss = std::cout.precision();
|
||||
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
|
||||
std::cout.precision(15);
|
||||
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
|
||||
std::cout.precision(ss);
|
||||
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
|
||||
std::cout << GridLogMessage <<" mass "<< mass << '\n';
|
||||
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
|
||||
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
|
||||
std::cout.precision(15);
|
||||
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
|
||||
std::cout.precision(ss);
|
||||
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
|
||||
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
|
||||
std::cout << GridLogMessage <<" Np "<< Np << '\n';
|
||||
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
|
||||
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
|
||||
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
|
||||
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
|
||||
std::cout << GridLogMessage <<" resid "<< resid << '\n';
|
||||
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
CmdJobParams JP;
|
||||
JP.Parse(argv,argc);
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
|
||||
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
|
||||
GridParallelRNG RNG5rb(FrbGrid); RNG5rb.SeedFixedIntegers(seeds5);
|
||||
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
std::vector<LatticeColourMatrix> U(4,UGrid);
|
||||
|
||||
if ( JP.gaugefile.compare("Hot") == 0 ) {
|
||||
SU3::HotConfiguration(RNG4, Umu);
|
||||
} else {
|
||||
FieldMetaData header;
|
||||
NerscIO::readConfiguration(Umu,header,JP.gaugefile);
|
||||
// ypj [fixme] additional checks for the loaded configuration?
|
||||
}
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
}
|
||||
|
||||
RealD mass = JP.mass;
|
||||
RealD M5 = JP.M5;
|
||||
|
||||
// ypj [fixme] flexible support for a various Fermions
|
||||
// RealD mob_b = JP.mob_b; // Gparity
|
||||
// std::vector<ComplexD> omega; // ZMobius
|
||||
|
||||
// GparityMobiusFermionD ::ImplParams params;
|
||||
// std::vector<int> twists({1,1,1,0});
|
||||
// params.twists = twists;
|
||||
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
|
||||
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
|
||||
|
||||
|
||||
// int mrhs = JP.Nu;
|
||||
int Ndir=4;
|
||||
auto mpi_layout = GridDefaultMpi();
|
||||
std::vector<int> mpi_split (Ndir,1);
|
||||
#if 0
|
||||
int tmp=mrhs, dir=0;
|
||||
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
|
||||
while ( tmp> 1) {
|
||||
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
|
||||
mpi_split[dir] *=2;
|
||||
tmp = tmp/2;
|
||||
}
|
||||
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
|
||||
dir = (dir+1)%Ndir;
|
||||
}
|
||||
#endif
|
||||
int mrhs=1;
|
||||
for(int i =0;i<Ndir;i++){
|
||||
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
|
||||
mrhs *= JP.mpi_split[i];
|
||||
}
|
||||
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
|
||||
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
|
||||
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
|
||||
// assert(JP.Nu==tmp);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Split into 1^4 mpi communicators
|
||||
/////////////////////////////////////////////
|
||||
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
|
||||
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||
mpi_split,
|
||||
*UGrid);
|
||||
|
||||
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
|
||||
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
|
||||
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
|
||||
|
||||
LatticeGaugeField s_Umu(SGrid);
|
||||
Grid_split (Umu,s_Umu);
|
||||
|
||||
//WilsonFermionR::ImplParams params;
|
||||
ZMobiusFermionR::ImplParams params;
|
||||
params.overlapCommsCompute = true;
|
||||
params.boundary_phases = JP.boundary_phase;
|
||||
ZMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,JP.omega,1.,0.,params);
|
||||
// SchurDiagTwoOperator<ZMobiusFermionR,FermionField> HermOp(Ddwf);
|
||||
SchurDiagOneOperator<ZMobiusFermionR,FermionField> HermOp(Ddwf);
|
||||
ZMobiusFermionR Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
|
||||
// SchurDiagTwoOperator<ZMobiusFermionR,FermionField> SHermOp(Dsplit);
|
||||
SchurDiagOneOperator<ZMobiusFermionR,FermionField> SHermOp(Dsplit);
|
||||
|
||||
//std::vector<double> Coeffs { 0.,-1.};
|
||||
// ypj [note] this may not be supported by some compilers
|
||||
std::vector<double> Coeffs({ 0.,-1.});
|
||||
Polynomial<FermionField> PolyX(Coeffs);
|
||||
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
|
||||
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
|
||||
// Cheb.csv(std::cout);
|
||||
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
|
||||
FrbGrid,SFrbGrid,mrhs,
|
||||
Cheb,
|
||||
JP.Nstop, JP.Ntest,
|
||||
JP.Nu, JP.Nk, JP.Nm,
|
||||
JP.resid,
|
||||
JP.MaxIter,
|
||||
IRBLdiagonaliseWithEigen);
|
||||
// IRBLdiagonaliseWithLAPACK);
|
||||
IRBL.split_test=0;
|
||||
|
||||
std::vector<RealD> eval(JP.Nm);
|
||||
|
||||
std::vector<FermionField> src(JP.Nu,FrbGrid);
|
||||
if (0)
|
||||
{
|
||||
// in case RNG is too slow
|
||||
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
|
||||
FermionField src_tmp(FGrid);
|
||||
for ( int i=0; i<JP.Nu; ++i ){
|
||||
// gaussian(RNG5,src_tmp);
|
||||
ComplexD rnd;
|
||||
RealD re;
|
||||
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
|
||||
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
|
||||
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
|
||||
src_tmp=re;
|
||||
pickCheckerboard(Odd,src[i],src_tmp);
|
||||
}
|
||||
RNG5.Report();
|
||||
} else {
|
||||
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
|
||||
for ( int i=0; i<JP.Nu; ++i )
|
||||
gaussian(RNG5rb,src[i]);
|
||||
RNG5rb.Report();
|
||||
|
||||
}
|
||||
|
||||
std::vector<FermionField> evec(JP.Nm,FrbGrid);
|
||||
for(int i=0;i<1;++i){
|
||||
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
|
||||
};
|
||||
|
||||
int Nconv;
|
||||
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
|
||||
|
||||
|
||||
Grid_finalize();
|
||||
}
|
408
tests/lanczos/Test_dwf_block_lanczos.cc.single
Normal file
408
tests/lanczos/Test_dwf_block_lanczos.cc.single
Normal file
@ -0,0 +1,408 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./tests/Test_dwf_block_lanczos.cc
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/util/Init.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
//using namespace Grid::QCD;
|
||||
|
||||
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
|
||||
typedef typename ZMobiusFermionF::FermionField FermionField;
|
||||
|
||||
// Constant function: ignores its argument and always yields zero.
RealD AllZero(RealD /*x*/)
{
  return 0.0;
}
|
||||
|
||||
class CmdJobParams
|
||||
{
|
||||
public:
|
||||
std::string gaugefile;
|
||||
|
||||
int Ls;
|
||||
double mass;
|
||||
double M5;
|
||||
double mob_b;
|
||||
std::vector<ComplexD> omega;
|
||||
std::vector<Complex> boundary_phase;
|
||||
std::vector<int> mpi_split;
|
||||
|
||||
LanczosType Impl;
|
||||
int Nu;
|
||||
int Nk;
|
||||
int Np;
|
||||
int Nm;
|
||||
int Nstop;
|
||||
int Ntest;
|
||||
int MaxIter;
|
||||
double resid;
|
||||
|
||||
double low;
|
||||
double high;
|
||||
int order;
|
||||
|
||||
CmdJobParams()
|
||||
: gaugefile("Hot"),
|
||||
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
|
||||
Impl(LanczosType::irbl),mpi_split(4,1),
|
||||
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
|
||||
low(0.2), high(5.5), order(11)
|
||||
{Nm=Nk+Np;};
|
||||
|
||||
void Parse(char **argv, int argc);
|
||||
};
|
||||
|
||||
|
||||
void CmdJobParams::Parse(char **argv,int argc)
|
||||
{
|
||||
std::string arg;
|
||||
std::vector<int> vi;
|
||||
double re,im;
|
||||
int expect, idx;
|
||||
std::string vstr;
|
||||
std::ifstream pfile;
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
|
||||
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
|
||||
pfile.open(arg);
|
||||
assert(pfile);
|
||||
expect = 0;
|
||||
while( pfile >> vstr ) {
|
||||
if ( vstr.compare("boundary_phase") == 0 ) {
|
||||
pfile >> vstr;
|
||||
GridCmdOptionInt(vstr,idx);
|
||||
assert(expect==idx);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,re);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,im);
|
||||
boundary_phase.push_back({re,im});
|
||||
expect++;
|
||||
}
|
||||
}
|
||||
pfile.close();
|
||||
} else {
|
||||
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
|
||||
pfile.open(arg);
|
||||
assert(pfile);
|
||||
Ls = 0;
|
||||
while( pfile >> vstr ) {
|
||||
if ( vstr.compare("omega") == 0 ) {
|
||||
pfile >> vstr;
|
||||
GridCmdOptionInt(vstr,idx);
|
||||
assert(Ls==idx);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,re);
|
||||
pfile >> vstr;
|
||||
GridCmdOptionFloat(vstr,im);
|
||||
omega.push_back({re,im});
|
||||
Ls++;
|
||||
}
|
||||
}
|
||||
pfile.close();
|
||||
} else {
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
|
||||
GridCmdOptionInt(arg,Ls);
|
||||
}
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
|
||||
GridCmdOptionFloat(arg,mass);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
|
||||
GridCmdOptionFloat(arg,M5);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
|
||||
GridCmdOptionFloat(arg,mob_b);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
|
||||
GridCmdOptionIntVector(arg,vi);
|
||||
Nu = vi[0];
|
||||
Nk = vi[1];
|
||||
Np = vi[2];
|
||||
Nstop = vi[3];
|
||||
MaxIter = vi[4];
|
||||
// ypj[fixme] mode overriding message is needed.
|
||||
Impl = LanczosType::irbl;
|
||||
Nm = Nk+Np;
|
||||
}
|
||||
|
||||
// block Lanczos with explicit extension of its dimensions
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
|
||||
GridCmdOptionIntVector(arg,vi);
|
||||
Nu = vi[0];
|
||||
Nk = vi[1];
|
||||
Np = vi[2]; // vector space is enlarged by adding Np vectors
|
||||
Nstop = vi[3];
|
||||
MaxIter = vi[4];
|
||||
// ypj[fixme] mode overriding message is needed.
|
||||
Impl = LanczosType::rbl;
|
||||
Nm = Nk+Np*MaxIter;
|
||||
}
|
||||
|
||||
#if 1
|
||||
// block Lanczos with explicit extension of its dimensions
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
|
||||
GridCmdOptionIntVector(arg,vi);
|
||||
for(int i=0;i<mpi_split.size();i++)
|
||||
mpi_split[i] = vi[i];
|
||||
}
|
||||
#endif
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
|
||||
GridCmdOptionInt(arg,Ntest);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
|
||||
GridCmdOptionFloat(arg,resid);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
|
||||
GridCmdOptionFloat(arg,low);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
|
||||
GridCmdOptionFloat(arg,high);
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
|
||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
|
||||
GridCmdOptionInt(arg,order);
|
||||
}
|
||||
|
||||
if ( CartesianCommunicator::RankWorld() == 0 ) {
|
||||
std::streamsize ss = std::cout.precision();
|
||||
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
|
||||
std::cout.precision(15);
|
||||
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
|
||||
std::cout.precision(ss);
|
||||
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
|
||||
std::cout << GridLogMessage <<" mass "<< mass << '\n';
|
||||
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
|
||||
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
|
||||
std::cout.precision(15);
|
||||
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
|
||||
std::cout.precision(ss);
|
||||
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
|
||||
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
|
||||
std::cout << GridLogMessage <<" Np "<< Np << '\n';
|
||||
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
|
||||
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
|
||||
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
|
||||
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
|
||||
std::cout << GridLogMessage <<" resid "<< resid << '\n';
|
||||
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
CmdJobParams JP;
|
||||
JP.Parse(argv,argc);
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
|
||||
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
|
||||
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
|
||||
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGridF);
|
||||
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGridF);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5);
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
|
||||
GridParallelRNG RNG5rb(FrbGridF); RNG5rb.SeedFixedIntegers(seeds5);
|
||||
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
std::vector<LatticeColourMatrix> U(4,UGrid);
|
||||
LatticeGaugeFieldF UmuF(UGridF);
|
||||
std::vector<LatticeColourMatrix> UF(4,UGridF);
|
||||
|
||||
if ( JP.gaugefile.compare("Hot") == 0 ) {
|
||||
SU3::HotConfiguration(RNG4, Umu);
|
||||
} else {
|
||||
FieldMetaData header;
|
||||
NerscIO::readConfiguration(Umu,header,JP.gaugefile);
|
||||
// ypj [fixme] additional checks for the loaded configuration?
|
||||
}
|
||||
precisionChange (UmuF,Umu);
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
}
|
||||
|
||||
RealD mass = JP.mass;
|
||||
RealD M5 = JP.M5;
|
||||
|
||||
// ypj [fixme] flexible support for a various Fermions
|
||||
// RealD mob_b = JP.mob_b; // Gparity
|
||||
// std::vector<ComplexD> omega; // ZMobius
|
||||
|
||||
// GparityMobiusFermionD ::ImplParams params;
|
||||
// std::vector<int> twists({1,1,1,0});
|
||||
// params.twists = twists;
|
||||
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
|
||||
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
|
||||
|
||||
|
||||
// int mrhs = JP.Nu;
|
||||
int Ndir=4;
|
||||
auto mpi_layout = GridDefaultMpi();
|
||||
std::vector<int> mpi_split (Ndir,1);
|
||||
#if 0
|
||||
int tmp=mrhs, dir=0;
|
||||
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
|
||||
while ( tmp> 1) {
|
||||
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
|
||||
mpi_split[dir] *=2;
|
||||
tmp = tmp/2;
|
||||
}
|
||||
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
|
||||
dir = (dir+1)%Ndir;
|
||||
}
|
||||
#endif
|
||||
int mrhs=1;
|
||||
for(int i =0;i<Ndir;i++){
|
||||
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
|
||||
mrhs *= JP.mpi_split[i];
|
||||
}
|
||||
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
|
||||
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
|
||||
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
|
||||
// assert(JP.Nu==tmp);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Split into 1^4 mpi communicators, keeping it explicitly single
|
||||
/////////////////////////////////////////////
|
||||
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
|
||||
GridDefaultSimd(Nd,vComplexF::Nsimd()),
|
||||
mpi_split,
|
||||
*UGrid);
|
||||
|
||||
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
|
||||
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
|
||||
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
|
||||
|
||||
LatticeGaugeFieldF s_Umu(SGrid);
|
||||
Grid_split (UmuF,s_Umu);
|
||||
|
||||
//WilsonFermionR::ImplParams params;
|
||||
ZMobiusFermionF::ImplParams params;
|
||||
params.overlapCommsCompute = true;
|
||||
params.boundary_phases = JP.boundary_phase;
|
||||
ZMobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,JP.omega,1.,0.,params);
|
||||
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> HermOp(Ddwf);
|
||||
SchurDiagOneOperator<ZMobiusFermionF,FermionField> HermOp(DdwfF);
|
||||
ZMobiusFermionF Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
|
||||
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
|
||||
SchurDiagOneOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
|
||||
|
||||
//std::vector<double> Coeffs { 0.,-1.};
|
||||
// ypj [note] this may not be supported by some compilers
|
||||
std::vector<double> Coeffs({ 0.,-1.});
|
||||
Polynomial<FermionField> PolyX(Coeffs);
|
||||
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
|
||||
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
|
||||
// Cheb.csv(std::cout);
|
||||
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
|
||||
FrbGridF,SFrbGrid,mrhs,
|
||||
Cheb,
|
||||
JP.Nstop, JP.Ntest,
|
||||
JP.Nu, JP.Nk, JP.Nm,
|
||||
JP.resid,
|
||||
JP.MaxIter,
|
||||
IRBLdiagonaliseWithEigen);
|
||||
// IRBLdiagonaliseWithLAPACK);
|
||||
IRBL.split_test=1;
|
||||
|
||||
std::vector<RealD> eval(JP.Nm);
|
||||
|
||||
std::vector<FermionField> src(JP.Nu,FrbGridF);
|
||||
if (0)
|
||||
{
|
||||
// in case RNG is too slow
|
||||
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
|
||||
FermionField src_tmp(FGrid);
|
||||
for ( int i=0; i<JP.Nu; ++i ){
|
||||
// gaussian(RNG5,src_tmp);
|
||||
ComplexD rnd;
|
||||
RealD re;
|
||||
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
|
||||
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
|
||||
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
|
||||
src_tmp=re;
|
||||
pickCheckerboard(Odd,src[i],src_tmp);
|
||||
}
|
||||
RNG5.Report();
|
||||
} else {
|
||||
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
|
||||
for ( int i=0; i<JP.Nu; ++i )
|
||||
gaussian(RNG5rb,src[i]);
|
||||
RNG5rb.Report();
|
||||
|
||||
}
|
||||
|
||||
std::vector<FermionField> evec(JP.Nm,FrbGridF);
|
||||
for(int i=0;i<1;++i){
|
||||
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
|
||||
};
|
||||
|
||||
int Nconv;
|
||||
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
|
||||
|
||||
|
||||
Grid_finalize();
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user