Mirror of https://github.com/paboyle/Grid.git (synced 2025-06-21 17:22:03 +01:00)

Compare commits: feature/bl ... 018e6da872 (142 commits)
Commits (SHA1):
018e6da872 b77bccfac2 80359e0d49 3d437c5cc4 b8a7004365 bd56c95a6f 994512048e 1dfaa08afb
f44dce390f bb71e9a96a 78bae9417c dd170ead01 014704856f 6f6844ccf1 4c6613d72c ee92e08edb
c1dcee9328 6b150961fe 36cc9c524f 5bafcaedfa bfeceae708 eacb66591f fadaa85626 02a5b0d786
0e2141442a 769eb0eecb 4241c7d4a3 7b11075102 abc658dca5 85e35c4da1 d72e914cf0 3b5254e2d5
f1c358b596 c0ef210265 e3e1cc1962 723eadbb5c e24637ec1e 8b01ff4ce7 588197c487 1352bad2e4
ffd7301649 d2a8494044 0982e0d19b 3badbfc3c1 5465961e30 477b794bc5 4835fd1a87 6533c25814
1b2914ec09 519f795066 4240ad5ca8 d418347d86 29a4bfe5e5 9955bf9daf 876c8f4478 9c8750f261
91efd08179 9953511b65 025fa9991a e8c60c355b 6c9c7f9d85 f534523ede 1b8a834beb 3aa43e6065
78ac4044ff 119c3db47f 21bbdb8fc2 739bd7572c 074627a5bd 6a23b2c599 bd891fb3f5 3984265851
45361d188f 80c9d77e02 3aff64dddb b4f2ca81ff d1dea5f840 54f8b84d16 da503fef0e 4a6802098a
f9b41a84d2 5d7e0d18b9 9e64387933 983b681d46 4072408b6f bd76b47fbf 18ce23aa75 ffa7fe0cc2
6b979f0a69 86dac5ff4f 4a382fad3f cc753670d9 cc9d88ea1c b281b0166e 6a21f694ff fc4db5e963
6252ffaf76 af64c1c6b6 866f48391a a4df527d74 5764d21161 496d04cd85 10e6d7c6ce c42e25e5b8
a00ae981e0 58e020b62a a7e1aceeca 7212432f43 4a261fab30 6af97069b9 5068413cdb 71c6960eea
ddf6d5c9e3 39214702f6 3e4614c63a 900e01f49b 2376156fbc 3f2fd49db4 0efa107cb6 8feedb4f6f
05e562e3d7 dd3bbb8fa2 2fbcf13c46 4ea48ef0c4 5c85774ee3 d8a9a745d8 dcf172da3b 546be724e7
481bbaf1fc 281488611a bae0f8ea99 bbbcd36ae5 a3e935c902 7731c7db8e ff97340324 920a51438d
be528b6d27 ccd21f96ff 4b90cb8888 7d62f1d6d2 458c943987 88015b0858
.github/ISSUE_TEMPLATE/bug-report.yml (vendored, new file, 54 lines)
@@ -0,0 +1,54 @@
name: Bug report
description: Report a bug.
title: "<insert title>"
labels: [bug]

body:
  - type: markdown
    attributes:
      value: >
        Thank you for taking the time to file a bug report.
        Please check that the code is pointing to the HEAD of develop
        or any commit in master which is tagged with a version number.

  - type: textarea
    attributes:
      label: "Describe the issue:"
      description: >
        Describe the issue and any previous attempt to solve it.
    validations:
      required: true

  - type: textarea
    attributes:
      label: "Code example:"
      description: >
        If relevant, show how to reproduce the issue using a minimal working
        example.
      placeholder: |
        << your code here >>
      render: shell
    validations:
      required: false

  - type: textarea
    attributes:
      label: "Target platform:"
      description: >
        Give a description of the target platform (CPU, network, compiler).
        Please give the full CPU part description, using for example
        `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
        or `sysctl machdep.cpu.brand_string` (macOS), and the full output of
        the `--version` option of your compiler.
    validations:
      required: true

  - type: textarea
    attributes:
      label: "Configure options:"
      description: >
        Please give the exact configure command used and attach
        `config.log`, `grid.config.summary` and the output of `make V=1`.
      render: shell
    validations:
      required: true
@@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@@ -542,6 +542,7 @@ public:
      (*this)(in[i], out[i]);
    }
  }
  virtual ~LinearFunction(){};
};

template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
@@ -191,7 +191,7 @@ public:
      std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
      std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

      std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
      std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;

      if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h (new file, 213 lines)
@@ -0,0 +1,213 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h

    Copyright (C) 2015

    Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H

NAMESPACE_BEGIN(Grid);

//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
  typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
  typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
  using LinearFunction<FieldD>::operator();
  RealD   Tolerance;
  RealD   InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
  Integer MaxInnerIterations;
  Integer MaxOuterIterations;
  Integer MaxPatchupIterations;
  GridBase* SinglePrecGrid; //Grid for single-precision fields
  RealD OuterLoopNormMult;  //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
  LinearOperatorBase<FieldF> &Linop_f;
  LinearOperatorBase<FieldD> &Linop_d;

  //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
  LinearFunction<FieldF> *guesser;
  bool updateResidual;

  MixedPrecisionConjugateGradientBatched(RealD tol,
           Integer maxinnerit,
           Integer maxouterit,
           Integer maxpatchit,
           GridBase* _sp_grid,
           LinearOperatorBase<FieldF> &_Linop_f,
           LinearOperatorBase<FieldD> &_Linop_d,
           bool _updateResidual=true) :
    Linop_f(_Linop_f), Linop_d(_Linop_d),
    Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
    OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };

  void useGuesser(LinearFunction<FieldF> &g){
    guesser = &g;
  }

  void operator() (const FieldD &src_d_in, FieldD &sol_d){
    std::vector<FieldD> srcs_d_in{src_d_in};
    std::vector<FieldD> sols_d{sol_d};

    (*this)(srcs_d_in,sols_d);

    sol_d = sols_d[0];
  }

  void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
    assert(src_d_in.size() == sol_d.size());
    int NBatch = src_d_in.size();

    std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;

    Integer TotalOuterIterations = 0; //Number of restarts
    std::vector<Integer> TotalInnerIterations(NBatch,0);     //Number of inner CG iterations
    std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step

    GridStopWatch TotalTimer;
    TotalTimer.Start();

    GridStopWatch InnerCGtimer;
    GridStopWatch PrecChangeTimer;

    int cb = src_d_in[0].Checkerboard();

    std::vector<RealD> src_norm;
    std::vector<RealD> norm;
    std::vector<RealD> stop;

    GridBase* DoublePrecGrid = src_d_in[0].Grid();
    FieldD tmp_d(DoublePrecGrid);
    tmp_d.Checkerboard() = cb;

    FieldD tmp2_d(DoublePrecGrid);
    tmp2_d.Checkerboard() = cb;

    std::vector<FieldD> src_d;
    std::vector<FieldF> src_f;
    std::vector<FieldF> sol_f;

    for (int i=0; i<NBatch; i++) {
      sol_d[i].Checkerboard() = cb;

      src_norm.push_back(norm2(src_d_in[i]));
      norm.push_back(0.);
      stop.push_back(src_norm[i] * Tolerance*Tolerance);

      src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation

      src_f.push_back(SinglePrecGrid);
      src_f[i].Checkerboard() = cb;

      sol_f.push_back(SinglePrecGrid);
      sol_f[i].Checkerboard() = cb;
    }

    RealD inner_tol = InnerTolerance;

    ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
    CG_f.ErrorOnNoConverge = false;

    Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count

    for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
      std::cout << GridLogMessage << std::endl;
      std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;

      bool allConverged = true;

      for (int i=0; i<NBatch; i++) {
        //Compute double precision rsd and also new RHS vector.
        Linop_d.HermOp(sol_d[i], tmp_d);
        norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector

        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;

        PrecChangeTimer.Start();
        precisionChange(src_f[i], src_d[i]);
        PrecChangeTimer.Stop();

        sol_f[i] = Zero();

        if(norm[i] > OuterLoopNormMult * stop[i]) {
          allConverged = false;
        }
      }
      if (allConverged) break;

      if (updateResidual) {
        RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
        RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
        while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
        CG_f.Tolerance = inner_tol;
      }

      //Optionally improve inner solver guess (eg using known eigenvectors)
      if(guesser != NULL) {
        (*guesser)(src_f, sol_f);
      }

      for (int i=0; i<NBatch; i++) {
        //Inner CG
        InnerCGtimer.Start();
        CG_f(Linop_f, src_f[i], sol_f[i]);
        InnerCGtimer.Stop();
        TotalInnerIterations[i] += CG_f.IterationsToComplete;

        //Convert sol back to double and add to double prec solution
        PrecChangeTimer.Start();
        precisionChange(tmp_d, sol_f[i]);
        PrecChangeTimer.Stop();

        axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
      }

    }

    //Final trial CG
    std::cout << GridLogMessage << std::endl;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;

    for (int i=0; i<NBatch; i++) {
      ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
      CG_d(Linop_d, src_d_in[i], sol_d[i]);
      TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
    }

    TotalTimer.Stop();

    std::cout << GridLogMessage << std::endl;
    for (int i=0; i<NBatch; i++) {
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
    }
    std::cout << GridLogMessage << std::endl;
    std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;

  }
};

NAMESPACE_END(Grid);

#endif
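For reference, a minimal usage sketch of the new batched solver follows. The field types, operators, grids and nrhs below are placeholders assumed to exist in the calling code; they are not part of this patch.

// Sketch only: intended call pattern of MixedPrecisionConjugateGradientBatched.
// FermionFieldD/FermionFieldF, HermOpD/HermOpF, sp_grid, DoublePrecGrid and nrhs
// are illustrative names supplied by the caller.
MixedPrecisionConjugateGradientBatched<FermionFieldD,FermionFieldF>
  mCG(1.0e-8,    // outer (double precision) tolerance
      10000,     // max inner single-precision iterations per restart
      50,        // max outer restarts
      1000,      // max iterations of the final double-precision patch-up CG
      sp_grid,   // single-precision grid
      HermOpF,   // single-precision linear operator
      HermOpD);  // double-precision linear operator

std::vector<FermionFieldD> sources  (nrhs, FermionFieldD(DoublePrecGrid));
std::vector<FermionFieldD> solutions(nrhs, FermionFieldD(DoublePrecGrid));
for (auto &s : solutions) s = Zero();   // start from a zero guess

mCG(sources, solutions);   // batched interface: one restart schedule shared by all right-hand sides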
@@ -166,16 +166,16 @@ public:
      rsqf[s] = rsq[s];
      std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
      //      ps_d[s] = src_d;
      precisionChangeFast(ps_f[s],src_d);
      precisionChange(ps_f[s],src_d);
    }
    // r and p for primary
    p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
    r_d = p_d;

    //MdagM+m[0]
    precisionChangeFast(p_f,p_d);
    precisionChange(p_f,p_d);
    Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    precisionChangeFast(tmp_d,mmp_f);
    precisionChange(tmp_d,mmp_f);
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;

@@ -204,7 +204,7 @@ public:

    for(int s=0;s<nshift;s++) {
      axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
      precisionChangeFast(psi_f[s],psi_d[s]);
      precisionChange(psi_f[s],psi_d[s]);
    }

    ///////////////////////////////////////

@@ -225,7 +225,7 @@ public:
    AXPYTimer.Stop();

    PrecChangeTimer.Start();
    precisionChangeFast(r_f, r_d);
    precisionChange(r_f, r_d);
    PrecChangeTimer.Stop();

    AXPYTimer.Start();

@@ -243,13 +243,13 @@ public:

    cp=c;
    PrecChangeTimer.Start();
    precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
    precisionChange(p_f, p_d); //get back single prec search direction for linop
    PrecChangeTimer.Stop();
    MatrixTimer.Start();
    Linop_f.HermOp(p_f,mmp_f);
    MatrixTimer.Stop();
    PrecChangeTimer.Start();
    precisionChangeFast(mmp_d, mmp_f); // From Float to Double
    precisionChange(mmp_d, mmp_f); // From Float to Double
    PrecChangeTimer.Stop();

    d=real(innerProduct(p_d,mmp_d));

@@ -311,7 +311,7 @@ public:
    SolverTimer.Stop();

    for(int s=0;s<nshift;s++){
      precisionChangeFast(psi_d[s],psi_f[s]);
      precisionChange(psi_d[s],psi_f[s]);
    }


@@ -211,7 +211,7 @@ public:
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
    //    assert(norm2(tmp_d)< 1.0e-4);
    assert(norm2(tmp_d)< 1.0);

    axpy(mmp_d,mass[0],p_d,mmp_d);
    RealD rn = norm2(p_d);
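As background for the precisionChangeFast to precisionChange substitutions above, a minimal sketch of moving a field between precisions (field and grid names are illustrative, not part of the patch):

// Sketch only: double <-> single precision round trip for a lattice field.
// GridD/GridF and the field typedefs are assumed to be set up by the caller.
LatticeFermionD x_d(GridD);   // double-precision source
LatticeFermionF x_f(GridF);   // single-precision copy on the reduced-precision grid

precisionChange(x_f, x_d);    // double -> single (target first, source second, as used above)
// ... run the single-precision inner solve on x_f ...
precisionChange(x_d, x_f);    // single -> double, to accumulate into the outer solve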
@@ -419,14 +419,15 @@ until convergence
      }
    }

    if ( Nconv < Nstop )
    if ( Nconv < Nstop ) {
      std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;

      std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
    }
    eval=eval2;

    //Keep only converged
    eval.resize(Nconv);// Nstop?
    evec.resize(Nconv,grid);// Nstop?
    eval.resize(Nstop);// was Nconv
    evec.resize(Nstop,grid);// was Nconv
    basisSortInPlace(evec,eval,reverse);

  }
@@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);

/*Allocation types, saying which pointer cache should be used*/
#define Cpu      (0)
#define CpuSmall (1)
#define Acc      (2)
#define AccSmall (3)
#define Shared   (4)
#define SharedSmall (5)
#define CpuHuge  (1)
#define CpuSmall (2)
#define Acc      (3)
#define AccHuge  (4)
#define AccSmall (5)
#define Shared   (6)
#define SharedHuge (7)
#define SharedSmall (8)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;

@@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)

}

uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }

//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pooiniter caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils

@@ -170,6 +176,16 @@ void MemoryManager::Init(void)
    }
  }

  str= getenv("GRID_ALLOC_NCACHE_HUGE");
  if ( str ) {
    Nc = atoi(str);
    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
      Ncache[CpuHuge]=Nc;
      Ncache[AccHuge]=Nc;
      Ncache[SharedHuge]=Nc;
    }
  }

  str= getenv("GRID_ALLOC_NCACHE_SMALL");
  if ( str ) {
    Nc = atoi(str);

@@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) {

  std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host   allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
#endif

#ifdef GRID_UVM

@@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) {
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type + small;
  int cache;
  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
  else                                     cache = type;

  return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
  return ptr;

@@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)

void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
  assert(ncache>0);
#ifdef GRID_OMP
  assert(omp_in_parallel()==0);
#endif

  if (ncache == 0) return ptr;

  void * ret = NULL;
  int v = -1;

@@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
  int cache = type+small;
  int cache;
  if      (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
  else                                     cache = type;

  return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
  return NULL;

@@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)

void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
  assert(ncache>0);
#ifdef GRID_OMP
  assert(omp_in_parallel()==0);
#endif
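A small self-contained illustration of the cache-slot arithmetic introduced above. The base indices and byte limits are those defined in this patch; the helper function and the example sizes are purely illustrative.

#include <cstddef>
#include <cstdio>

// Mirrors the selection in MemoryManager::Insert/Lookup after this patch:
// each base type (Cpu=0, Acc=3, Shared=6) owns three consecutive slots,
// LARGE = type, HUGE = type+1, SMALL = type+2.
static int cacheSlot(int type, size_t bytes,
                     size_t smallLimit = 4096,           // GRID_ALLOC_SMALL_LIMIT
                     size_t hugeLimit  = 2147483648ull)  // GRID_ALLOC_HUGE_LIMIT
{
  if (bytes <  smallLimit) return type + 2;
  if (bytes >= hugeLimit)  return type + 1;
  return type;
}

int main() {
  printf("%d %d %d\n",
         cacheSlot(0, 1024),        // -> 2 (CpuSmall)
         cacheSlot(0, 1 << 20),     // -> 0 (Cpu, LARGE)
         cacheSlot(0, 3ull << 30)); // -> 1 (CpuHuge)
}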
@@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?

#define GRID_ALLOC_SMALL_LIMIT (4096)
#define GRID_ALLOC_HUGE_LIMIT  (2147483648)

#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)

@@ -70,6 +71,21 @@ enum ViewMode {
  CpuWriteDiscard = 0x10 // same for now
};

struct MemoryStatus {
  uint64_t DeviceBytes;
  uint64_t DeviceLRUBytes;
  uint64_t DeviceMaxBytes;
  uint64_t HostToDeviceBytes;
  uint64_t DeviceToHostBytes;
  uint64_t HostToDeviceXfer;
  uint64_t DeviceToHostXfer;
  uint64_t DeviceEvictions;
  uint64_t DeviceDestroy;
  uint64_t DeviceAllocCacheBytes;
  uint64_t HostAllocCacheBytes;
};


class MemoryManager {
private:

@@ -83,7 +99,7 @@ private:
  } AllocationCacheEntry;

  static const int NallocCacheMax=128;
  static const int NallocType=6;
  static const int NallocType=9;
  static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
  static int Victim[NallocType];
  static int Ncache[NallocType];

@@ -121,7 +137,26 @@ private:
  static uint64_t DeviceToHostXfer;
  static uint64_t DeviceEvictions;
  static uint64_t DeviceDestroy;


  static uint64_t DeviceCacheBytes();
  static uint64_t HostCacheBytes();

  static MemoryStatus GetFootprint(void) {
    MemoryStatus stat;
    stat.DeviceBytes          = DeviceBytes;
    stat.DeviceLRUBytes       = DeviceLRUBytes;
    stat.DeviceMaxBytes       = DeviceMaxBytes;
    stat.HostToDeviceBytes    = HostToDeviceBytes;
    stat.DeviceToHostBytes    = DeviceToHostBytes;
    stat.HostToDeviceXfer     = HostToDeviceXfer;
    stat.DeviceToHostXfer     = DeviceToHostXfer;
    stat.DeviceEvictions      = DeviceEvictions;
    stat.DeviceDestroy        = DeviceDestroy;
    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
    stat.HostAllocCacheBytes   = HostCacheBytes();
    return stat;
  };

private:
#ifndef GRID_UVM
  //////////////////////////////////////////////////////////////////////
@@ -519,7 +519,6 @@ void MemoryManager::Audit(std::string s)
  uint64_t LruBytes1=0;
  uint64_t LruBytes2=0;
  uint64_t LruCnt=0;
  uint64_t LockedBytes=0;

  std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
  for(auto it=LRU.begin();it!=LRU.end();it++){
@@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
  acceleratorCopySynchronise();
  StencilBarrier();// Synch shared memory on a single nodes

  int nreq=list.size();

  if (nreq==0) return;

@@ -128,7 +128,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
             int recv_from_rank,int dor,
             int xbytes,int rbytes, int dir)
{
  return 2.0*bytes;
  return xbytes+rbytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
@@ -91,6 +91,59 @@ void *SharedMemory::ShmBufferSelf(void)
  //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
  return ShmCommBufs[ShmRank];
}
static inline int divides(int a,int b)
{
  return ( b == ( (b/a)*a ) );
}
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{
  ////////////////////////////////////////////////////////////////
  // Allow user to configure through environment variable
  ////////////////////////////////////////////////////////////////
  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
  if ( str ) {
    std::vector<int> IntShmDims;
    GridCmdOptionIntVector(std::string(str),IntShmDims);
    assert(IntShmDims.size() == WorldDims.size());
    long ShmSize = 1;
    for (int dim=0;dim<WorldDims.size();dim++) {
      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
      assert(divides(ShmDims[dim],WorldDims[dim]));
    }
    assert(ShmSize == WorldShmSize);
    return;
  }

  ////////////////////////////////////////////////////////////////
  // Powers of 2,3,5 only in prime decomposition for now
  ////////////////////////////////////////////////////////////////
  int ndimension = WorldDims.size();
  ShmDims=Coordinate(ndimension,1);

  std::vector<int> primes({2,3,5});

  int dim = 0;
  int last_dim = ndimension - 1;
  int AutoShmSize = 1;
  while(AutoShmSize != WorldShmSize) {
    int p;
    for(p=0;p<primes.size();p++) {
      int prime=primes[p];
      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
        && divides(prime,WorldShmSize/AutoShmSize) ) {
        AutoShmSize*=prime;
        ShmDims[dim]*=prime;
        last_dim = dim;
        break;
      }
    }
    if (p == primes.size() && last_dim == dim) {
      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
      exit(EXIT_FAILURE);
    }
    dim=(dim+1) %ndimension;
  }
}

NAMESPACE_END(Grid);

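As a usage note for the GetShmDims logic above: the split can be forced with the GRID_SHM_DIMS_<ndim> environment variable, otherwise it is factorised automatically over primes 2, 3 and 5. A standalone sketch of that automatic decomposition follows, using plain std::vector<int> in place of Grid's Coordinate; the function name and example values are illustrative.

#include <cstdio>
#include <vector>

static inline int divides(int a, int b) { return (b == (b / a) * a); }

// Minimal re-statement of the auto-factorisation in GetShmDims:
// split shmSize across the world dimensions using prime factors 2, 3, 5.
std::vector<int> autoShmDims(const std::vector<int> &worldDims, int shmSize)
{
  int nd = worldDims.size();
  std::vector<int> shm(nd, 1);
  std::vector<int> primes{2, 3, 5};
  int dim = 0, last_dim = nd - 1, autoSize = 1;
  while (autoSize != shmSize) {
    size_t p;
    for (p = 0; p < primes.size(); p++) {
      int prime = primes[p];
      if (divides(prime, worldDims[dim] / shm[dim]) && divides(prime, shmSize / autoSize)) {
        autoSize *= prime; shm[dim] *= prime; last_dim = dim; break;
      }
    }
    if (p == primes.size() && last_dim == dim) return {}; // no factorisation found
    dim = (dim + 1) % nd;
  }
  return shm;
}

int main() {
  auto shm = autoShmDims({4, 4, 4, 8}, 8);  // e.g. 8 ranks per node on a 4.4.4.8 MPI grid
  for (int d : shm) printf("%d ", d);       // prints: 2 2 2 1
  printf("\n");
}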
@@ -27,9 +27,10 @@ Author: Christoph Lehner <christoph@lhnr.de>
*************************************************************************************/
/*  END LEGAL */

#define Mheader "SharedMemoryMpi: "

#include <Grid/GridCore.h>
#include <pwd.h>
#include <syscall.h>

#ifdef GRID_CUDA
#include <cuda_runtime_api.h>

@@ -37,12 +38,120 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCl

#ifdef GRID_SYCL
#define GRID_SYCL_LEVEL_ZERO_IPC
#include <syscall.h>
#define SHM_SOCKETS
#endif

#include <sys/socket.h>
#include <sys/un.h>

NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "

#ifdef SHM_SOCKETS
|
||||
|
||||
/*
|
||||
* Barbaric extra intranode communication route in case we need sockets to pass FDs
|
||||
* Forced by level_zero not being nicely designed
|
||||
*/
|
||||
static int sock;
|
||||
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
|
||||
static char sock_path[256];
|
||||
class UnixSockets {
|
||||
public:
|
||||
static void Open(int rank)
|
||||
{
|
||||
int errnum;
|
||||
|
||||
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
|
||||
|
||||
struct sockaddr_un sa_un = { 0 };
|
||||
sa_un.sun_family = AF_UNIX;
|
||||
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
|
||||
unlink(sa_un.sun_path);
|
||||
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
|
||||
perror("bind failure");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
static int RecvFileDescriptor(void)
|
||||
{
|
||||
int n;
|
||||
int fd;
|
||||
char buf[1];
|
||||
struct iovec iov;
|
||||
struct msghdr msg;
|
||||
struct cmsghdr *cmsg;
|
||||
char cms[CMSG_SPACE(sizeof(int))];
|
||||
|
||||
iov.iov_base = buf;
|
||||
iov.iov_len = 1;
|
||||
|
||||
memset(&msg, 0, sizeof msg);
|
||||
msg.msg_name = 0;
|
||||
msg.msg_namelen = 0;
|
||||
msg.msg_iov = &iov;
|
||||
msg.msg_iovlen = 1;
|
||||
|
||||
msg.msg_control = (caddr_t)cms;
|
||||
msg.msg_controllen = sizeof cms;
|
||||
|
||||
if((n=recvmsg(sock, &msg, 0)) < 0) {
|
||||
perror("recvmsg failed");
|
||||
return -1;
|
||||
}
|
||||
if(n == 0){
|
||||
perror("recvmsg returned 0");
|
||||
return -1;
|
||||
}
|
||||
cmsg = CMSG_FIRSTHDR(&msg);
|
||||
|
||||
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
|
||||
|
||||
return fd;
|
||||
}
|
||||
|
||||
static void SendFileDescriptor(int fildes,int xmit_to_rank)
|
||||
{
|
||||
struct msghdr msg;
|
||||
struct iovec iov;
|
||||
struct cmsghdr *cmsg = NULL;
|
||||
char ctrl[CMSG_SPACE(sizeof(int))];
|
||||
char data = ' ';
|
||||
|
||||
memset(&msg, 0, sizeof(struct msghdr));
|
||||
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
|
||||
iov.iov_base = &data;
|
||||
iov.iov_len = sizeof(data);
|
||||
|
||||
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
|
||||
|
||||
struct sockaddr_un sa_un = { 0 };
|
||||
sa_un.sun_family = AF_UNIX;
|
||||
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
|
||||
|
||||
msg.msg_name = (void *)&sa_un;
|
||||
msg.msg_namelen = sizeof(sa_un);
|
||||
msg.msg_iov = &iov;
|
||||
msg.msg_iovlen = 1;
|
||||
msg.msg_controllen = CMSG_SPACE(sizeof(int));
|
||||
msg.msg_control = ctrl;
|
||||
|
||||
cmsg = CMSG_FIRSTHDR(&msg);
|
||||
cmsg->cmsg_level = SOL_SOCKET;
|
||||
cmsg->cmsg_type = SCM_RIGHTS;
|
||||
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
|
||||
|
||||
*((int *) CMSG_DATA(cmsg)) = fildes;
|
||||
|
||||
sendmsg(sock, &msg, 0);
|
||||
};
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
/*Construct from an MPI communicator*/
|
||||
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||
{
|
||||
@ -65,8 +174,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
||||
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
|
||||
|
||||
if ( WorldRank == 0) {
|
||||
std::cout << header " World communicator of size " <<WorldSize << std::endl;
|
||||
std::cout << header " Node communicator of size " <<WorldShmSize << std::endl;
|
||||
std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
|
||||
std::cout << Mheader " Node communicator of size " <<WorldShmSize << std::endl;
|
||||
}
|
||||
// WorldShmComm, WorldShmSize, WorldShmRank
|
||||
|
||||
@ -169,59 +278,7 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
|
||||
if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
|
||||
else OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
|
||||
}
|
||||
static inline int divides(int a,int b)
|
||||
{
|
||||
return ( b == ( (b/a)*a ) );
|
||||
}
|
||||
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
|
||||
{
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Allow user to configure through environment variable
|
||||
////////////////////////////////////////////////////////////////
|
||||
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
|
||||
if ( str ) {
|
||||
std::vector<int> IntShmDims;
|
||||
GridCmdOptionIntVector(std::string(str),IntShmDims);
|
||||
assert(IntShmDims.size() == WorldDims.size());
|
||||
long ShmSize = 1;
|
||||
for (int dim=0;dim<WorldDims.size();dim++) {
|
||||
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
|
||||
assert(divides(ShmDims[dim],WorldDims[dim]));
|
||||
}
|
||||
assert(ShmSize == WorldShmSize);
|
||||
return;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Powers of 2,3,5 only in prime decomposition for now
|
||||
////////////////////////////////////////////////////////////////
|
||||
int ndimension = WorldDims.size();
|
||||
ShmDims=Coordinate(ndimension,1);
|
||||
|
||||
std::vector<int> primes({2,3,5});
|
||||
|
||||
int dim = 0;
|
||||
int last_dim = ndimension - 1;
|
||||
int AutoShmSize = 1;
|
||||
while(AutoShmSize != WorldShmSize) {
|
||||
int p;
|
||||
for(p=0;p<primes.size();p++) {
|
||||
int prime=primes[p];
|
||||
if ( divides(prime,WorldDims[dim]/ShmDims[dim])
|
||||
&& divides(prime,WorldShmSize/AutoShmSize) ) {
|
||||
AutoShmSize*=prime;
|
||||
ShmDims[dim]*=prime;
|
||||
last_dim = dim;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (p == primes.size() && last_dim == dim) {
|
||||
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
dim=(dim+1) %ndimension;
|
||||
}
|
||||
}
|
||||
void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
|
||||
{
|
||||
////////////////////////////////////////////////////////////////
|
||||
@ -395,7 +452,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
|
||||
#ifdef GRID_MPI3_SHMGET
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
|
||||
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
|
||||
@ -480,7 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||
|
||||
SharedMemoryZero(ShmCommBuf,bytes);
|
||||
@ -523,7 +580,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if ( WorldRank == 0 ){
|
||||
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
|
||||
}
|
||||
SharedMemoryZero(ShmCommBuf,bytes);
|
||||
@ -531,8 +588,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Loop over ranks/gpu's on our node
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef SHM_SOCKETS
|
||||
UnixSockets::Open(WorldShmRank);
|
||||
#endif
|
||||
for(int r=0;r<WorldShmSize;r++){
|
||||
|
||||
MPI_Barrier(WorldShmComm);
|
||||
|
||||
#ifndef GRID_MPI3_SHM_NONE
|
||||
//////////////////////////////////////////////////
|
||||
// If it is me, pass around the IPC access key
|
||||
@ -540,24 +602,32 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
void * thisBuf = ShmCommBuf;
|
||||
if(!Stencil_force_mpi) {
|
||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||
typedef struct { int fd; pid_t pid ; } clone_mem_t;
|
||||
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
|
||||
|
||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
|
||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
||||
|
||||
ze_ipc_mem_handle_t ihandle;
|
||||
clone_mem_t handle;
|
||||
|
||||
|
||||
if ( r==WorldShmRank ) {
|
||||
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
|
||||
if ( err != ZE_RESULT_SUCCESS ) {
|
||||
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
} else {
|
||||
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
}
|
||||
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
|
||||
handle.pid = getpid();
|
||||
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
|
||||
#ifdef SHM_SOCKETS
|
||||
for(int rr=0;rr<WorldShmSize;rr++){
|
||||
if(rr!=r){
|
||||
UnixSockets::SendFileDescriptor(handle.fd,rr);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
@ -585,6 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
// Share this IPC handle across the Shm Comm
|
||||
//////////////////////////////////////////////////
|
||||
{
|
||||
MPI_Barrier(WorldShmComm);
|
||||
int ierr=MPI_Bcast(&handle,
|
||||
sizeof(handle),
|
||||
MPI_BYTE,
|
||||
@ -600,6 +671,10 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||
if ( r!=WorldShmRank ) {
|
||||
thisBuf = nullptr;
|
||||
int myfd;
|
||||
#ifdef SHM_SOCKETS
|
||||
myfd=UnixSockets::RecvFileDescriptor();
|
||||
#else
|
||||
std::cout<<"mapping seeking remote pid/fd "
|
||||
<<handle.pid<<"/"
|
||||
<<handle.fd<<std::endl;
|
||||
@ -607,16 +682,22 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
|
||||
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
|
||||
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
|
||||
int myfd = syscall(438,pidfd,handle.fd,0);
|
||||
|
||||
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
|
||||
|
||||
myfd = syscall(438,pidfd,handle.fd,0);
|
||||
int err_t = errno;
|
||||
if (myfd < 0) {
|
||||
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
|
||||
perror("pidfd_getfd failed ");
|
||||
assert(0);
|
||||
}
|
||||
#endif
|
||||
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
|
||||
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
|
||||
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
|
||||
|
||||
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
|
||||
if ( err != ZE_RESULT_SUCCESS ) {
|
||||
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
|
||||
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
|
||||
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
} else {
|
||||
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
|
||||
@ -651,6 +732,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#else
|
||||
WorldShmCommBufs[r] = ShmCommBuf;
|
||||
#endif
|
||||
MPI_Barrier(WorldShmComm);
|
||||
}
|
||||
|
||||
_ShmAllocBytes=bytes;
|
||||
@ -662,7 +744,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_MPI3_SHMMMAP
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
|
||||
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -699,7 +781,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
assert(((uint64_t)ptr&0x3F)==0);
|
||||
close(fd);
|
||||
WorldShmCommBufs[r] =ptr;
|
||||
// std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
|
||||
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
|
||||
}
|
||||
_ShmAlloc=1;
|
||||
_ShmAllocBytes = bytes;
|
||||
@ -709,7 +791,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_MPI3_SHM_NONE
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
|
||||
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -756,7 +838,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
{
|
||||
std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
|
||||
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
|
||||
assert(_ShmSetup==1);
|
||||
assert(_ShmAlloc==0);
|
||||
MPI_Barrier(WorldShmComm);
|
||||
|
@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  }
}

#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)

template <typename T>
T iDivUp(T a, T b)     // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }

template <typename T>
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
{
  int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx >= e1*e2) return;

  int n, b, o;

  n = idx / e2;
  b = idx % e2;
  o = n*stride + b;

  vector[2*idx + 0] = lo + o;
  vector[2*idx + 1] = ro + o;
}

#endif

//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////

@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;

  if(cbmask == 0x3 ){
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o   =n*stride+b;
        Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
#endif
  } else {
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;

  if ( cbmask == 0x3 ) {
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
    ent = e1*e2;
    dim3 blockSize(acceleratorThreads());
    dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
    populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
    accelerator_barrier();
#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
        int o   =n*stride;
        Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
#endif
  } else {
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){

@@ -47,3 +47,4 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/PaddedCell.h>
@@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
}

template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
{
  Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  typename vobj::scalar_object ssum;
  autoView( arg_v, arg, AcceleratorRead);
  ssum= sum_gpu(&arg_v[0],osites);
  return sum_gpu(&arg_v[0],osites);
#else
  autoView(arg_v, arg, CpuRead);
  auto ssum= sum_cpu(&arg_v[0],osites);
  return sum_cpu(&arg_v[0],osites);
#endif
}

template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
  auto ssum = rankSum(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
}

template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_gpu_large(&arg_v[0],osites);
  return sum_gpu_large(&arg_v[0],osites);
#else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
  auto ssum= sum_cpu(&arg_v[0],osites);
  return sum_cpu(&arg_v[0],osites);
#endif
}

template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
  auto ssum = rankSumLarge(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
}
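The refactor above makes the node-local reduction available on its own; a brief contrast of the two entry points (the field name is illustrative):

// Sketch only: rankSum gives this rank's partial sum with no communication,
// while sum is now rankSum followed by a GlobalSum across ranks.
auto local  = rankSum(field);   // node-local partial result
auto global = sum(field);       // globally reduced result, same interface as before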
@@ -211,25 +211,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  assert(ok);

  Integer smemSize = numThreads * sizeof(sobj);
  // UVM seems to be buggy under later CUDA drivers
  // This fails on A100 and driver 5.30.02 / CUDA 12.1
  // Fails with multiple NVCC versions back to 11.4,
  // which worked with earlier drivers.
  // Not sure which driver had first fail and this bears checking
  // Is awkward as must install multiple driver versions
  // Move out of UVM
  // Turns out I had messed up the synchronise after move to compute stream
  // as running this on the default stream fools the synchronise
#undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
  commVector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();
  result = *buffer_v;
#endif
@@ -440,17 +440,8 @@ public:
  _grid->GlobalCoorToGlobalIndex(gcoor,gidx);

  _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
#if 1
  assert(rank == _grid->ThisRank() );
#else
  //
  if (rank != _grid->ThisRank() ){
    std::cout <<"rank "<<rank<<" _grid->ThisRank() "<<_grid->ThisRank()<< std::endl;
    //  exit(-42);
    //  assert(0);
  }
#endif

  assert(rank == _grid->ThisRank() );

  int l_idx=generator_idx(o_idx,i_idx);
  _generators[l_idx] = master_engine;
@@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
    blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
  }
}
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
                              const std::vector<Lattice<vobj>> &fineData,
                              const VLattice &Basis)
{
  int NBatch = fineData.size();
  assert(coarseData.size() == NBatch);

  GridBase * fine  = fineData[0].Grid();
  GridBase * coarse= coarseData[0].Grid();

  Lattice<iScalar<CComplex>> ip(coarse);
  std::vector<Lattice<vobj>> fineDataCopy = fineData;

  autoView(ip_, ip, AcceleratorWrite);
  for(int v=0;v<nbasis;v++) {
    for (int k=0; k<NBatch; k++) {
      autoView( coarseData_ , coarseData[k], AcceleratorWrite);
      blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
      accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
        convertType(coarseData_[sc](v),ip_[sc]);
      });

      // improve numerical stability of projection
      // |fine> = |fine> - <basis|fine> |basis>
      ip=-ip;
      blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
    }
  }
}

template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ,

@@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
}
#endif

template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
                              std::vector<Lattice<vobj>> &fineData,
                              const VLattice &Basis)
{
  int NBatch = coarseData.size();
  assert(fineData.size() == NBatch);

  GridBase * fine   = fineData[0].Grid();
  GridBase * coarse = coarseData[0].Grid();
  for (int k=0; k<NBatch; k++)
    fineData[k]=Zero();
  for (int i=0;i<nbasis;i++) {
    for (int k=0; k<NBatch; k++) {
      Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
      blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
    }
  }
}

// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj>
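A brief hedged sketch of how the new batched transfer routines might be called from multigrid-style code. The vector, grid and basis names below are illustrative and not part of the patch.

// Sketch only: project a batch of fine fields to the coarse grid and back.
// 'subspace' is a std::vector of nbasis fine-grid basis vectors, as used by blockProject;
// CoarseVector stands for Lattice<iVector<CComplex,nbasis>> and FineField for the fine-grid field type.
std::vector<CoarseVector> coarse(NBatch, CoarseVector(CoarseGrid));
std::vector<FineField>    fine  (NBatch, FineField(FineGrid));
// ... fill 'fine' with the right-hand sides ...

batchBlockProject(coarse, fine, subspace);  // all NBatch projections share one pass over the basis
batchBlockPromote(coarse, fine, subspace);  // overwrite 'fine' with the fields reconstructed from 'coarse'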
@ -648,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
||||
for(int d=0;d<nd;d++){
|
||||
assert(Fg->_processors[d] == Tg->_processors[d]);
|
||||
}
|
||||
|
||||
// the above should guarantee that the operations are local
|
||||
|
||||
#if 1
|
||||
|
||||
size_t nsite = 1;
|
||||
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
|
||||
|
||||
size_t tbytes = 4*nsite*sizeof(int);
|
||||
int *table = (int*)malloc(tbytes);
|
||||
|
||||
thread_for(idx, nsite, {
|
||||
Coordinate from_coor, to_coor;
|
||||
size_t rem = idx;
|
||||
for(int i=0;i<nd;i++){
|
||||
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
|
||||
from_coor[i] = base_i + FromLowerLeft[i];
|
||||
to_coor[i] = base_i + ToLowerLeft[i];
|
||||
}
|
||||
|
||||
int foidx = Fg->oIndex(from_coor);
|
||||
int fiidx = Fg->iIndex(from_coor);
|
||||
int toidx = Tg->oIndex(to_coor);
|
||||
int tiidx = Tg->iIndex(to_coor);
|
||||
int* tt = table + 4*idx;
|
||||
tt[0] = foidx;
|
||||
tt[1] = fiidx;
|
||||
tt[2] = toidx;
|
||||
tt[3] = tiidx;
|
||||
});
|
||||
|
||||
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
||||
acceleratorCopyToDevice(table,table_d,tbytes);
|
||||
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
autoView(from_v,From,AcceleratorRead);
|
||||
autoView(to_v,To,AcceleratorWrite);
|
||||
|
||||
accelerator_for(idx,nsite,1,{
|
||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
int* tt = table_d + 4*idx;
|
||||
int from_oidx = *tt++;
|
||||
int from_lane = *tt++;
|
||||
int to_oidx = *tt++;
|
||||
int to_lane = *tt;
|
||||
|
||||
const vector_type* from = (const vector_type *)&from_v[from_oidx];
|
||||
vector_type* to = (vector_type *)&to_v[to_oidx];
|
||||
|
||||
scalar_type stmp;
|
||||
for(int w=0;w<words;w++){
|
||||
stmp = getlane(from[w], from_lane);
|
||||
putlane(to[w], stmp, to_lane);
|
||||
}
|
||||
});
|
||||
|
||||
acceleratorFreeDevice(table_d);
|
||||
free(table);
|
||||
|
||||
|
||||
#else
|
||||
Coordinate ldf = Fg->_ldimensions;
|
||||
Coordinate rdf = Fg->_rdimensions;
|
||||
Coordinate isf = Fg->_istride;
|
||||
@ -658,9 +767,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
||||
Coordinate ist = Tg->_istride;
|
||||
Coordinate ost = Tg->_ostride;
|
||||
|
||||
autoView( t_v , To, AcceleratorWrite);
|
||||
autoView( f_v , From, AcceleratorRead);
|
||||
accelerator_for(idx,Fg->lSites(),1,{
|
||||
autoView( t_v , To, CpuWrite);
|
||||
autoView( f_v , From, CpuRead);
|
||||
thread_for(idx,Fg->lSites(),{
|
||||
sobj s;
|
||||
Coordinate Fcoor(nd);
|
||||
Coordinate Tcoor(nd);
|
||||
@ -673,17 +782,24 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
||||
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
|
||||
}
|
||||
if (in_region) {
|
||||
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
|
||||
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
|
||||
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
|
||||
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
|
||||
vector_type * fp = (vector_type *)&f_v[odx_f];
|
||||
vector_type * tp = (vector_type *)&t_v[odx_t];
|
||||
#if 0
|
||||
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from
|
||||
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to
|
||||
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from
|
||||
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
|
||||
scalar_type * fp = (scalar_type *)&f_v[odx_f];
|
||||
scalar_type * tp = (scalar_type *)&t_v[odx_t];
|
||||
for(int w=0;w<words;w++){
|
||||
tp[w].putlane(fp[w].getlane(idx_f),idx_t);
|
||||
}
|
||||
#else
|
||||
peekLocalSite(s,f_v,Fcoor);
|
||||
pokeLocalSite(s,t_v,Tcoor);
|
||||
#endif
|
||||
}
|
||||
});
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
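The new path above precomputes a table of four integers per site (outer/inner index on the From grid and on the To grid), ships it to the device, and then moves each site word-by-word with getlane/putlane. A minimal usage sketch follows; it is hedged, not taken from the commits: lattice size, field type and region extent are illustrative, and the copied region must fit inside each node's local volume.

// Hedged usage sketch for localCopyRegion; set-up values are assumptions.
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian UGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());

  GridParallelRNG RNG(&UGrid); RNG.SeedFixedIntegers({1, 2, 3, 4});
  LatticeComplex From(&UGrid), To(&UGrid);
  random(RNG, From);  To = Zero();

  // Copy a 4^4 block anchored at the local origin; the region must be node-local.
  Coordinate FromLowerLeft(Nd, 0), ToLowerLeft(Nd, 0), RegionSize(Nd, 4);
  localCopyRegion(From, To, FromLowerLeft, ToLowerLeft, RegionSize);

  Grid_finalize();
}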
|
||||
@ -776,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
||||
}
|
||||
|
||||
|
||||
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
|
||||
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
|
||||
template<class vobj>
|
||||
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
|
||||
{
|
||||
@ -792,11 +910,70 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
||||
|
||||
for(int d=0;d<nh;d++){
|
||||
if ( d!=orthog ) {
|
||||
assert(lg->_processors[d] == hg->_processors[d]);
|
||||
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
|
||||
}
|
||||
assert(lg->_processors[d] == hg->_processors[d]);
|
||||
assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
|
||||
}
|
||||
}
|
||||
|
||||
#if 1
|
||||
size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
|
||||
size_t tbytes = 4*nsite*sizeof(int);
|
||||
int *table = (int*)malloc(tbytes);
|
||||
|
||||
thread_for(idx,nsite,{
|
||||
Coordinate lcoor(nl);
|
||||
Coordinate hcoor(nh);
|
||||
lcoor[orthog] = slice_lo;
|
||||
hcoor[orthog] = slice_hi;
|
||||
size_t rem = idx;
|
||||
for(int mu=0;mu<nl;mu++){
|
||||
if(mu != orthog){
|
||||
int xmu = rem % lg->LocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu];
|
||||
lcoor[mu] = hcoor[mu] = xmu;
|
||||
}
|
||||
}
|
||||
int loidx = lg->oIndex(lcoor);
|
||||
int liidx = lg->iIndex(lcoor);
|
||||
int hoidx = hg->oIndex(hcoor);
|
||||
int hiidx = hg->iIndex(hcoor);
|
||||
int* tt = table + 4*idx;
|
||||
tt[0] = loidx;
|
||||
tt[1] = liidx;
|
||||
tt[2] = hoidx;
|
||||
tt[3] = hiidx;
|
||||
});
|
||||
|
||||
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
||||
acceleratorCopyToDevice(table,table_d,tbytes);
|
||||
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
autoView(lowDim_v,lowDim,AcceleratorRead);
|
||||
autoView(higherDim_v,higherDim,AcceleratorWrite);
|
||||
|
||||
accelerator_for(idx,nsite,1,{
|
||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
int* tt = table_d + 4*idx;
|
||||
int from_oidx = *tt++;
|
||||
int from_lane = *tt++;
|
||||
int to_oidx = *tt++;
|
||||
int to_lane = *tt;
|
||||
|
||||
const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
|
||||
vector_type* to = (vector_type *)&higherDim_v[to_oidx];
|
||||
|
||||
scalar_type stmp;
|
||||
for(int w=0;w<words;w++){
|
||||
stmp = getlane(from[w], from_lane);
|
||||
putlane(to[w], stmp, to_lane);
|
||||
}
|
||||
});
|
||||
|
||||
acceleratorFreeDevice(table_d);
|
||||
free(table);
|
||||
|
||||
#else
|
||||
// the above should guarantee that the operations are local
|
||||
autoView(lowDimv,lowDim,CpuRead);
|
||||
autoView(higherDimv,higherDim,CpuWrite);
|
||||
@ -812,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
||||
pokeLocalSite(s,higherDimv,hcoor);
|
||||
}
|
||||
});
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
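InsertSliceLocal gets the same treatment in the hunks above: a per-slice index table built on the host, then a lane copy on the device. The slice indices are node-local coordinates. A hedged sketch of a plain call follows, moving one local slice of a field to another slice index on the same grid; the grid and slice values are illustrative and slice_hi must lie inside the local extent of the orthogonal direction.

// Hedged usage sketch for InsertSliceLocal; values are assumptions.
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian UGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridParallelRNG RNG(&UGrid); RNG.SeedFixedIntegers({5, 6, 7, 8});

  LatticeComplex A(&UGrid), B(&UGrid);
  random(RNG, A);  B = Zero();

  int orthog   = Nd - 1; // slice orthogonal to the time direction
  int slice_lo = 0;      // local slice index read from A
  int slice_hi = 3;      // local slice index written in B (must be < local extent)
  InsertSliceLocal(A, B, slice_lo, slice_hi, orthog);

  Grid_finalize();
}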
174
Grid/lattice/PaddedCell.h
Normal file
@@ -0,0 +1,174 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/lattice/PaddedCell.h

    Copyright (C) 2019

    Author: Peter Boyle pboyle@bnl.gov

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/*  END LEGAL */
#pragma once

#include<Grid/cshift/Cshift.h>

NAMESPACE_BEGIN(Grid);

//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
template<typename vobj>
struct CshiftImplBase{
  virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
  virtual ~CshiftImplBase(){}
};
template<typename vobj>
struct CshiftImplDefault: public CshiftImplBase<vobj>{
  Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
};
template<typename Gimpl>
struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
  typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
};

class PaddedCell {
public:
  GridCartesian * unpadded_grid;
  int dims;
  int depth;
  std::vector<GridCartesian *> grids;

  ~PaddedCell()
  {
    DeleteGrids();
  }
  PaddedCell(int _depth,GridCartesian *_grid)
  {
    unpadded_grid = _grid;
    depth=_depth;
    dims=_grid->Nd();
    AllocateGrids();
    Coordinate local     =unpadded_grid->LocalDimensions();
    for(int d=0;d<dims;d++){
      assert(local[d]>=depth);
    }
  }
  void DeleteGrids(void)
  {
    for(int d=0;d<grids.size();d++){
      delete grids[d];
    }
    grids.resize(0);
  };
  void AllocateGrids(void)
  {
    Coordinate local     =unpadded_grid->LocalDimensions();
    Coordinate simd      =unpadded_grid->_simd_layout;
    Coordinate processors=unpadded_grid->_processors;
    Coordinate plocal    =unpadded_grid->LocalDimensions();
    Coordinate global(dims);

    // expand up one dim at a time
    for(int d=0;d<dims;d++){

      plocal[d] += 2*depth;

      for(int d=0;d<dims;d++){
        global[d] = plocal[d]*processors[d];
      }

      grids.push_back(new GridCartesian(global,simd,processors));
    }
  };
  template<class vobj>
  inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
  {
    Lattice<vobj> out(unpadded_grid);

    Coordinate local     =unpadded_grid->LocalDimensions();
    Coordinate fll(dims,depth); // depends on the MPI spread
    Coordinate tll(dims,0); // depends on the MPI spread
    localCopyRegion(in,out,fll,tll,local);
    return out;
  }
  template<class vobj>
  inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    int dims = old_grid->Nd();
    Lattice<vobj> tmp = in;
    for(int d=0;d<dims;d++){
      tmp = Expand(d,tmp,cshift); // rvalue && assignment
    }
    return tmp;
  }
  // expand up one dim at a time
  template<class vobj>
  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
  {
    GridBase *old_grid = in.Grid();
    GridCartesian *new_grid = grids[dim];//These are new grids
    Lattice<vobj>  padded(new_grid);
    Lattice<vobj> shifted(old_grid);
    Coordinate local     =old_grid->LocalDimensions();
    Coordinate plocal    =new_grid->LocalDimensions();
    if(dim==0) conformable(old_grid,unpadded_grid);
    else       conformable(old_grid,grids[dim-1]);

    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;

    double tins=0, tshift=0;

    // Middle bit
    double t = usecond();
    for(int x=0;x<local[dim];x++){
      InsertSliceLocal(in,padded,x,depth+x,dim);
    }
    tins += usecond() - t;

    // High bit
    t = usecond();
    shifted = cshift.Cshift(in,dim,depth);
    tshift += usecond() - t;

    t=usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
    }
    tins += usecond() - t;

    // Low bit
    t = usecond();
    shifted = cshift.Cshift(in,dim,-depth);
    tshift += usecond() - t;

    t = usecond();
    for(int x=0;x<depth;x++){
      InsertSliceLocal(shifted,padded,x,x,dim);
    }
    tins += usecond() - t;

    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;

    return padded;
  }

};

NAMESPACE_END(Grid);
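A hedged sketch of how the new PaddedCell is meant to be driven, based only on the header above: exchange once to build a halo of width depth, work on the padded field with purely local accesses, then extract the interior again. For gauge links with non-trivial boundary conditions the CshiftImplGauge<Gimpl> policy can be passed to Exchange in place of the periodic default; the field construction below is illustrative.

// Hedged PaddedCell workflow sketch; the set-up is illustrative.
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian UGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());
  GridParallelRNG RNG(&UGrid); RNG.SeedFixedIntegers({1, 2, 3, 4});

  LatticeGaugeField U(&UGrid);
  SU<Nc>::HotConfiguration(RNG, U);

  int depth = 1;                      // one-site halo in every direction
  PaddedCell Ghost(depth, &UGrid);

  auto Upad  = Ghost.Exchange(U);     // lives on the padded grid, halos filled by Cshift
  // ... stencil work on Upad using only node-local accesses ...
  auto Uback = Ghost.Extract(Upad);   // interior only, back on the original grid

  Grid_finalize();
}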
@@ -104,6 +104,7 @@ template<typename vtype> using iSpinMatrix                = iScalar<iMatrix<iSca
template<typename vtype> using iColourMatrix              = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix          = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix       = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iLorentzComplex            = iVector<iScalar<iScalar<vtype> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix  = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector                = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector              = iScalar<iScalar<iVector<vtype, Nc> > >;
@@ -178,6 +179,15 @@ typedef iLorentzColourMatrix<vComplexF>  vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD>  vLorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;

// LorentzComplex
typedef iLorentzComplex<Complex  > LorentzComplex;
typedef iLorentzComplex<ComplexF > LorentzComplexF;
typedef iLorentzComplex<ComplexD > LorentzComplexD;

typedef iLorentzComplex<vComplex > vLorentzComplex;
typedef iLorentzComplex<vComplexF> vLorentzComplexF;
typedef iLorentzComplex<vComplexD> vLorentzComplexD;

// DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex  > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
@@ -307,6 +317,10 @@ typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;

typedef Lattice<vLorentzComplex>  LatticeLorentzComplex;
typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;

// DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
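The new LorentzComplex family carries one complex number per direction per site; later in this compare the masked smearing container stores its per-link masks as a std::vector<LatticeLorentzComplex>. A hedged sketch of building such a field and filling a single direction; the mask pattern is illustrative only.

// Hedged sketch: a per-direction complex field of the new LatticeLorentzComplex type.
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  GridCartesian UGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()), GridDefaultMpi());

  LatticeComplex one(&UGrid);         one  = ComplexD(1.0, 0.0);
  LatticeLorentzComplex mask(&UGrid); mask = Zero();

  int mu = 0;
  PokeIndex<LorentzIndex>(mask, one, mu);   // select only mu-direction links

  Grid_finalize();
}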
@@ -34,10 +34,24 @@ directory

NAMESPACE_BEGIN(Grid);

///////////////////////////////////
// Smart configuration base class
///////////////////////////////////
template< class Field >
class ConfigurationBase
{
public:
  ConfigurationBase() {}
  virtual ~ConfigurationBase() {}
  virtual void set_Field(Field& U) =0;
  virtual void smeared_force(Field&) = 0;
  virtual Field& get_SmearedU() =0;
  virtual Field &get_U(bool smeared = false) = 0;
};

template <class GaugeField >
class Action
{

public:
  bool is_smeared = false;
  RealD deriv_norm_sum;
@@ -77,11 +91,39 @@ public:
  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
  void S_timer_start(void)       { S_us-=usecond(); }
  void S_timer_stop(void)        { S_us+=usecond(); }
  /////////////////////////////
  // Heatbath?
  /////////////////////////////
  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                            // evaluate the action
  virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0;       // evaluate the action derivative

  /////////////////////////////////////////////////////////////
  // virtual smeared interface through configuration container
  /////////////////////////////////////////////////////////////
  virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
  {
    refresh(U.get_U(is_smeared),sRNG,pRNG);
  }
  virtual RealD S(ConfigurationBase<GaugeField>& U)
  {
    return S(U.get_U(is_smeared));
  }
  virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
  {
    return Sinitial(U.get_U(is_smeared));
  }
  virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
  {
    deriv(U.get_U(is_smeared),dSdU);
    if ( is_smeared ) {
      U.smeared_force(dSdU);
    }
  }
  ///////////////////////////////
  // Logging
  ///////////////////////////////
  virtual std::string action_name()    = 0; // return the action name
  virtual std::string LogParameters()  = 0; // prints action parameters
  virtual ~Action(){}
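The ConfigurationBase overloads added to Action above let every action see the gauge configuration through one smart container rather than a bare field. A hedged sketch of the smallest conforming container follows, mirroring the NoSmearing class further down this compare; it is exactly what the default refresh/S/Sinitial/deriv overloads rely on, with smeared_force a no-op because nothing is smeared.

// Hedged sketch: minimal ConfigurationBase implementation wrapping a single field.
#include <Grid/Grid.h>

template<class Field>
class SingleFieldConfiguration : public Grid::ConfigurationBase<Field> {
  Field *Thin;                                    // not owned; attached via set_Field
public:
  SingleFieldConfiguration() : Thin(nullptr) {}
  virtual void   set_Field(Field &U)        { Thin = &U; }
  virtual void   smeared_force(Field &)     {}            // no smearing: no chain rule
  virtual Field &get_SmearedU()             { return *Thin; }
  virtual Field &get_U(bool smeared=false)  { return *Thin; }
};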
@ -30,6 +30,8 @@ directory
|
||||
#ifndef QCD_ACTION_CORE
|
||||
#define QCD_ACTION_CORE
|
||||
|
||||
#include <Grid/qcd/action/gauge/GaugeImplementations.h>
|
||||
|
||||
#include <Grid/qcd/action/ActionBase.h>
|
||||
NAMESPACE_CHECK(ActionBase);
|
||||
#include <Grid/qcd/action/ActionSet.h>
|
||||
|
@ -507,6 +507,7 @@ public:
|
||||
}
|
||||
this->face_table_computed=1;
|
||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||
accelerator_barrier();
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -332,8 +332,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
/////////////////////////////
|
||||
{
|
||||
GRID_TRACE("Gather");
|
||||
st.HaloExchangeOptGather(in,compressor);
|
||||
accelerator_barrier();
|
||||
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
||||
}
|
||||
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
|
@ -423,14 +423,14 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
|
||||
|
||||
#define KERNEL_CALL_EXT(A) \
|
||||
const uint64_t NN = Nsite*Ls; \
|
||||
const uint64_t sz = st.surface_list.size(); \
|
||||
auto ptr = &st.surface_list[0]; \
|
||||
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
|
||||
int sF = ptr[ss]; \
|
||||
int sU = ss/Ls; \
|
||||
int sU = sF/Ls; \
|
||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||
});
|
||||
}); \
|
||||
accelerator_barrier();
|
||||
|
||||
#define ASM_CALL(A) \
|
||||
thread_for( sss, Nsite, { \
|
||||
@ -463,11 +463,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
|
||||
if( interior && exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
||||
#ifdef SYCL_HACK
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
|
||||
#else
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
||||
#endif
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
|
||||
#endif
|
||||
@ -478,8 +474,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
||||
// dependent on result of merge
|
||||
acceleratorFenceComputeStream();
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
|
||||
#endif
|
||||
@ -502,21 +500,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
|
||||
#endif
|
||||
acceleratorFenceComputeStream();
|
||||
} else if( interior ) {
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
// Dependent on result of merge
|
||||
acceleratorFenceComputeStream();
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt); return;}
|
||||
#ifndef GRID_CUDA
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
|
||||
#endif
|
||||
acceleratorFenceComputeStream();
|
||||
}
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
|
||||
|
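The KERNEL_CALL_EXT change above makes the exterior kernel walk only the stencil's surface_list, so the loop index is a position in that list rather than a 5d site; the owning 4d gauge site therefore has to be recovered from the listed entry (sU = sF/Ls), not from the loop index. A small self-contained illustration of that indexing with made-up surface sites:

// Toy illustration of surface-list indexing; the site values are invented.
#include <cstdio>
#include <vector>

int main()
{
  const int Ls = 8;
  std::vector<int> surface_list = {0, 7, 8, 15, 56, 63};   // packed 5d surface sites
  for (std::size_t ss = 0; ss < surface_list.size(); ss++) {
    int sF = surface_list[ss];   // 5d site taken from the list
    int sU = sF / Ls;            // owning 4d site: correct
    int sU_wrong = int(ss) / Ls; // list position / Ls: wrong once the list is sparse
    std::printf("ss=%zu sF=%d sU=%d (naive %d)\n", ss, sF, sU, sU_wrong);
  }
  return 0;
}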
@@ -1 +0,0 @@
../CayleyFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
../MobiusEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../WilsonCloverFermionInstantiation.cc.master
@@ -1 +0,0 @@
../WilsonFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../WilsonFermionInstantiation.cc.master
@@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master
@@ -1 +0,0 @@
../WilsonTMFermionInstantiation.cc.master
@@ -1 +0,0 @@
#define IMPLEMENTATION WilsonImplD2
@@ -1 +0,0 @@
../CayleyFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../ContinuedFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../DomainWallEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
../MobiusEOFAFermionInstantiation.cc.master
@@ -1 +0,0 @@
../PartialFractionFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../WilsonFermion5DInstantiation.cc.master
@@ -1 +0,0 @@
../WilsonKernelsInstantiation.cc.master
@@ -1 +0,0 @@
#define IMPLEMENTATION ZWilsonImplD2
@ -176,7 +176,7 @@ public:
|
||||
return PeriodicBC::CshiftLink(Link,mu,shift);
|
||||
}
|
||||
|
||||
static inline void setDirections(std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
|
||||
static inline void setDirections(const std::vector<int> &conjDirs) { _conjDirs=conjDirs; }
|
||||
static inline std::vector<int> getDirections(void) { return _conjDirs; }
|
||||
static inline bool isPeriodicGaugeField(void) { return false; }
|
||||
};
|
||||
|
@ -43,7 +43,7 @@ public:
|
||||
private:
|
||||
RealD c_plaq;
|
||||
RealD c_rect;
|
||||
|
||||
typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
|
||||
public:
|
||||
PlaqPlusRectangleAction(RealD b,RealD c): c_plaq(b),c_rect(c){};
|
||||
|
||||
@ -79,27 +79,18 @@ public:
|
||||
GridBase *grid = Umu.Grid();
|
||||
|
||||
std::vector<GaugeLinkField> U (Nd,grid);
|
||||
std::vector<GaugeLinkField> U2(Nd,grid);
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
WilsonLoops<Gimpl>::RectStapleDouble(U2[mu],U[mu],mu);
|
||||
}
|
||||
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
|
||||
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
|
||||
|
||||
GaugeLinkField dSdU_mu(grid);
|
||||
GaugeLinkField staple(grid);
|
||||
|
||||
for (int mu=0; mu < Nd; mu++){
|
||||
|
||||
// Staple in direction mu
|
||||
|
||||
WilsonLoops<Gimpl>::Staple(staple,Umu,mu);
|
||||
|
||||
dSdU_mu = Ta(U[mu]*staple)*factor_p;
|
||||
|
||||
WilsonLoops<Gimpl>::RectStaple(Umu,staple,U2,U,mu);
|
||||
|
||||
dSdU_mu = dSdU_mu + Ta(U[mu]*staple)*factor_r;
|
||||
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
|
||||
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
|
||||
|
||||
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
|
||||
}
|
||||
|
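The rewritten deriv above computes every plaquette and rectangle staple once through StapleAndRectStapleAll and then assembles the per-direction force. A hedged sketch of that assembly as a free function; the workspace type and call signature are taken from the hunk, while the coefficients and field wiring are placeholders.

// Hedged sketch of the combined plaquette+rectangle force assembly.
#include <Grid/Grid.h>

template<class Gimpl>
void PlaqRectForceSketch(const typename Gimpl::GaugeField &Umu,
                         typename Gimpl::GaugeField &dSdU,
                         Grid::RealD factor_p, Grid::RealD factor_r)
{
  using namespace Grid;
  GridBase *grid = Umu.Grid();

  std::vector<typename Gimpl::GaugeLinkField> U(Nd, grid);
  for (int mu = 0; mu < Nd; mu++) U[mu] = PeekIndex<LorentzIndex>(Umu, mu);

  // One pass builds all plaquette and rectangle staples.
  std::vector<typename Gimpl::GaugeLinkField> Staple(Nd, grid), RectStaple(Nd, grid);
  typename WilsonLoops<Gimpl>::StapleAndRectStapleAllWorkspace workspace;
  WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);

  typename Gimpl::GaugeLinkField dSdU_mu(grid);
  for (int mu = 0; mu < Nd; mu++) {
    dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
    dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
    PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
  }
}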
@ -38,91 +38,73 @@ NAMESPACE_BEGIN(Grid);
|
||||
// cf. GeneralEvenOddRational.h for details
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class ImplD, class ImplF, class ImplD2>
|
||||
template<class ImplD, class ImplF>
|
||||
class GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction : public GeneralEvenOddRatioRationalPseudoFermionAction<ImplD> {
|
||||
private:
|
||||
typedef typename ImplD2::FermionField FermionFieldD2;
|
||||
typedef typename ImplD::FermionField FermionFieldD;
|
||||
typedef typename ImplF::FermionField FermionFieldF;
|
||||
|
||||
FermionOperator<ImplD> & NumOpD;
|
||||
FermionOperator<ImplD> & DenOpD;
|
||||
|
||||
FermionOperator<ImplD2> & NumOpD2;
|
||||
FermionOperator<ImplD2> & DenOpD2;
|
||||
|
||||
FermionOperator<ImplF> & NumOpF;
|
||||
FermionOperator<ImplF> & DenOpF;
|
||||
|
||||
Integer ReliableUpdateFreq;
|
||||
protected:
|
||||
|
||||
//Action evaluation
|
||||
//Allow derived classes to override the multishift CG
|
||||
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
|
||||
#if 0
|
||||
#if 1
|
||||
SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
|
||||
ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
|
||||
msCG(schurOp,in, out);
|
||||
#else
|
||||
SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
|
||||
SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
|
||||
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
|
||||
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
|
||||
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
|
||||
FermionFieldD inD(NumOpD.FermionRedBlackGrid());
|
||||
FermionFieldD outD(NumOpD.FermionRedBlackGrid());
|
||||
|
||||
// Action better with higher precision?
|
||||
ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
|
||||
precisionChange(inD2,in);
|
||||
std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
|
||||
msCG(schurOpD2, inD2, outD2);
|
||||
precisionChange(out,outD2);
|
||||
ConjugateGradientMultiShiftMixedPrec<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
|
||||
msCG(schurOpD, in, out);
|
||||
#endif
|
||||
}
|
||||
//Force evaluation
|
||||
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
|
||||
SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
|
||||
SchurDifferentiableOperator<ImplF> schurOpF (numerator ? NumOpF : DenOpF);
|
||||
SchurDifferentiableOperator<ImplD> schurOpD(numerator ? NumOpD : DenOpD);
|
||||
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
|
||||
|
||||
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
|
||||
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
|
||||
std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
|
||||
ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
|
||||
precisionChange(inD2,in);
|
||||
std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
|
||||
msCG(schurOpD2, inD2, out_elemsD2, outD2);
|
||||
precisionChange(out,outD2);
|
||||
for(int i=0;i<out_elems.size();i++){
|
||||
precisionChange(out_elems[i],out_elemsD2[i]);
|
||||
}
|
||||
FermionFieldD inD(NumOpD.FermionRedBlackGrid());
|
||||
FermionFieldD outD(NumOpD.FermionRedBlackGrid());
|
||||
std::vector<FermionFieldD> out_elemsD(out_elems.size(),NumOpD.FermionRedBlackGrid());
|
||||
ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
|
||||
msCG(schurOpD, in, out_elems, out);
|
||||
}
|
||||
//Allow derived classes to override the gauge import
|
||||
virtual void ImportGauge(const typename ImplD::GaugeField &Ud){
|
||||
|
||||
typename ImplF::GaugeField Uf(NumOpF.GaugeGrid());
|
||||
typename ImplD2::GaugeField Ud2(NumOpD2.GaugeGrid());
|
||||
precisionChange(Uf, Ud);
|
||||
precisionChange(Ud2, Ud);
|
||||
|
||||
std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " << norm2(Ud2)<<std::endl;
|
||||
std::cout << "Importing "<<norm2(Ud)<<" "<< norm2(Uf)<<" " <<std::endl;
|
||||
|
||||
NumOpD.ImportGauge(Ud);
|
||||
DenOpD.ImportGauge(Ud);
|
||||
|
||||
NumOpF.ImportGauge(Uf);
|
||||
DenOpF.ImportGauge(Uf);
|
||||
|
||||
NumOpD2.ImportGauge(Ud2);
|
||||
DenOpD2.ImportGauge(Ud2);
|
||||
}
|
||||
|
||||
public:
|
||||
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction(FermionOperator<ImplD> &_NumOpD, FermionOperator<ImplD> &_DenOpD,
|
||||
FermionOperator<ImplF> &_NumOpF, FermionOperator<ImplF> &_DenOpF,
|
||||
FermionOperator<ImplD2> &_NumOpD2, FermionOperator<ImplD2> &_DenOpD2,
|
||||
const RationalActionParams & p, Integer _ReliableUpdateFreq
|
||||
) : GeneralEvenOddRatioRationalPseudoFermionAction<ImplD>(_NumOpD, _DenOpD, p),
|
||||
ReliableUpdateFreq(_ReliableUpdateFreq),
|
||||
NumOpD(_NumOpD), DenOpD(_DenOpD),
|
||||
NumOpF(_NumOpF), DenOpF(_DenOpF),
|
||||
NumOpD2(_NumOpD2), DenOpD2(_DenOpD2)
|
||||
NumOpF(_NumOpF), DenOpF(_DenOpF)
|
||||
{}
|
||||
|
||||
virtual std::string action_name(){return "GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction";}
|
||||
|
@ -67,9 +67,9 @@ NAMESPACE_BEGIN(Grid);
|
||||
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
|
||||
};
|
||||
|
||||
template<class Impl,class ImplF,class ImplD2>
|
||||
template<class Impl,class ImplF>
|
||||
class OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction
|
||||
: public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2> {
|
||||
: public GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF> {
|
||||
public:
|
||||
typedef OneFlavourRationalParams Params;
|
||||
private:
|
||||
@ -91,11 +91,9 @@ NAMESPACE_BEGIN(Grid);
|
||||
FermionOperator<Impl> &_DenOp,
|
||||
FermionOperator<ImplF> &_NumOpF,
|
||||
FermionOperator<ImplF> &_DenOpF,
|
||||
FermionOperator<ImplD2> &_NumOpD2,
|
||||
FermionOperator<ImplD2> &_DenOpD2,
|
||||
const Params & p, Integer ReliableUpdateFreq
|
||||
) :
|
||||
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF,ImplD2>(_NumOp, _DenOp,_NumOpF, _DenOpF,_NumOpD2, _DenOpD2, transcribe(p),ReliableUpdateFreq){}
|
||||
GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<Impl,ImplF>(_NumOp, _DenOp,_NumOpF, _DenOpF, transcribe(p),ReliableUpdateFreq){}
|
||||
|
||||
virtual std::string action_name(){return "OneFlavourEvenOddRatioRationalPseudoFermionAction";}
|
||||
};
|
||||
|
@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid);
|
||||
// NumOp == V
|
||||
// DenOp == M
|
||||
//
|
||||
AUDIT();
|
||||
FermionField etaOdd (NumOp.FermionRedBlackGrid());
|
||||
FermionField etaEven(NumOp.FermionRedBlackGrid());
|
||||
FermionField tmp (NumOp.FermionRedBlackGrid());
|
||||
|
||||
AUDIT();
|
||||
pickCheckerboard(Even,etaEven,eta);
|
||||
AUDIT();
|
||||
pickCheckerboard(Odd,etaOdd,eta);
|
||||
|
||||
AUDIT();
|
||||
NumOp.ImportGauge(U);
|
||||
AUDIT();
|
||||
DenOp.ImportGauge(U);
|
||||
std::cout << " TwoFlavourRefresh: Imported gauge "<<std::endl;
|
||||
AUDIT();
|
||||
|
||||
SchurDifferentiableOperator<Impl> Mpc(DenOp);
|
||||
AUDIT();
|
||||
SchurDifferentiableOperator<Impl> Vpc(NumOp);
|
||||
AUDIT();
|
||||
|
||||
std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl;
|
||||
AUDIT();
|
||||
// Odd det factors
|
||||
Mpc.MpcDag(etaOdd,PhiOdd);
|
||||
AUDIT();
|
||||
std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl;
|
||||
tmp=Zero();
|
||||
AUDIT();
|
||||
std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl;
|
||||
AUDIT();
|
||||
HeatbathSolver(Vpc,PhiOdd,tmp);
|
||||
AUDIT();
|
||||
std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl;
|
||||
Vpc.Mpc(tmp,PhiOdd);
|
||||
std::cout << " TwoFlavourRefresh: Mpc "<<std::endl;
|
||||
@ -220,20 +207,27 @@ NAMESPACE_BEGIN(Grid);
|
||||
//X = (Mdag M)^-1 V^dag phi
|
||||
//Y = (Mdag)^-1 V^dag phi
|
||||
Vpc.MpcDag(PhiOdd,Y); // Y= Vdag phi
|
||||
std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
|
||||
X=Zero();
|
||||
DerivativeSolver(Mpc,Y,X); // X= (MdagM)^-1 Vdag phi
|
||||
std::cout << GridLogMessage <<" X "<<norm2(X)<<std::endl;
|
||||
Mpc.Mpc(X,Y); // Y= Mdag^-1 Vdag phi
|
||||
std::cout << GridLogMessage <<" Y "<<norm2(Y)<<std::endl;
|
||||
|
||||
// phi^dag V (Mdag M)^-1 dV^dag phi
|
||||
Vpc.MpcDagDeriv(force , X, PhiOdd ); dSdU = force;
|
||||
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
|
||||
|
||||
// phi^dag dV (Mdag M)^-1 V^dag phi
|
||||
Vpc.MpcDeriv(force , PhiOdd, X ); dSdU = dSdU+force;
|
||||
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
|
||||
|
||||
// - phi^dag V (Mdag M)^-1 Mdag dM (Mdag M)^-1 V^dag phi
|
||||
// - phi^dag V (Mdag M)^-1 dMdag M (Mdag M)^-1 V^dag phi
|
||||
Mpc.MpcDeriv(force,Y,X); dSdU = dSdU-force;
|
||||
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
|
||||
Mpc.MpcDagDeriv(force,X,Y); dSdU = dSdU-force;
|
||||
std::cout << GridLogMessage <<" deriv "<<norm2(force)<<std::endl;
|
||||
|
||||
// FIXME No force contribution from EvenEven assumed here
|
||||
// Needs a fix for clover.
|
||||
|
@ -283,12 +283,13 @@ public:
|
||||
std::cout << GridLogHMC << "Total time for trajectory (s): " << (t1-t0)/1e6 << std::endl;
|
||||
|
||||
TheIntegrator.print_timer();
|
||||
|
||||
|
||||
TheIntegrator.Smearer.set_Field(Ucur);
|
||||
for (int obs = 0; obs < Observables.size(); obs++) {
|
||||
std::cout << GridLogDebug << "Observables # " << obs << std::endl;
|
||||
std::cout << GridLogDebug << "Observables total " << Observables.size() << std::endl;
|
||||
std::cout << GridLogDebug << "Observables pointer " << Observables[obs] << std::endl;
|
||||
Observables[obs]->TrajectoryComplete(traj + 1, Ucur, sRNG, pRNG);
|
||||
Observables[obs]->TrajectoryComplete(traj + 1, TheIntegrator.Smearer, sRNG, pRNG);
|
||||
}
|
||||
std::cout << GridLogHMC << ":::::::::::::::::::::::::::::::::::::::::::" << std::endl;
|
||||
}
|
||||
|
@ -35,13 +35,16 @@ class CheckpointerParameters : Serializable {
|
||||
public:
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(CheckpointerParameters,
|
||||
std::string, config_prefix,
|
||||
std::string, smeared_prefix,
|
||||
std::string, rng_prefix,
|
||||
int, saveInterval,
|
||||
bool, saveSmeared,
|
||||
std::string, format, );
|
||||
|
||||
CheckpointerParameters(std::string cf = "cfg", std::string rn = "rng",
|
||||
CheckpointerParameters(std::string cf = "cfg", std::string sf="cfg_smr" , std::string rn = "rng",
|
||||
int savemodulo = 1, const std::string &f = "IEEE64BIG")
|
||||
: config_prefix(cf),
|
||||
smeared_prefix(sf),
|
||||
rng_prefix(rn),
|
||||
saveInterval(savemodulo),
|
||||
format(f){};
|
||||
@ -61,13 +64,21 @@ template <class Impl>
|
||||
class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
|
||||
public:
|
||||
void build_filenames(int traj, CheckpointerParameters &Params,
|
||||
std::string &conf_file, std::string &rng_file) {
|
||||
std::string &conf_file,
|
||||
std::string &smear_file,
|
||||
std::string &rng_file) {
|
||||
{
|
||||
std::ostringstream os;
|
||||
os << Params.rng_prefix << "." << traj;
|
||||
rng_file = os.str();
|
||||
}
|
||||
|
||||
{
|
||||
std::ostringstream os;
|
||||
os << Params.smeared_prefix << "." << traj;
|
||||
smear_file = os.str();
|
||||
}
|
||||
|
||||
{
|
||||
std::ostringstream os;
|
||||
os << Params.config_prefix << "." << traj;
|
||||
@ -84,6 +95,11 @@ public:
|
||||
}
|
||||
virtual void initialize(const CheckpointerParameters &Params) = 0;
|
||||
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
typename Impl::Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) { assert(0); } ; // HMC should pass the smart config with smeared and unsmeared
|
||||
|
||||
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) = 0;
|
||||
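With the smeared_prefix and saveSmeared fields added above, a checkpointer can write the smeared links alongside the thin ones. A hedged sketch of filling the extended parameter block; the values are illustrative, and the result would then be handed to a checkpointer (for instance through the HMC resource manager) in the usual way.

// Hedged sketch: extended checkpointer parameters including the smeared prefix.
#include <Grid/Grid.h>
using namespace Grid;

CheckpointerParameters makeCheckpointParams(void)
{
  CheckpointerParameters P;
  P.config_prefix  = "ckpoint_lat";
  P.smeared_prefix = "ckpoint_lat_smr";  // new: prefix for the smeared configuration
  P.rng_prefix     = "ckpoint_rng";
  P.saveInterval   = 10;
  P.saveSmeared    = true;               // new: also write SmartConfig.get_U(true)
  P.format         = "IEEE64BIG";
  return P;
}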
|
@ -61,11 +61,14 @@ public:
|
||||
fout.close();
|
||||
}
|
||||
|
||||
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||
void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<Field> &SmartConfig,
|
||||
GridSerialRNG &sRNG, GridParallelRNG &pRNG)
|
||||
{
|
||||
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
|
||||
uint32_t nersc_csum;
|
||||
uint32_t scidac_csuma;
|
||||
@ -74,9 +77,15 @@ public:
|
||||
BinarySimpleUnmunger<sobj_double, sobj> munge;
|
||||
truncate(rng);
|
||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||
truncate(config);
|
||||
std::cout << GridLogMessage << "Written Binary RNG " << rng
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum <<"/"
|
||||
<< scidac_csuma <<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
|
||||
BinaryIO::writeLatticeObject<vobj, sobj_double>(U, config, munge, 0, Params.format,
|
||||
truncate(config);
|
||||
BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(false), config, munge, 0, Params.format,
|
||||
nersc_csum,scidac_csuma,scidac_csumb);
|
||||
|
||||
std::cout << GridLogMessage << "Written Binary Configuration " << config
|
||||
@ -85,6 +94,18 @@ public:
|
||||
<< scidac_csuma <<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
|
||||
if ( Params.saveSmeared ) {
|
||||
truncate(smr);
|
||||
BinaryIO::writeLatticeObject<vobj, sobj_double>(SmartConfig.get_U(true), smr, munge, 0, Params.format,
|
||||
nersc_csum,scidac_csuma,scidac_csumb);
|
||||
std::cout << GridLogMessage << "Written Binary Smeared Configuration " << smr
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum <<"/"
|
||||
<< scidac_csuma <<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -69,17 +69,27 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
|
||||
void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<GaugeField> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng;
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
GridBase *grid = U.Grid();
|
||||
GridBase *grid = SmartConfig.get_U(false).Grid();
|
||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||
std::cout << GridLogMessage << "Written BINARY RNG " << rng
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum<<"/"
|
||||
<< scidac_csuma<<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
|
||||
|
||||
IldgWriter _IldgWriter(grid->IsBoss());
|
||||
_IldgWriter.open(config);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(U, traj, config, config);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(false), traj, config, config);
|
||||
_IldgWriter.close();
|
||||
|
||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << config
|
||||
@ -88,6 +98,21 @@ public:
|
||||
<< scidac_csuma<<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
|
||||
if ( Params.saveSmeared ) {
|
||||
IldgWriter _IldgWriter(grid->IsBoss());
|
||||
_IldgWriter.open(smr);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
|
||||
_IldgWriter.close();
|
||||
|
||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum<<"/"
|
||||
<< scidac_csuma<<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
}
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -52,23 +52,29 @@ public:
|
||||
Params.format = "IEEE64BIG"; // fixed, overwrite any other choice
|
||||
}
|
||||
|
||||
void TrajectoryComplete(int traj, GaugeField &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<GaugeField> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG)
|
||||
{
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
|
||||
int precision32 = 1;
|
||||
int tworow = 0;
|
||||
NerscIO::writeRNGState(sRNG, pRNG, rng);
|
||||
NerscIO::writeConfiguration<GaugeStats>(U, config, tworow, precision32);
|
||||
NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(false), config, tworow, precision32);
|
||||
if ( Params.saveSmeared ) {
|
||||
NerscIO::writeConfiguration<GaugeStats>(SmartConfig.get_U(true), smr, tworow, precision32);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng );
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
@ -70,19 +70,37 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
||||
}
|
||||
}
|
||||
|
||||
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
|
||||
void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<Field> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
GridBase *grid = U.Grid();
|
||||
std::string config, rng,smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
GridBase *grid = SmartConfig.get_U(false).Grid();
|
||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||
ScidacWriter _ScidacWriter(grid->IsBoss());
|
||||
_ScidacWriter.open(config);
|
||||
_ScidacWriter.writeScidacFieldRecord(U, MData);
|
||||
_ScidacWriter.close();
|
||||
std::cout << GridLogMessage << "Written Binary RNG " << rng
|
||||
<< " checksum " << std::hex
|
||||
<< nersc_csum <<"/"
|
||||
<< scidac_csuma <<"/"
|
||||
<< scidac_csumb
|
||||
<< std::dec << std::endl;
|
||||
|
||||
|
||||
{
|
||||
ScidacWriter _ScidacWriter(grid->IsBoss());
|
||||
_ScidacWriter.open(config);
|
||||
_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(false), MData);
|
||||
_ScidacWriter.close();
|
||||
}
|
||||
|
||||
if ( Params.saveSmeared ) {
|
||||
ScidacWriter _ScidacWriter(grid->IsBoss());
|
||||
_ScidacWriter.open(smr);
|
||||
_ScidacWriter.writeScidacFieldRecord(SmartConfig.get_U(true), MData);
|
||||
_ScidacWriter.close();
|
||||
}
|
||||
std::cout << GridLogMessage << "Written Scidac Configuration on " << config << std::endl;
|
||||
}
|
||||
};
|
||||
|
@ -66,6 +66,7 @@ public:
|
||||
template <class FieldImplementation_, class SmearingPolicy, class RepresentationPolicy>
|
||||
class Integrator {
|
||||
protected:
|
||||
public:
|
||||
typedef FieldImplementation_ FieldImplementation;
|
||||
typedef typename FieldImplementation::Field MomentaField; //for readability
|
||||
typedef typename FieldImplementation::Field Field;
|
||||
@ -96,7 +97,6 @@ protected:
|
||||
{
|
||||
t_P[level] += ep;
|
||||
update_P(P, U, level, ep);
|
||||
|
||||
std::cout << GridLogIntegrator << "[" << level << "] P " << " dt " << ep << " : t_P " << t_P[level] << std::endl;
|
||||
}
|
||||
|
||||
@ -130,30 +130,20 @@ protected:
|
||||
Field force(U.Grid());
|
||||
conformable(U.Grid(), Mom.Grid());
|
||||
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(a)->is_smeared);
|
||||
double start_force = usecond();
|
||||
|
||||
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
|
||||
AUDIT();
|
||||
|
||||
as[level].actions.at(a)->deriv_timer_start();
|
||||
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
|
||||
as[level].actions.at(a)->deriv(Smearer, force); // deriv should NOT include Ta
|
||||
as[level].actions.at(a)->deriv_timer_stop();
|
||||
|
||||
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
|
||||
AUDIT();
|
||||
|
||||
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
|
||||
auto name = as[level].actions.at(a)->action_name();
|
||||
if (as[level].actions.at(a)->is_smeared) Smearer.smeared_force(force);
|
||||
|
||||
force = FieldImplementation::projectForce(force); // Ta for gauge fields
|
||||
double end_force = usecond();
|
||||
|
||||
// DumpSliceNorm("force ",force,Nd-1);
|
||||
|
||||
MomFilter->applyFilter(force);
|
||||
|
||||
std::cout << GridLogIntegrator << " update_P : Level [" << level <<"]["<<a <<"] "<<name<<" dt "<<ep<< std::endl;
|
||||
DumpSliceNorm("force filtered ",force,Nd-1);
|
||||
|
||||
Real force_abs = std::sqrt(norm2(force)/U.Grid()->gSites()); //average per-site norm. nb. norm2(latt) = \sum_x norm2(latt[x])
|
||||
Real impulse_abs = force_abs * ep * HMC_MOMENTUM_DENOMINATOR;
|
||||
@ -379,15 +369,10 @@ public:
|
||||
auto name = as[level].actions.at(actionID)->action_name();
|
||||
std::cout << GridLogMessage << "refresh [" << level << "][" << actionID << "] "<<name << std::endl;
|
||||
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||
|
||||
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
|
||||
AUDIT();
|
||||
as[level].actions.at(actionID)->refresh_timer_start();
|
||||
as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
|
||||
as[level].actions.at(actionID)->refresh(Smearer, sRNG, pRNG);
|
||||
as[level].actions.at(actionID)->refresh_timer_stop();
|
||||
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
|
||||
AUDIT();
|
||||
|
||||
}
|
||||
|
||||
// Refresh the higher representation actions
|
||||
@ -424,17 +409,16 @@ public:
|
||||
// Actions
|
||||
for (int level = 0; level < as.size(); ++level) {
|
||||
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
|
||||
AUDIT();
|
||||
|
||||
// get gauge field from the SmearingPolicy and
|
||||
// based on the boolean is_smeared in actionID
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
|
||||
as[level].actions.at(actionID)->S_timer_start();
|
||||
Hterm = as[level].actions.at(actionID)->S(Us);
|
||||
Hterm = as[level].actions.at(actionID)->S(Smearer);
|
||||
as[level].actions.at(actionID)->S_timer_stop();
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
|
||||
H += Hterm;
|
||||
AUDIT();
|
||||
|
||||
}
|
||||
as[level].apply(S_hireps, Representations, level, H);
|
||||
}
|
||||
@ -447,9 +431,9 @@ public:
|
||||
void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) {
|
||||
|
||||
for (int a = 0; a < repr_set.size(); ++a) {
|
||||
AUDIT();
|
||||
|
||||
RealD Hterm = repr_set.at(a)->Sinitial(Rep.U);
|
||||
AUDIT();
|
||||
|
||||
std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl;
|
||||
H += Hterm;
|
||||
|
||||
@ -471,13 +455,12 @@ public:
|
||||
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
|
||||
// get gauge field from the SmearingPolicy and
|
||||
// based on the boolean is_smeared in actionID
|
||||
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
|
||||
as[level].actions.at(actionID)->S_timer_start();
|
||||
AUDIT();
|
||||
Hterm = as[level].actions.at(actionID)->Sinitial(Us);
|
||||
as[level].actions.at(actionID)->S_timer_stop();
|
||||
AUDIT();
|
||||
|
||||
as[level].actions.at(actionID)->S_timer_start();
|
||||
Hterm = as[level].actions.at(actionID)->S(Smearer);
|
||||
as[level].actions.at(actionID)->S_timer_stop();
|
||||
|
||||
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
|
||||
H += Hterm;
|
||||
}
|
||||
@ -490,7 +473,6 @@ public:
|
||||
|
||||
void integrate(Field& U)
|
||||
{
|
||||
AUDIT();
|
||||
// reset the clocks
|
||||
t_U = 0;
|
||||
for (int level = 0; level < as.size(); ++level) {
|
||||
@ -508,10 +490,8 @@ public:
|
||||
assert(fabs(t_U - t_P[level]) < 1.0e-6); // must be the same
|
||||
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
|
||||
}
|
||||
AUDIT();
|
||||
|
||||
FieldImplementation::Project(U);
|
||||
AUDIT();
|
||||
|
||||
// and that we indeed got to the end of the trajectory
|
||||
assert(fabs(t_U - Params.trajL) < 1.0e-6);
|
||||
|
@ -34,6 +34,13 @@ NAMESPACE_BEGIN(Grid);
|
||||
template <class Field>
|
||||
class HmcObservable {
|
||||
public:
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<Field> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG)
|
||||
{
|
||||
TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
|
||||
};
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
|
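The default ConfigurationBase overload above simply forwards to the plain-field version with the unsmeared links, so existing observables keep working unchanged; an observable overrides the SmartConfig form only when it really wants to see both smeared and unsmeared fields, as the plaquette logger in the next hunk does. A hedged sketch of such an override; it assumes a smearing policy is attached, since get_U(true) otherwise has nothing sensible to return.

// Hedged sketch: an observable that reports both thin and smeared link norms.
#include <Grid/Grid.h>

template<class Field>
class LinkNormLogger : public Grid::HmcObservable<Field> {
public:
  virtual void TrajectoryComplete(int traj,
                                  Grid::ConfigurationBase<Field> &SmartConfig,
                                  Grid::GridSerialRNG &sRNG,
                                  Grid::GridParallelRNG &pRNG)
  {
    std::cout << Grid::GridLogMessage << "Traj " << traj
              << " |U|^2 thin "    << norm2(SmartConfig.get_U(false))
              << " |U|^2 smeared " << norm2(SmartConfig.get_U(true)) << std::endl;
  }
  // Plain-field entry point still required by the base class interface.
  virtual void TrajectoryComplete(int traj, Field &U,
                                  Grid::GridSerialRNG &sRNG,
                                  Grid::GridParallelRNG &pRNG) {}
};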
@ -42,6 +42,18 @@ public:
|
||||
// necessary for HmcObservable compatibility
|
||||
typedef typename Impl::Field Field;
|
||||
|
||||
virtual void TrajectoryComplete(int traj,
|
||||
ConfigurationBase<Field> &SmartConfig,
|
||||
GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG)
|
||||
{
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
|
||||
std::cout << GridLogMessage << "Unsmeared plaquette"<<std::endl;
|
||||
TrajectoryComplete(traj,SmartConfig.get_U(false),sRNG,pRNG); // Unsmeared observable
|
||||
std::cout << GridLogMessage << "Smeared plaquette"<<std::endl;
|
||||
TrajectoryComplete(traj,SmartConfig.get_U(true),sRNG,pRNG); // Unsmeared observable
|
||||
std::cout << GridLogMessage << "+++++++++++++++++++"<<std::endl;
|
||||
};
|
||||
void TrajectoryComplete(int traj,
|
||||
Field &U,
|
||||
GridSerialRNG &sRNG,
|
||||
|
@ -7,26 +7,27 @@
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
//trivial class for no smearing
|
||||
template< class Impl >
|
||||
class NoSmearing
|
||||
class NoSmearing : public ConfigurationBase<typename Impl::Field>
|
||||
{
|
||||
public:
|
||||
INHERIT_FIELD_TYPES(Impl);
|
||||
|
||||
Field* ThinField;
|
||||
Field* ThinLinks;
|
||||
|
||||
NoSmearing(): ThinField(NULL) {}
|
||||
NoSmearing(): ThinLinks(NULL) {}
|
||||
|
||||
void set_Field(Field& U) { ThinField = &U; }
|
||||
virtual void set_Field(Field& U) { ThinLinks = &U; }
|
||||
|
||||
void smeared_force(Field&) const {}
|
||||
virtual void smeared_force(Field&) {}
|
||||
|
||||
Field& get_SmearedU() { return *ThinField; }
|
||||
virtual Field& get_SmearedU() { return *ThinLinks; }
|
||||
|
||||
Field &get_U(bool smeared = false)
|
||||
virtual Field &get_U(bool smeared = false)
|
||||
{
|
||||
return *ThinField;
|
||||
return *ThinLinks;
|
||||
}
|
||||
};
|
||||
|
||||
@ -42,19 +43,24 @@ public:
|
||||
It stores a list of smeared configurations.
|
||||
*/
|
||||
template <class Gimpl>
|
||||
class SmearedConfiguration
|
||||
class SmearedConfiguration : public ConfigurationBase<typename Gimpl::Field>
|
||||
{
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
private:
|
||||
protected:
|
||||
const unsigned int smearingLevels;
|
||||
Smear_Stout<Gimpl> *StoutSmearing;
|
||||
std::vector<GaugeField> SmearedSet;
|
||||
|
||||
public:
|
||||
GaugeField* ThinLinks; /* Pointer to the thin links configuration */ // move to base???
|
||||
protected:
|
||||
|
||||
// Member functions
|
||||
//====================================================================
|
||||
void fill_smearedSet(GaugeField &U)
|
||||
|
||||
// Overridden in masked version
|
||||
virtual void fill_smearedSet(GaugeField &U)
|
||||
{
|
||||
ThinLinks = &U; // attach the smearing routine to the field U
|
||||
|
||||
@ -82,9 +88,10 @@ private:
|
||||
}
|
||||
}
|
||||
}
|
||||
//====================================================================
|
||||
GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
|
||||
const GaugeField& GaugeK) const
|
||||
|
||||
//overridden in masked verson
|
||||
virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
|
||||
const GaugeField& GaugeK) const
|
||||
{
|
||||
GridBase* grid = GaugeK.Grid();
|
||||
GaugeField C(grid), SigmaK(grid), iLambda(grid);
|
||||
@ -213,8 +220,6 @@ private:
|
||||
|
||||
//====================================================================
|
||||
public:
|
||||
GaugeField*
|
||||
ThinLinks; /* Pointer to the thin links configuration */
|
||||
|
||||
/* Standard constructor */
|
||||
SmearedConfiguration(GridCartesian* UGrid, unsigned int Nsmear,
|
||||
@ -230,7 +235,7 @@ public:
|
||||
: smearingLevels(0), StoutSmearing(nullptr), SmearedSet(), ThinLinks(NULL) {}
|
||||
|
||||
// attach the smeared routines to the thin links U and fill the smeared set
|
||||
void set_Field(GaugeField &U)
|
||||
virtual void set_Field(GaugeField &U)
|
||||
{
|
||||
double start = usecond();
|
||||
fill_smearedSet(U);
|
||||
@ -240,7 +245,7 @@ public:
|
||||
}
|
||||
|
||||
//====================================================================
|
||||
void smeared_force(GaugeField &SigmaTilde) const
|
||||
virtual void smeared_force(GaugeField &SigmaTilde)
|
||||
{
|
||||
if (smearingLevels > 0)
|
||||
{
|
||||
@ -267,14 +272,16 @@ public:
|
||||
}
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << "Smearing force in " << time << " ms" << std::endl;
|
||||
std::cout << GridLogMessage << " GaugeConfiguration: Smeared Force chain rule took " << time << " ms" << std::endl;
|
||||
} // if smearingLevels = 0 do nothing
|
||||
SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
|
||||
|
||||
}
|
||||
//====================================================================
|
||||
|
||||
GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
|
||||
virtual GaugeField& get_SmearedU() { return SmearedSet[smearingLevels - 1]; }
|
||||
|
||||
GaugeField &get_U(bool smeared = false)
|
||||
virtual GaugeField &get_U(bool smeared = false)
|
||||
{
|
||||
// get the config, thin links by default
|
||||
if (smeared)
|
||||
|
813
Grid/qcd/smearing/GaugeConfigurationMasked.h
Normal file
@@ -0,0 +1,813 @@
|
||||
/*!
|
||||
@file GaugeConfiguration.h
|
||||
@brief Declares the GaugeConfiguration class
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/*!
|
||||
@brief Smeared configuration masked container
|
||||
Modified for a multi-subset smearing (aka Luscher Flowed HMC)
|
||||
*/
|
||||
template <class Gimpl>
|
||||
class SmearedConfigurationMasked : public SmearedConfiguration<Gimpl>
|
||||
{
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
private:
|
||||
// These live in base class
|
||||
// const unsigned int smearingLevels;
|
||||
// Smear_Stout<Gimpl> *StoutSmearing;
|
||||
// std::vector<GaugeField> SmearedSet;
|
||||
|
||||
std::vector<LatticeLorentzComplex> masks;
|
||||
|
||||
typedef typename SU3Adjoint::AMatrix AdjMatrix;
|
||||
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
|
||||
typedef typename SU3Adjoint::LatticeAdjVector AdjVectorField;
|
||||
|
||||
// Adjoint vector to GaugeField force
|
||||
void InsertForce(GaugeField &Fdet,AdjVectorField &Fdet_nu,int nu)
|
||||
{
|
||||
Complex ci(0,1);
|
||||
GaugeLinkField Fdet_pol(Fdet.Grid());
|
||||
Fdet_pol=Zero();
|
||||
for(int e=0;e<8;e++){
|
||||
ColourMatrix te;
|
||||
SU3::generator(e, te);
|
||||
auto tmp=peekColour(Fdet_nu,e);
|
||||
Fdet_pol=Fdet_pol + ci*tmp*te; // but norm of te is different.. why?
|
||||
}
|
||||
pokeLorentz(Fdet, Fdet_pol, nu);
|
||||
}
|
||||
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
|
||||
{
|
||||
GaugeLinkField UtaU(PlaqL.Grid());
|
||||
GaugeLinkField D(PlaqL.Grid());
|
||||
AdjMatrixField Dbc(PlaqL.Grid());
|
||||
LatticeComplex tmp(PlaqL.Grid());
|
||||
const int Ngen = SU3Adjoint::Dimension;
|
||||
Complex ci(0,1);
|
||||
ColourMatrix ta,tb,tc;
|
||||
|
||||
for(int a=0;a<Ngen;a++) {
|
||||
SU3::generator(a, ta);
|
||||
// Qlat Tb = 2i Tb^Grid
|
||||
UtaU= 2.0*ci*adj(PlaqL)*ta*PlaqR;
|
||||
for(int c=0;c<Ngen;c++) {
|
||||
SU3::generator(c, tc);
|
||||
D = Ta( (2.0)*ci*tc *UtaU);
|
||||
for(int b=0;b<Ngen;b++){
|
||||
SU3::generator(b, tb);
|
||||
tmp =-trace(ci*tb*D);
|
||||
PokeIndex<ColourIndex>(Dbc,tmp,b,c); // Adjoint rep
|
||||
}
|
||||
}
|
||||
tmp = trace(MpInvJx * Dbc);
|
||||
PokeIndex<ColourIndex>(Fdet2,tmp,a);
|
||||
}
|
||||
}
|
||||
|
||||
void ComputeNxy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR,AdjMatrixField &NxAd)
|
||||
{
|
||||
GaugeLinkField Nx(PlaqL.Grid());
|
||||
const int Ngen = SU3Adjoint::Dimension;
|
||||
Complex ci(0,1);
|
||||
ColourMatrix tb;
|
||||
ColourMatrix tc;
|
||||
for(int b=0;b<Ngen;b++) {
|
||||
SU3::generator(b, tb);
|
||||
Nx = (2.0)*Ta( adj(PlaqL)*ci*tb * PlaqR );
|
||||
for(int c=0;c<Ngen;c++) {
|
||||
SU3::generator(c, tc);
|
||||
auto tmp =closure( -trace(ci*tc*Nx));
|
||||
PokeIndex<ColourIndex>(NxAd,tmp,c,b);
|
||||
}
|
||||
}
|
||||
}
|
||||
void ApplyMask(GaugeField &U,int smr)
|
||||
{
|
||||
LatticeComplex tmp(U.Grid());
|
||||
GaugeLinkField Umu(U.Grid());
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
Umu=PeekIndex<LorentzIndex>(U,mu);
|
||||
tmp=PeekIndex<LorentzIndex>(masks[smr],mu);
|
||||
Umu=Umu*tmp;
|
||||
PokeIndex<LorentzIndex>(U, Umu, mu);
|
||||
}
|
||||
}
|
||||
public:
|
||||
|
||||
void logDetJacobianForceLevel(const GaugeField &U, GaugeField &force ,int smr)
|
||||
{
|
||||
GridBase* grid = U.Grid();
|
||||
ColourMatrix tb;
|
||||
ColourMatrix tc;
|
||||
ColourMatrix ta;
|
||||
GaugeField C(grid);
|
||||
GaugeField Umsk(grid);
|
||||
std::vector<GaugeLinkField> Umu(Nd,grid);
|
||||
GaugeLinkField Cmu(grid); // U and staple; C contains factor of epsilon
|
||||
GaugeLinkField Zx(grid); // U times Staple, contains factor of epsilon
|
||||
GaugeLinkField Nxx(grid); // Nxx fundamental space
|
||||
GaugeLinkField Utmp(grid);
|
||||
GaugeLinkField PlaqL(grid);
|
||||
GaugeLinkField PlaqR(grid);
|
||||
const int Ngen = SU3Adjoint::Dimension;
|
||||
AdjMatrix TRb;
|
||||
ColourMatrix Ident;
|
||||
LatticeComplex cplx(grid);
|
||||
|
||||
AdjVectorField dJdXe_nMpInv(grid);
|
||||
AdjVectorField dJdXe_nMpInv_y(grid);
|
||||
AdjMatrixField MpAd(grid); // Mprime in Luchang's notes
|
||||
AdjMatrixField MpAdInv(grid); // Mprime inverse
|
||||
AdjMatrixField NxxAd(grid); // Nxx in adjoint space
|
||||
AdjMatrixField JxAd(grid);
|
||||
AdjMatrixField ZxAd(grid);
|
||||
AdjMatrixField mZxAd(grid);
|
||||
AdjMatrixField X(grid);
|
||||
Complex ci(0,1);
|
||||
|
||||
RealD t0 = usecond();
|
||||
Ident = ComplexD(1.0);
|
||||
for(int d=0;d<Nd;d++){
|
||||
Umu[d] = peekLorentz(U, d);
|
||||
}
|
||||
int mu= (smr/2) %Nd;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Mask the gauge field
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
|
||||
|
||||
Umsk = U;
|
||||
ApplyMask(Umsk,smr);
|
||||
Utmp = peekLorentz(Umsk,mu);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Retrieve the eps/rho parameter(s) -- could allow all different but not so far
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
double rho=this->StoutSmearing->SmearRho[1];
|
||||
int idx=0;
|
||||
for(int mu=0;mu<4;mu++){
|
||||
for(int nu=0;nu<4;nu++){
|
||||
if ( mu!=nu) assert(this->StoutSmearing->SmearRho[idx]==rho);
|
||||
else assert(this->StoutSmearing->SmearRho[idx]==0.0);
|
||||
idx++;
|
||||
}}
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Assemble the N matrix
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Computes ALL the staples -- could compute only the one needed here
|
||||
RealD time;
|
||||
time=-usecond();
|
||||
this->StoutSmearing->BaseSmear(C, U);
|
||||
Cmu = peekLorentz(C, mu);
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Assemble Luscher exp diff map J matrix
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Ta so Z lives in the Lie algebra
|
||||
Zx = Ta(Cmu * adj(Umu[mu]));
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "Z took "<<time<< " us"<<std::endl;
|
||||
|
||||
time=-usecond();
|
||||
// Move Z to the Adjoint Rep == make_adjoint_representation
|
||||
ZxAd = Zero();
|
||||
for(int b=0;b<8;b++) {
|
||||
// Adj group sets traceless antihermitian T's -- Guido, really????
|
||||
SU3::generator(b, tb); // Fund group sets traceless hermitian T's
|
||||
SU3Adjoint::generator(b,TRb);
|
||||
TRb=-TRb;
|
||||
cplx = 2.0*trace(ci*tb*Zx); // my convention 1/2 delta ba
|
||||
ZxAd = ZxAd + cplx * TRb; // is this right? YES - Guido used Anti herm Ta's and with bloody wrong sign.
|
||||
}
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "ZxAd took "<<time<< " us"<<std::endl;
|
||||
|
||||
//////////////////////////////////////
|
||||
// J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
|
||||
//////////////////////////////////////
|
||||
time=-usecond();
|
||||
X=1.0;
|
||||
JxAd = X;
|
||||
mZxAd = (-1.0)*ZxAd;
|
||||
RealD kpfac = 1;
|
||||
for(int k=1;k<12;k++){
|
||||
X=X*mZxAd;
|
||||
kpfac = kpfac /(k+1);
|
||||
JxAd = JxAd + X * kpfac;
|
||||
}
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "Jx took "<<time<< " us"<<std::endl;
|
||||
|
||||
//////////////////////////////////////
|
||||
// dJ(x)/dxe
|
||||
//////////////////////////////////////
|
||||
time=-usecond();
|
||||
std::vector<AdjMatrixField> dJdX; dJdX.resize(8,grid);
|
||||
AdjMatrixField tbXn(grid);
|
||||
AdjMatrixField sumXtbX(grid);
|
||||
AdjMatrixField t2(grid);
|
||||
AdjMatrixField dt2(grid);
|
||||
AdjMatrixField t3(grid);
|
||||
AdjMatrixField dt3(grid);
|
||||
AdjMatrixField aunit(grid);
|
||||
for(int b=0;b<8;b++){
|
||||
aunit = ComplexD(1.0);
|
||||
SU3Adjoint::generator(b, TRb); //dt2
|
||||
|
||||
X = (-1.0)*ZxAd;
|
||||
t2 = X;
|
||||
dt2 = TRb;
|
||||
for (int j = 20; j > 1; --j) {
|
||||
t3 = t2*(1.0 / (j + 1)) + aunit;
|
||||
dt3 = dt2*(1.0 / (j + 1));
|
||||
t2 = X * t3;
|
||||
dt2 = TRb * t3 + X * dt3;
|
||||
}
|
||||
dJdX[b] = -dt2;
|
||||
}
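// Backward (Horner-style) recursion: t2 rebuilds the truncated series sum_k X^k/(k+1)! while dt2
// carries its directional derivative along generator b via the product rule; the overall minus
// sign in dJdX[b] accounts for X = -Z.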
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "dJx took "<<time<< " us"<<std::endl;
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Mask Umu for this link
|
||||
/////////////////////////////////////////////////////////////////
|
||||
time=-usecond();
|
||||
PlaqL = Ident;
|
||||
PlaqR = Utmp*adj(Cmu);
|
||||
ComputeNxy(PlaqL,PlaqR,NxxAd);
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "ComputeNxy took "<<time<< " us"<<std::endl;
|
||||
|
||||
////////////////////////////
|
||||
// Mab
|
||||
////////////////////////////
|
||||
MpAd = Complex(1.0,0.0);
|
||||
MpAd = MpAd - JxAd * NxxAd;
|
||||
|
||||
/////////////////////////
|
||||
// invert the 8x8
|
||||
/////////////////////////
|
||||
time=-usecond();
|
||||
MpAdInv = Inverse(MpAd);
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "MpAdInv took "<<time<< " us"<<std::endl;
|
||||
|
||||
RealD t3a = usecond();
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// Nxx Mp^-1
|
||||
/////////////////////////////////////////////////////////////////
|
||||
AdjVectorField FdetV(grid);
|
||||
AdjVectorField Fdet1_nu(grid);
|
||||
AdjVectorField Fdet2_nu(grid);
|
||||
AdjVectorField Fdet2_mu(grid);
|
||||
AdjVectorField Fdet1_mu(grid);
|
||||
|
||||
AdjMatrixField nMpInv(grid);
|
||||
nMpInv= NxxAd *MpAdInv;
|
||||
|
||||
AdjMatrixField MpInvJx(grid);
|
||||
AdjMatrixField MpInvJx_nu(grid);
|
||||
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
|
||||
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
||||
Fdet2_mu=FdetV;
|
||||
Fdet1_mu=Zero();
|
||||
|
||||
for(int e =0 ; e<8 ; e++){
|
||||
LatticeComplexD tr(grid);
|
||||
ColourMatrix te;
|
||||
SU3::generator(e, te);
|
||||
tr = trace(dJdX[e] * nMpInv);
|
||||
pokeColour(dJdXe_nMpInv,tr,e);
|
||||
}
|
||||
///////////////////////////////
|
||||
// Mask it off
|
||||
///////////////////////////////
|
||||
auto tmp=PeekIndex<LorentzIndex>(masks[smr],mu);
|
||||
dJdXe_nMpInv = dJdXe_nMpInv*tmp;
|
||||
|
||||
// dJdXe_nMpInv needs to multiply:
|
||||
// Nxx_mu (site local) (1)
|
||||
// Nxy_mu one site forward in each nu direction (3)
|
||||
// Nxy_mu one site backward in each nu direction (3)
|
||||
// Nxy_nu 0,0 ; +mu,0; 0,-nu; +mu-nu [ 3x4 = 12]
|
||||
// 19 terms.
|
||||
AdjMatrixField Nxy(grid);
|
||||
|
||||
GaugeField Fdet1(grid);
|
||||
GaugeField Fdet2(grid);
|
||||
GaugeLinkField Fdet_pol(grid); // one polarisation
|
||||
|
||||
RealD t4 = usecond();
|
||||
for(int nu=0;nu<Nd;nu++){
|
||||
|
||||
if (nu!=mu) {
|
||||
///////////////// +ve nu /////////////////
|
||||
// __
|
||||
// | |
|
||||
// x== // nu polarisation -- clockwise
|
||||
|
||||
time=-usecond();
|
||||
PlaqL=Ident;
|
||||
|
||||
PlaqR=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftForward(Umu[mu], mu,
|
||||
Gimpl::CovShiftBackward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu))));
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "PlaqLR took "<<time<< " us"<<std::endl;
|
||||
|
||||
time=-usecond();
|
||||
dJdXe_nMpInv_y = dJdXe_nMpInv;
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_nu = transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "ComputeNxy (occurs 6x) took "<<time<< " us"<<std::endl;
|
||||
|
||||
time=-usecond();
|
||||
PlaqR=(-1.0)*PlaqR;
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
||||
Fdet2_nu = FdetV;
|
||||
time+=usecond();
|
||||
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
|
||||
|
||||
// x==
|
||||
// | |
|
||||
// .__| // nu polarisation -- anticlockwise
|
||||
|
||||
PlaqR=(rho)*Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftBackward(Umu[mu], mu,
|
||||
Gimpl::CovShiftIdentityBackward(Umu[nu], nu)));
|
||||
|
||||
PlaqL=Gimpl::CovShiftIdentityBackward(Utmp, mu);
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
|
||||
ComputeNxy(PlaqL, PlaqR,Nxy);
|
||||
Fdet1_nu = Fdet1_nu+transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_nu = Fdet2_nu+FdetV;
|
||||
|
||||
///////////////// -ve nu /////////////////
|
||||
// __
|
||||
// | |
|
||||
// x== // nu polarisation -- clockwise
|
||||
|
||||
PlaqL=(rho)* Gimpl::CovShiftForward(Umu[mu], mu,
|
||||
Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu)));
|
||||
|
||||
PlaqR = Gimpl::CovShiftIdentityForward(Umu[nu], nu);
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_nu = Fdet2_nu+FdetV;
|
||||
|
||||
// x==
|
||||
// | |
|
||||
// |__| // nu polarisation
|
||||
|
||||
PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu));
|
||||
|
||||
PlaqR=Gimpl::CovShiftBackward(Umu[mu], mu,
|
||||
Gimpl::CovShiftIdentityForward(Umu[nu], nu));
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,mu,-1);
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv_y,nu,1);
|
||||
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_nu = Fdet2_nu+FdetV;
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// Set up the determinant force contribution in 3x3 algebra basis
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
InsertForce(Fdet1,Fdet1_nu,nu);
|
||||
InsertForce(Fdet2,Fdet2_nu,nu);
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// Parallel direction terms
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// __
|
||||
// | "
|
||||
// |__"x // mu polarisation
|
||||
PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
|
||||
Gimpl::CovShiftBackward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu)));
|
||||
|
||||
PlaqR=Gimpl::CovShiftIdentityBackward(Umu[nu], nu);
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,-1);
|
||||
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,nu,-1);
|
||||
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_mu = Fdet2_mu+FdetV;
|
||||
|
||||
// __
|
||||
// " |
|
||||
// x__| // mu polarisation
|
||||
|
||||
PlaqL=(-rho)*Gimpl::CovShiftForward(Umu[mu], mu,
|
||||
Gimpl::CovShiftForward(Umu[nu], nu,
|
||||
Gimpl::CovShiftIdentityBackward(Utmp, mu)));
|
||||
|
||||
PlaqR=Gimpl::CovShiftIdentityForward(Umu[nu], nu);
|
||||
|
||||
dJdXe_nMpInv_y = Cshift(dJdXe_nMpInv,nu,1);
|
||||
|
||||
ComputeNxy(PlaqL,PlaqR,Nxy);
|
||||
Fdet1_mu = Fdet1_mu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||
|
||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||
|
||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||
Fdet2_mu = Fdet2_mu+FdetV;
|
||||
|
||||
}
|
||||
}
|
||||
RealD t5 = usecond();
|
||||
|
||||
Fdet1_mu = Fdet1_mu + transpose(NxxAd)*dJdXe_nMpInv;
|
||||
|
||||
InsertForce(Fdet1,Fdet1_mu,mu);
|
||||
InsertForce(Fdet2,Fdet2_mu,mu);
|
||||
|
||||
force= (-0.5)*( Fdet1 + Fdet2);
|
||||
RealD t1 = usecond();
|
||||
std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce t3-t0 "<<t3a-t0<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce t4-t3 dJdXe_nMpInv "<<t4-t3a<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce t5-t4 mu nu loop "<<t5-t4<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce t1-t5 "<<t1-t5<<" us "<<std::endl;
|
||||
std::cout << GridLogMessage << " logDetJacobianForce level took "<<t1-t0<<" us "<<std::endl;
|
||||
}
|
||||
RealD logDetJacobianLevel(const GaugeField &U,int smr)
|
||||
{
|
||||
GridBase* grid = U.Grid();
|
||||
GaugeField C(grid);
|
||||
GaugeLinkField Nb(grid);
|
||||
GaugeLinkField Z(grid);
|
||||
GaugeLinkField Umu(grid), Cmu(grid);
|
||||
ColourMatrix Tb;
|
||||
ColourMatrix Tc;
|
||||
typedef typename SU3Adjoint::AMatrix AdjMatrix;
|
||||
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
|
||||
typedef typename SU3Adjoint::LatticeAdjVector AdjVectorField;
|
||||
const int Ngen = SU3Adjoint::Dimension;
|
||||
AdjMatrix TRb;
|
||||
LatticeComplex cplx(grid);
|
||||
AdjVectorField AlgV(grid);
|
||||
AdjMatrixField Mab(grid);
|
||||
AdjMatrixField Ncb(grid);
|
||||
AdjMatrixField Jac(grid);
|
||||
AdjMatrixField Zac(grid);
|
||||
AdjMatrixField mZac(grid);
|
||||
AdjMatrixField X(grid);
|
||||
|
||||
int mu= (smr/2) %Nd;
|
||||
|
||||
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Assemble the N matrix
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Computes ALL the staples -- could compute only the one needed here
|
||||
this->StoutSmearing->BaseSmear(C, U);
|
||||
Cmu = peekLorentz(C, mu);
|
||||
Umu = peekLorentz(U, mu);
|
||||
Complex ci(0,1);
|
||||
for(int b=0;b<Ngen;b++) {
|
||||
SU3::generator(b, Tb);
|
||||
// Qlat Tb = 2i Tb^Grid
|
||||
Nb = (2.0)*Ta( ci*Tb * Umu * adj(Cmu));
|
||||
for(int c=0;c<Ngen;c++) {
|
||||
SU3::generator(c, Tc);
|
||||
auto tmp = -trace(ci*Tc*Nb); // Luchang's norm: (2Tc) (2Td) N^db = -2 delta cd N^db // - was important
|
||||
PokeIndex<ColourIndex>(Ncb,tmp,c,b);
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Assemble Luscher exp diff map J matrix
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// Ta so Z lives in the Lie algebra
|
||||
Z = Ta(Cmu * adj(Umu));
|
||||
|
||||
// Move Z to the Adjoint Rep == make_adjoint_representation
|
||||
Zac = Zero();
|
||||
for(int b=0;b<8;b++) {
|
||||
// Adj group sets traceless antihermitian T's -- Guido, really????
|
||||
// Is the mapping of these the same? Same structure constants
|
||||
// Might never have been checked.
|
||||
SU3::generator(b, Tb); // Fund group sets traceless hermitian T's
|
||||
SU3Adjoint::generator(b,TRb);
|
||||
TRb=-TRb;
|
||||
cplx = 2.0*trace(ci*Tb*Z); // my convention 1/2 delta ba
|
||||
Zac = Zac + cplx * TRb; // is this right? YES - Guido used Anti herm Ta's and with bloody wrong sign.
|
||||
}
|
||||
|
||||
//////////////////////////////////////
|
||||
// J(x) = 1 + Sum_k=1..N (-Zac)^k/(k+1)!
|
||||
//////////////////////////////////////
|
||||
X=1.0;
|
||||
Jac = X;
|
||||
mZac = (-1.0)*Zac;
|
||||
RealD kpfac = 1;
|
||||
for(int k=1;k<12;k++){
|
||||
X=X*mZac;
|
||||
kpfac = kpfac /(k+1);
|
||||
Jac = Jac + X * kpfac;
|
||||
}
|
||||
|
||||
////////////////////////////
|
||||
// Mab
|
||||
////////////////////////////
|
||||
Mab = Complex(1.0,0.0);
|
||||
Mab = Mab - Jac * Ncb;
|
||||
|
||||
////////////////////////////
|
||||
// det
|
||||
////////////////////////////
|
||||
LatticeComplex det(grid);
|
||||
det = Determinant(Mab);
|
||||
|
||||
////////////////////////////
|
||||
// ln det
|
||||
////////////////////////////
|
||||
LatticeComplex ln_det(grid);
|
||||
ln_det = log(det);
|
||||
|
||||
////////////////////////////
|
||||
// Masked sum
|
||||
////////////////////////////
|
||||
ln_det = ln_det * mask;
|
||||
Complex result = sum(ln_det);
|
||||
return result.real();
|
||||
}
|
||||
public:
|
||||
RealD logDetJacobian(void)
|
||||
{
|
||||
RealD ln_det = 0;
|
||||
if (this->smearingLevels > 0)
|
||||
{
|
||||
double start = usecond();
|
||||
for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
|
||||
ln_det+= logDetJacobianLevel(this->get_smeared_conf(ismr-1),ismr);
|
||||
}
|
||||
ln_det +=logDetJacobianLevel(*(this->ThinLinks),0);
|
||||
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << "GaugeConfigurationMasked: logDetJacobian took " << time << " ms" << std::endl;
|
||||
}
|
||||
return ln_det;
|
||||
}
|
||||
void logDetJacobianForce(GaugeField &force)
|
||||
{
|
||||
force =Zero();
|
||||
GaugeField force_det(force.Grid());
|
||||
|
||||
if (this->smearingLevels > 0)
|
||||
{
|
||||
double start = usecond();
|
||||
|
||||
GaugeLinkField tmp_mu(force.Grid());
|
||||
|
||||
for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
|
||||
|
||||
// remove U in UdSdU...
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
tmp_mu = adj(peekLorentz(this->get_smeared_conf(ismr), mu)) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
// Propagate existing force
|
||||
force = this->AnalyticSmearedForce(force, this->get_smeared_conf(ismr - 1), ismr);
|
||||
|
||||
// Add back U in UdSdU...
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
tmp_mu = peekLorentz(this->get_smeared_conf(ismr - 1), mu) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
// Get this levels determinant force
|
||||
force_det = Zero();
|
||||
logDetJacobianForceLevel(this->get_smeared_conf(ismr-1),force_det,ismr);
|
||||
|
||||
// Sum the contributions
|
||||
force = force + force_det;
|
||||
}
|
||||
|
||||
// remove U in UdSdU...
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
tmp_mu = adj(peekLorentz(this->get_smeared_conf(0), mu)) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
force = this->AnalyticSmearedForce(force, *this->ThinLinks,0);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
tmp_mu = peekLorentz(*this->ThinLinks, mu) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
force_det = Zero();
|
||||
|
||||
logDetJacobianForceLevel(*this->ThinLinks,force_det,0);
|
||||
|
||||
force = force + force_det;
|
||||
|
||||
force=Ta(force); // Ta
|
||||
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << "GaugeConfigurationMasked: lnDetJacobianForce took " << time << " ms" << std::endl;
|
||||
} // if smearingLevels = 0 do nothing
|
||||
}
|
||||
|
||||
private:
|
||||
//====================================================================
|
||||
// Override base class here to mask it
|
||||
virtual void fill_smearedSet(GaugeField &U)
|
||||
{
|
||||
this->ThinLinks = &U; // attach the smearing routine to the field U
|
||||
|
||||
// check the pointer is not null
|
||||
if (this->ThinLinks == NULL)
|
||||
std::cout << GridLogError << "[SmearedConfigurationMasked] Error in ThinLinks pointer\n";
|
||||
|
||||
if (this->smearingLevels > 0)
|
||||
{
|
||||
std::cout << GridLogMessage << "[SmearedConfigurationMasked] Filling SmearedSet\n";
|
||||
GaugeField previous_u(this->ThinLinks->Grid());
|
||||
|
||||
GaugeField smeared_A(this->ThinLinks->Grid());
|
||||
GaugeField smeared_B(this->ThinLinks->Grid());
|
||||
|
||||
previous_u = *this->ThinLinks;
|
||||
double start = usecond();
|
||||
for (int smearLvl = 0; smearLvl < this->smearingLevels; ++smearLvl)
|
||||
{
|
||||
this->StoutSmearing->smear(smeared_A, previous_u);
|
||||
ApplyMask(smeared_A,smearLvl);
|
||||
smeared_B = previous_u;
|
||||
ApplyMask(smeared_B,smearLvl);
|
||||
// Replace only the masked portion
|
||||
this->SmearedSet[smearLvl] = previous_u-smeared_B + smeared_A;
|
||||
previous_u = this->SmearedSet[smearLvl];
|
||||
|
||||
// For debug purposes
|
||||
RealD impl_plaq = WilsonLoops<Gimpl>::avgPlaquette(previous_u);
|
||||
std::cout << GridLogMessage << "[SmearedConfigurationMasked] smeared Plaq: " << impl_plaq << std::endl;
|
||||
}
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << "GaugeConfigurationMasked: Link smearing took " << time << " ms" << std::endl;
|
||||
}
|
||||
}
|
||||
//====================================================================
|
||||
// Override base to add masking
|
||||
virtual GaugeField AnalyticSmearedForce(const GaugeField& SigmaKPrime,
|
||||
const GaugeField& GaugeK,int level)
|
||||
{
|
||||
GridBase* grid = GaugeK.Grid();
|
||||
GaugeField C(grid), SigmaK(grid), iLambda(grid);
|
||||
GaugeField SigmaKPrimeA(grid);
|
||||
GaugeField SigmaKPrimeB(grid);
|
||||
GaugeLinkField iLambda_mu(grid);
|
||||
GaugeLinkField iQ(grid), e_iQ(grid);
|
||||
GaugeLinkField SigmaKPrime_mu(grid);
|
||||
GaugeLinkField GaugeKmu(grid), Cmu(grid);
|
||||
|
||||
this->StoutSmearing->BaseSmear(C, GaugeK);
|
||||
SigmaK = Zero();
|
||||
iLambda = Zero();
|
||||
|
||||
SigmaKPrimeA = SigmaKPrime;
|
||||
ApplyMask(SigmaKPrimeA,level);
|
||||
SigmaKPrimeB = SigmaKPrime - SigmaKPrimeA;
|
||||
|
||||
// Could get away with computing only one polarisation here
|
||||
// int mu= (smr/2) %Nd;
|
||||
// SigmaKPrimeA is nonzero only on the masked subset (a single mu and checkerboard)
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
Cmu = peekLorentz(C, mu);
|
||||
GaugeKmu = peekLorentz(GaugeK, mu);
|
||||
SigmaKPrime_mu = peekLorentz(SigmaKPrimeA, mu);
|
||||
iQ = Ta(Cmu * adj(GaugeKmu));
|
||||
this->set_iLambda(iLambda_mu, e_iQ, iQ, SigmaKPrime_mu, GaugeKmu);
|
||||
pokeLorentz(SigmaK, SigmaKPrime_mu * e_iQ + adj(Cmu) * iLambda_mu, mu);
|
||||
pokeLorentz(iLambda, iLambda_mu, mu);
|
||||
}
|
||||
this->StoutSmearing->derivative(SigmaK, iLambda,GaugeK); // derivative of SmearBase
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// propagate the rest of the force as identity map, just add back
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
SigmaK = SigmaK+SigmaKPrimeB;
|
||||
|
||||
return SigmaK;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
/* Standard constructor */
|
||||
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
|
||||
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
|
||||
{
|
||||
assert(Nsmear%(2*Nd)==0); // Or multiply by 8??
|
||||
|
||||
// was resized in base class
|
||||
assert(this->SmearedSet.size()==Nsmear);
|
||||
|
||||
GridRedBlackCartesian * UrbGrid;
|
||||
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
|
||||
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
|
||||
LatticeComplex tmp(_UGrid);
|
||||
|
||||
for (unsigned int i = 0; i < this->smearingLevels; ++i) {
|
||||
|
||||
masks.emplace_back(_UGrid); // construct the mask field in place
|
||||
|
||||
int mu= (i/2) %Nd;
|
||||
int cb= (i%2);
|
||||
LatticeComplex tmpcb(UrbGrid);
|
||||
|
||||
masks[i]=Zero();
|
||||
////////////////////
|
||||
// Setup the mask
|
||||
////////////////////
|
||||
tmp = Zero();
|
||||
pickCheckerboard(cb,tmpcb,one);
|
||||
setCheckerboard(tmp,tmpcb);
|
||||
PokeIndex<LorentzIndex>(masks[i],tmp, mu);
|
||||
|
||||
}
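// Each level i therefore carries a mask that is 1 on links with direction mu=(i/2)%Nd and
// checkerboard parity cb=i%2 and 0 elsewhere, so 2*Nd consecutive levels update every link once.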
|
||||
delete UrbGrid;
|
||||
}
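// Construction sketch (names and the rho value are illustrative, not taken from this file):
//   Smear_Stout<PeriodicGimplR> Stout(0.1);                                  // isotropic rho
//   SmearedConfigurationMasked<PeriodicGimplR> SmearedU(UGrid, 2*Nd, Stout); // Nsmear a multiple of 2*Nd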
|
||||
|
||||
virtual void smeared_force(GaugeField &SigmaTilde)
|
||||
{
|
||||
if (this->smearingLevels > 0)
|
||||
{
|
||||
double start = usecond();
|
||||
GaugeField force = SigmaTilde; // actually = U*SigmaTilde
|
||||
GaugeLinkField tmp_mu(SigmaTilde.Grid());
|
||||
|
||||
// Remove U from UdSdU
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
// to get just SigmaTilde
|
||||
tmp_mu = adj(peekLorentz(this->SmearedSet[this->smearingLevels - 1], mu)) * peekLorentz(force, mu);
|
||||
pokeLorentz(force, tmp_mu, mu);
|
||||
}
|
||||
|
||||
for (int ismr = this->smearingLevels - 1; ismr > 0; --ismr) {
|
||||
force = this->AnalyticSmearedForce(force, this->get_smeared_conf(ismr - 1),ismr);
|
||||
}
|
||||
|
||||
force = this->AnalyticSmearedForce(force, *this->ThinLinks,0);
|
||||
|
||||
// Add U to UdSdU
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
{
|
||||
tmp_mu = peekLorentz(*this->ThinLinks, mu) * peekLorentz(force, mu);
|
||||
pokeLorentz(SigmaTilde, tmp_mu, mu);
|
||||
}
|
||||
|
||||
|
||||
double end = usecond();
|
||||
double time = (end - start)/ 1e3;
|
||||
std::cout << GridLogMessage << " GaugeConfigurationMasked: Smeared Force chain rule took " << time << " ms" << std::endl;
|
||||
|
||||
} // if smearingLevels = 0 do nothing
|
||||
SigmaTilde=Gimpl::projectForce(SigmaTilde); // Ta
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
Grid/qcd/smearing/JacobianAction.h (new file, 87 lines)
@@ -0,0 +1,87 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/gauge/JacobianAction.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Jacobian Action ..
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
template <class Gimpl>
|
||||
class JacobianAction : public Action<typename Gimpl::GaugeField> {
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
SmearedConfigurationMasked<Gimpl> * smearer;
|
||||
/////////////////////////// constructors
|
||||
explicit JacobianAction(SmearedConfigurationMasked<Gimpl> * _smearer ) { smearer=_smearer;};
|
||||
|
||||
virtual std::string action_name() {return "JacobianAction";}
|
||||
|
||||
virtual std::string LogParameters(){
|
||||
std::stringstream sstream;
|
||||
sstream << GridLogMessage << "[JacobianAction] " << std::endl;
|
||||
return sstream.str();
|
||||
}
|
||||
|
||||
//////////////////////////////////
|
||||
// Usual cases are not used
|
||||
//////////////////////////////////
|
||||
virtual void refresh(const GaugeField &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG){ assert(0);};
|
||||
virtual RealD S(const GaugeField &U) { assert(0); }
|
||||
virtual void deriv(const GaugeField &U, GaugeField &dSdU) { assert(0); }
|
||||
|
||||
//////////////////////////////////
|
||||
// Functions of smart configs only
|
||||
//////////////////////////////////
|
||||
virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
|
||||
{
|
||||
return;
|
||||
}
|
||||
virtual RealD S(ConfigurationBase<GaugeField>& U)
|
||||
{
|
||||
// weight is det M = e^{ -( -log det M ) }, i.e. the action is S = -log det M
|
||||
assert( &U == smearer );
|
||||
return -smearer->logDetJacobian();
|
||||
}
|
||||
virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
|
||||
{
|
||||
return S(U);
|
||||
}
|
||||
virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
|
||||
{
|
||||
assert( &U == smearer );
|
||||
smearer->logDetJacobianForce(dSdU);
|
||||
}
|
||||
|
||||
private:
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
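// Usage sketch (hypothetical names; assumes the standard Grid HMC ActionLevel interface):
//   JacobianAction<PeriodicGimplR> Jacobian(&SmearedU); // SmearedU : SmearedConfigurationMasked<PeriodicGimplR>
//   ActionLevel<LatticeGaugeField> Level1(1);
//   Level1.push_back(&Jacobian);                        // contributes S = -log det J and its force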
|
||||
|
@ -40,7 +40,9 @@ template <class Gimpl>
|
||||
class Smear_Stout : public Smear<Gimpl> {
|
||||
private:
|
||||
int OrthogDim = -1;
|
||||
public:
|
||||
const std::vector<double> SmearRho;
|
||||
private:
|
||||
// Smear<Gimpl>* ownership semantics:
|
||||
// Smear<Gimpl>* passed in to constructor are owned by caller, so we don't delete them here
|
||||
// Smear<Gimpl>* created within constructor need to be deleted as part of the destructor
|
||||
|
@ -37,13 +37,14 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Make these members of an Impl class for BC's.
|
||||
|
||||
namespace PeriodicBC {
|
||||
|
||||
//Out(x) = Link(x)*field(x+mu)
|
||||
template<class covariant,class gauge> Lattice<covariant> CovShiftForward(const Lattice<gauge> &Link,
|
||||
int mu,
|
||||
const Lattice<covariant> &field)
|
||||
{
|
||||
return Link*Cshift(field,mu,1);// moves towards negative mu
|
||||
}
|
||||
//Out(x) = Link^dag(x-mu)*field(x-mu)
|
||||
template<class covariant,class gauge> Lattice<covariant> CovShiftBackward(const Lattice<gauge> &Link,
|
||||
int mu,
|
||||
const Lattice<covariant> &field)
|
||||
@ -52,19 +53,19 @@ namespace PeriodicBC {
|
||||
tmp = adj(Link)*field;
|
||||
return Cshift(tmp,mu,-1);// moves towards positive mu
|
||||
}
|
||||
|
||||
//Out(x) = Link^dag(x-mu)
|
||||
template<class gauge> Lattice<gauge>
|
||||
CovShiftIdentityBackward(const Lattice<gauge> &Link, int mu)
|
||||
{
|
||||
return Cshift(adj(Link), mu, -1);
|
||||
}
|
||||
|
||||
//Out(x) = Link(x)
|
||||
template<class gauge> Lattice<gauge>
|
||||
CovShiftIdentityForward(const Lattice<gauge> &Link, int mu)
|
||||
{
|
||||
return Link;
|
||||
}
|
||||
|
||||
//Link(x) = Link(x+mu)
|
||||
template<class gauge> Lattice<gauge>
|
||||
ShiftStaple(const Lattice<gauge> &Link, int mu)
|
||||
{
|
||||
|
@ -34,6 +34,61 @@ directory
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<int N, class Vec>
|
||||
Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
|
||||
{
|
||||
GridBase *grid=Umu.Grid();
|
||||
auto lvol = grid->lSites();
|
||||
Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
|
||||
typedef typename Vec::scalar_type scalar;
|
||||
autoView(Umu_v,Umu,CpuRead);
|
||||
autoView(ret_v,ret,CpuWrite);
|
||||
thread_for(site,lvol,{
|
||||
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
|
||||
Coordinate lcoor;
|
||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||
iScalar<iScalar<iMatrix<scalar, N> > > Us;
|
||||
peekLocalSite(Us, Umu_v, lcoor);
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
scalar tmp= Us()()(i,j);
|
||||
ComplexD ztmp(real(tmp),imag(tmp));
|
||||
EigenU(i,j)=ztmp;
|
||||
}}
|
||||
ComplexD detD = EigenU.determinant();
|
||||
typename Vec::scalar_type det(detD.real(),detD.imag());
|
||||
pokeLocalSite(det,ret_v,lcoor);
|
||||
});
|
||||
return ret;
|
||||
}
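// Site-by-site NxN colour determinant evaluated on the host with Eigen (thread_for over local
// sites); the result is a lattice scalar, used below by ProjectSUn to fix the overall phase.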
|
||||
|
||||
template<int N, class Vec>
|
||||
static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
|
||||
{
|
||||
Umu = ProjectOnGroup(Umu);
|
||||
auto det = Determinant(Umu);
|
||||
|
||||
det = conjugate(det);
|
||||
|
||||
for(int i=0;i<N;i++){
|
||||
auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
|
||||
element = element * det;
|
||||
PokeIndex<ColourIndex>(Umu,element,Nc-1,i);
|
||||
}
|
||||
}
|
||||
template<int N,class Vec>
|
||||
static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<Vec, N> >,Nd> > &U)
|
||||
{
|
||||
GridBase *grid=U.Grid();
|
||||
// Reunitarise
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
auto Umu = PeekIndex<LorentzIndex>(U,mu);
|
||||
Umu = ProjectOnGroup(Umu);
|
||||
ProjectSUn(Umu);
|
||||
PokeIndex<LorentzIndex>(U,Umu,mu);
|
||||
}
|
||||
}
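// Reunitarisation is thus two steps per link: ProjectOnGroup maps the matrix back into U(N), then
// the last row is rescaled by conj(det U) so that det U = 1 and the link lands in SU(N).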
|
||||
|
||||
template <int ncolour>
|
||||
class SU {
|
||||
public:
|
||||
@ -741,8 +796,14 @@ public:
|
||||
typedef Lattice<vMatrixType> LatticeMatrixType;
|
||||
|
||||
LatticeMatrixType Umu(out.Grid());
|
||||
LatticeMatrixType tmp(out.Grid());
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
LieRandomize(pRNG, Umu, 1.0);
|
||||
// LieRandomize(pRNG, Umu, 1.0);
|
||||
// PokeIndex<LorentzIndex>(out, Umu, mu);
|
||||
gaussian(pRNG,Umu);
|
||||
tmp = Ta(Umu);
|
||||
taExp(tmp,Umu);
|
||||
ProjectSUn(Umu);
|
||||
PokeIndex<LorentzIndex>(out, Umu, mu);
|
||||
}
|
||||
}
|
||||
@ -799,12 +860,12 @@ public:
|
||||
};
|
||||
|
||||
template<int N>
|
||||
LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
|
||||
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
|
||||
{
|
||||
GridBase *grid=Umu.Grid();
|
||||
auto lvol = grid->lSites();
|
||||
LatticeComplexD ret(grid);
|
||||
|
||||
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
|
||||
|
||||
autoView(Umu_v,Umu,CpuRead);
|
||||
autoView(ret_v,ret,CpuWrite);
|
||||
thread_for(site,lvol,{
|
||||
@ -812,42 +873,21 @@ LatticeComplexD Determinant(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N>
|
||||
Coordinate lcoor;
|
||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
||||
iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
|
||||
iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
|
||||
peekLocalSite(Us, Umu_v, lcoor);
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
EigenU(i,j) = Us()()(i,j);
|
||||
}}
|
||||
ComplexD det = EigenU.determinant();
|
||||
pokeLocalSite(det,ret_v,lcoor);
|
||||
Eigen::MatrixXcd EigenUinv = EigenU.inverse();
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
Ui()()(i,j) = EigenUinv(i,j);
|
||||
}}
|
||||
pokeLocalSite(Ui,ret_v,lcoor);
|
||||
});
|
||||
return ret;
|
||||
}
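// Site-by-site NxN inverse via Eigen on the host; in this patch it is used to invert the 8x8
// adjoint matrix M' (MpAdInv = Inverse(MpAd)) in the masked log-det Jacobian force.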
|
||||
template<int N>
|
||||
static void ProjectSUn(Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
|
||||
{
|
||||
Umu = ProjectOnGroup(Umu);
|
||||
auto det = Determinant(Umu);
|
||||
|
||||
det = conjugate(det);
|
||||
|
||||
for(int i=0;i<N;i++){
|
||||
auto element = PeekIndex<ColourIndex>(Umu,N-1,i);
|
||||
element = element * det;
|
||||
PokeIndex<ColourIndex>(Umu,element,Nc-1,i);
|
||||
}
|
||||
}
|
||||
template<int N>
|
||||
static void ProjectSUn(Lattice<iVector<iScalar<iMatrix<vComplexD, N> >,Nd> > &U)
|
||||
{
|
||||
GridBase *grid=U.Grid();
|
||||
// Reunitarise
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
auto Umu = PeekIndex<LorentzIndex>(U,mu);
|
||||
Umu = ProjectOnGroup(Umu);
|
||||
ProjectSUn(Umu);
|
||||
PokeIndex<LorentzIndex>(U,Umu,mu);
|
||||
}
|
||||
}
|
||||
// Explicit specialisation for SU(3).
|
||||
// Explicit specialisation for SU(3).
|
||||
static void
|
||||
|
@ -51,6 +51,7 @@ public:
|
||||
typedef Lattice<iVector<iScalar<iMatrix<vComplexF, Dimension> >, Nd> > LatticeAdjFieldF;
|
||||
typedef Lattice<iVector<iScalar<iMatrix<vComplexD, Dimension> >, Nd> > LatticeAdjFieldD;
|
||||
|
||||
typedef Lattice<iScalar<iScalar<iVector<vComplex, Dimension> > > > LatticeAdjVector;
|
||||
|
||||
template <class cplx>
|
||||
static void generator(int Index, iSUnAdjointMatrix<cplx> &iAdjTa) {
|
||||
|
@ -290,7 +290,7 @@ public:
|
||||
}
|
||||
*/
|
||||
//////////////////////////////////////////////////
|
||||
// the sum over all staples on each site
|
||||
// the sum over all nu-oriented staples for nu != mu on each site
|
||||
//////////////////////////////////////////////////
|
||||
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
|
||||
@ -300,6 +300,10 @@ public:
|
||||
for (int d = 0; d < Nd; d++) {
|
||||
U[d] = PeekIndex<LorentzIndex>(Umu, d);
|
||||
}
|
||||
Staple(staple, U, mu);
|
||||
}
|
||||
|
||||
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
|
||||
staple = Zero();
|
||||
|
||||
for (int nu = 0; nu < Nd; nu++) {
|
||||
@ -335,6 +339,202 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
/////////////
|
||||
//Staples for each direction mu, summed over nu != mu
|
||||
//staple: output staples for each mu (Nd)
|
||||
//U: link array (Nd)
|
||||
/////////////
|
||||
static void StapleAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U) {
|
||||
assert(staple.size() == Nd); assert(U.size() == Nd);
|
||||
for(int mu=0;mu<Nd;mu++) Staple(staple[mu], U, mu);
|
||||
}
|
||||
|
||||
|
||||
//A workspace class allowing reuse of the stencil
|
||||
class WilsonLoopPaddedStencilWorkspace{
|
||||
std::unique_ptr<GeneralLocalStencil> stencil;
|
||||
size_t nshift;
|
||||
|
||||
void generateStencil(GridBase* padded_grid){
|
||||
double t0 = usecond();
|
||||
|
||||
//Generate shift arrays
|
||||
std::vector<Coordinate> shifts = this->getShifts();
|
||||
nshift = shifts.size();
|
||||
|
||||
double t1 = usecond();
|
||||
//Generate local stencil
|
||||
stencil.reset(new GeneralLocalStencil(padded_grid,shifts));
|
||||
double t2 = usecond();
|
||||
std::cout << GridLogPerformance << " WilsonLoopPaddedWorkspace timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms" << std::endl;
|
||||
}
|
||||
public:
|
||||
//Get the stencil. If not already generated, or if generated using a different Grid than in PaddedCell, it will be created on-the-fly
|
||||
const GeneralLocalStencil & getStencil(const PaddedCell &pcell){
|
||||
assert(pcell.depth >= this->paddingDepth());
|
||||
if(!stencil || stencil->Grid() != (GridBase*)pcell.grids.back() ) generateStencil((GridBase*)pcell.grids.back());
|
||||
return *stencil;
|
||||
}
|
||||
size_t Nshift() const{ return nshift; }
|
||||
|
||||
virtual std::vector<Coordinate> getShifts() const = 0;
|
||||
virtual int paddingDepth() const = 0; //padding depth required
|
||||
|
||||
virtual ~WilsonLoopPaddedStencilWorkspace(){}
|
||||
};
|
||||
|
||||
//This workspace allows the sharing of a common PaddedCell object between multiple stencil workspaces
|
||||
class WilsonLoopPaddedWorkspace{
|
||||
std::vector<WilsonLoopPaddedStencilWorkspace*> stencil_wk;
|
||||
std::unique_ptr<PaddedCell> pcell;
|
||||
|
||||
void generatePcell(GridBase* unpadded_grid){
|
||||
assert(stencil_wk.size());
|
||||
int max_depth = 0;
|
||||
for(auto const &s : stencil_wk) max_depth=std::max(max_depth, s->paddingDepth());
|
||||
|
||||
pcell.reset(new PaddedCell(max_depth, dynamic_cast<GridCartesian*>(unpadded_grid)));
|
||||
}
|
||||
|
||||
public:
|
||||
//Add a stencil definition. This should be done before the first call to retrieve a stencil object.
|
||||
//Takes ownership of the pointer
|
||||
void addStencil(WilsonLoopPaddedStencilWorkspace *stencil){
|
||||
assert(!pcell);
|
||||
stencil_wk.push_back(stencil);
|
||||
}
|
||||
|
||||
const GeneralLocalStencil & getStencil(const size_t stencil_idx, GridBase* unpadded_grid){
|
||||
if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
|
||||
return stencil_wk[stencil_idx]->getStencil(*pcell);
|
||||
}
|
||||
const PaddedCell & getPaddedCell(GridBase* unpadded_grid){
|
||||
if(!pcell || pcell->unpadded_grid != unpadded_grid) generatePcell(unpadded_grid);
|
||||
return *pcell;
|
||||
}
|
||||
|
||||
~WilsonLoopPaddedWorkspace(){
|
||||
for(auto &s : stencil_wk) delete s;
|
||||
}
|
||||
};
|
||||
|
||||
//A workspace class allowing reuse of the stencil
|
||||
class StaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
|
||||
public:
|
||||
std::vector<Coordinate> getShifts() const override{
|
||||
std::vector<Coordinate> shifts;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
for(int nu=0;nu<Nd;nu++){
|
||||
if(nu != mu){
|
||||
Coordinate shift_0(Nd,0);
|
||||
Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
|
||||
Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
|
||||
Coordinate shift_mnu(Nd,0); shift_mnu[nu]=-1;
|
||||
Coordinate shift_mnu_pmu(Nd,0); shift_mnu_pmu[nu]=-1; shift_mnu_pmu[mu]=1;
|
||||
|
||||
//U_nu(x+mu)U^dag_mu(x+nu) U^dag_nu(x)
|
||||
shifts.push_back(shift_0);
|
||||
shifts.push_back(shift_nu);
|
||||
shifts.push_back(shift_mu);
|
||||
|
||||
//U_nu^dag(x-nu+mu) U_mu^dag(x-nu) U_nu(x-nu)
|
||||
shifts.push_back(shift_mnu);
|
||||
shifts.push_back(shift_mnu);
|
||||
shifts.push_back(shift_mnu_pmu);
|
||||
}
|
||||
}
|
||||
}
|
||||
return shifts;
|
||||
}
|
||||
|
||||
int paddingDepth() const override{ return 1; }
|
||||
};
|
||||
|
||||
//Padded cell implementation of the staple method for all mu, summed over nu != mu
|
||||
//staple: output staple for each mu, summed over nu != mu (Nd)
|
||||
//U_padded: the gauge link fields padded out using the PaddedCell class
|
||||
//Cell: the padded cell class
|
||||
static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
|
||||
StaplePaddedAllWorkspace wk;
|
||||
StaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell));
|
||||
}
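// Call sketch (names illustrative; assumes PaddedCell provides an Exchange method to fill the halo):
//   PaddedCell Cell(1, grid);                            // depth-1 padding suffices for 1x1 staples
//   std::vector<GaugeMat> U_pad(Nd, Cell.grids.back());
//   for(int mu=0;mu<Nd;mu++) U_pad[mu] = Cell.Exchange(PeekIndex<LorentzIndex>(Umu,mu));
//   std::vector<GaugeMat> staple(Nd, grid);
//   StaplePaddedAll(staple, U_pad, Cell);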
|
||||
|
||||
//Padded cell implementation of the staple method for all mu, summed over nu != mu
|
||||
//staple: output staple for each mu, summed over nu != mu (Nd)
|
||||
//U_padded: the gauge link fields padded out using the PaddedCell class
|
||||
//Cell: the padded cell class
|
||||
//gStencil: the precomputed generalized local stencil for the staple
|
||||
static void StaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
|
||||
double t0 = usecond();
|
||||
assert(U_padded.size() == Nd); assert(staple.size() == Nd);
|
||||
assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
|
||||
assert(Cell.depth >= 1);
|
||||
GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
|
||||
|
||||
int shift_mu_off = gStencil._npoints/Nd;
|
||||
|
||||
//Open views to padded gauge links and keep open over mu loop
|
||||
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
|
||||
size_t vsize = Nd*sizeof(GaugeViewType);
|
||||
GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
|
||||
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
|
||||
GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
|
||||
acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
|
||||
|
||||
GaugeMat gStaple(ggrid);
|
||||
|
||||
int outer_off = 0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
{ //view scope
|
||||
autoView( gStaple_v , gStaple, AcceleratorWrite);
|
||||
auto gStencil_v = gStencil.View();
|
||||
|
||||
accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
|
||||
decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
|
||||
stencil_ss = Zero();
|
||||
int off = outer_off;
|
||||
|
||||
for(int nu=0;nu<Nd;nu++){
|
||||
if(nu != mu){
|
||||
GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss);
|
||||
auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
|
||||
stencil_ss = stencil_ss + U2 * U1 * U0;
|
||||
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||
e = gStencil_v.GetEntry(off++,ss);
|
||||
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||
|
||||
stencil_ss = stencil_ss + U2 * U1 * U0;
|
||||
}
|
||||
}
|
||||
|
||||
coalescedWrite(gStaple_v[ss],stencil_ss);
|
||||
}
|
||||
);
|
||||
} //ensure views are all closed!
|
||||
|
||||
staple[mu] = Cell.Extract(gStaple);
|
||||
outer_off += shift_mu_off;
|
||||
}//mu loop
|
||||
|
||||
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
|
||||
free(Ug_dirs_v_host);
|
||||
acceleratorFreeDevice(Ug_dirs_v);
|
||||
|
||||
double t1=usecond();
|
||||
|
||||
std::cout << GridLogPerformance << "StaplePaddedAll timing:" << (t1-t0)/1000 << "ms" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// the sum over all staples on each site in direction mu,nu, upper part
|
||||
//////////////////////////////////////////////////
|
||||
@ -707,18 +907,14 @@ public:
|
||||
// the sum over all staples on each site
|
||||
//////////////////////////////////////////////////
|
||||
static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
|
||||
U2 = U * Cshift(U, mu, 1);
|
||||
U2 = U * Gimpl::CshiftLink(U, mu, 1);
|
||||
}
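// Gimpl::CshiftLink applies the gauge-implementation boundary condition to the shifted link, so
// the two-hop product U2 remains valid for non-periodic implementations (e.g. G-parity) as well.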
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Hop by two optimisation strategy does not work nicely with Gparity. (could
|
||||
// do,
|
||||
// but need to track two deep where cross boundary and apply a conjugation).
|
||||
// Must differentiate this in Gimpl, and use Gimpl::isPeriodicGaugeField to do
|
||||
// so .
|
||||
// Hop by two optimisation strategy. Use RectStapleDouble to obtain 'U2'
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
static void RectStapleOptimised(GaugeMat &Stap, std::vector<GaugeMat> &U2,
|
||||
std::vector<GaugeMat> &U, int mu) {
|
||||
static void RectStapleOptimised(GaugeMat &Stap, const std::vector<GaugeMat> &U2,
|
||||
const std::vector<GaugeMat> &U, int mu) {
|
||||
|
||||
Stap = Zero();
|
||||
|
||||
@ -732,9 +928,9 @@ public:
|
||||
|
||||
// Up staple ___ ___
|
||||
// | |
|
||||
tmp = Cshift(adj(U[nu]), nu, -1);
|
||||
tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
|
||||
tmp = adj(U2[mu]) * tmp;
|
||||
tmp = Cshift(tmp, mu, -2);
|
||||
tmp = Gimpl::CshiftLink(tmp, mu, -2);
|
||||
|
||||
Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);
|
||||
|
||||
@ -742,14 +938,14 @@ public:
|
||||
// |___ ___|
|
||||
//
|
||||
tmp = adj(U2[mu]) * U[nu];
|
||||
Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Cshift(tmp, mu, -2));
|
||||
Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));
|
||||
|
||||
// ___ ___
|
||||
// | ___|
|
||||
// |___ ___|
|
||||
//
|
||||
|
||||
Stap += Cshift(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
|
||||
Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
|
||||
|
||||
// ___ ___
|
||||
// |___ |
|
||||
@ -758,7 +954,7 @@ public:
|
||||
|
||||
// tmp= Staple2x1* Cshift(U[mu],mu,-2);
|
||||
// Stap+= Cshift(tmp,mu,1) ;
|
||||
Stap += Cshift(Staple2x1, mu, 1) * Cshift(U[mu], mu, -1);
|
||||
Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
|
||||
;
|
||||
|
||||
// --
|
||||
@ -766,10 +962,10 @@ public:
|
||||
//
|
||||
// | |
|
||||
|
||||
tmp = Cshift(adj(U2[nu]), nu, -2);
|
||||
tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
|
||||
tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
|
||||
tmp = U2[nu] * Cshift(tmp, nu, 2);
|
||||
Stap += Cshift(tmp, mu, 1);
|
||||
tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
|
||||
Stap += Gimpl::CshiftLink(tmp, mu, 1);
|
||||
|
||||
// | |
|
||||
//
|
||||
@ -778,25 +974,12 @@ public:
|
||||
|
||||
tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
|
||||
tmp = adj(U2[nu]) * tmp;
|
||||
tmp = Cshift(tmp, nu, -2);
|
||||
Stap += Cshift(tmp, mu, 1);
|
||||
tmp = Gimpl::CshiftLink(tmp, nu, -2);
|
||||
Stap += Gimpl::CshiftLink(tmp, mu, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
|
||||
RectStapleUnoptimised(Stap, Umu, mu);
|
||||
}
|
||||
static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
|
||||
std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
|
||||
int mu) {
|
||||
if (Gimpl::isPeriodicGaugeField()) {
|
||||
RectStapleOptimised(Stap, U2, U, mu);
|
||||
} else {
|
||||
RectStapleUnoptimised(Stap, Umu, mu);
|
||||
}
|
||||
}
|
||||
|
||||
static void RectStapleUnoptimised(GaugeMat &Stap, const GaugeLorentz &Umu,
|
||||
int mu) {
|
||||
GridBase *grid = Umu.Grid();
|
||||
@ -895,6 +1078,288 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
static void RectStaple(GaugeMat &Stap, const GaugeLorentz &Umu, int mu) {
|
||||
RectStapleUnoptimised(Stap, Umu, mu);
|
||||
}
|
||||
static void RectStaple(const GaugeLorentz &Umu, GaugeMat &Stap,
|
||||
std::vector<GaugeMat> &U2, std::vector<GaugeMat> &U,
|
||||
int mu) {
|
||||
RectStapleOptimised(Stap, U2, U, mu);
|
||||
}
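// With RectStapleOptimised expressed through Gimpl::CshiftLink, the earlier
// isPeriodicGaugeField() fallback to the unoptimised path is no longer required here.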
|
||||
//////////////////////////////////////////////////////
|
||||
//Compute the rectangular staples for all orientations
|
||||
//Stap : Array of staples (Nd)
|
||||
//U: Gauge links in each direction (Nd)
|
||||
/////////////////////////////////////////////////////
|
||||
static void RectStapleAll(std::vector<GaugeMat> &Stap, const std::vector<GaugeMat> &U){
|
||||
assert(Stap.size() == Nd); assert(U.size() == Nd);
|
||||
std::vector<GaugeMat> U2(Nd,U[0].Grid());
|
||||
for(int mu=0;mu<Nd;mu++) RectStapleDouble(U2[mu], U[mu], mu);
|
||||
for(int mu=0;mu<Nd;mu++) RectStapleOptimised(Stap[mu], U2, U, mu);
|
||||
}
|
||||
|
||||
//A workspace class allowing reuse of the stencil
|
||||
class RectStaplePaddedAllWorkspace: public WilsonLoopPaddedStencilWorkspace{
|
||||
public:
|
||||
std::vector<Coordinate> getShifts() const override{
|
||||
std::vector<Coordinate> shifts;
|
||||
for (int mu = 0; mu < Nd; mu++){
|
||||
for (int nu = 0; nu < Nd; nu++) {
|
||||
if (nu != mu) {
|
||||
auto genShift = [&](int mushift,int nushift){
|
||||
Coordinate out(Nd,0); out[mu]=mushift; out[nu]=nushift; return out;
|
||||
};
|
||||
|
||||
//tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
|
||||
shifts.push_back(genShift(0,0));
|
||||
shifts.push_back(genShift(0,+1));
|
||||
shifts.push_back(genShift(+1,+1));
|
||||
shifts.push_back(genShift(+2,0));
|
||||
shifts.push_back(genShift(+1,0));
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
|
||||
shifts.push_back(genShift(0,-1));
|
||||
shifts.push_back(genShift(0,-1));
|
||||
shifts.push_back(genShift(+1,-1));
|
||||
shifts.push_back(genShift(+2,-1));
|
||||
shifts.push_back(genShift(+1,0));
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
|
||||
shifts.push_back(genShift(-1,0));
|
||||
shifts.push_back(genShift(-1,-1));
|
||||
shifts.push_back(genShift(-1,-1));
|
||||
shifts.push_back(genShift(0,-1));
|
||||
shifts.push_back(genShift(+1,-1));
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
|
||||
shifts.push_back(genShift(-1,0));
|
||||
shifts.push_back(genShift(-1,0));
|
||||
shifts.push_back(genShift(-1,+1));
|
||||
shifts.push_back(genShift(0,+1));
|
||||
shifts.push_back(genShift(+1,0));
|
||||
|
||||
//tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
|
||||
shifts.push_back(genShift(0,0));
|
||||
shifts.push_back(genShift(0,+1));
|
||||
shifts.push_back(genShift(0,+2));
|
||||
shifts.push_back(genShift(+1,+1));
|
||||
shifts.push_back(genShift(+1,0));
|
||||
|
||||
//tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
|
||||
shifts.push_back(genShift(0,-1));
|
||||
shifts.push_back(genShift(0,-2));
|
||||
shifts.push_back(genShift(0,-2));
|
||||
shifts.push_back(genShift(+1,-2));
|
||||
shifts.push_back(genShift(+1,-1));
|
||||
}
|
||||
}
|
||||
}
|
||||
return shifts;
|
||||
}
|
||||
|
||||
int paddingDepth() const override{ return 2; }
|
||||
};
|
||||
|
||||
//Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
|
||||
//staple: output staple for each mu, summed over nu != mu (Nd)
|
||||
//U_padded: the gauge link fields padded out using the PaddedCell class
|
||||
//Cell: the padded cell class
|
||||
static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell) {
|
||||
RectStaplePaddedAllWorkspace wk;
|
||||
RectStaplePaddedAll(staple,U_padded,Cell,wk.getStencil(Cell));
|
||||
}
|
||||
|
||||
//Padded cell implementation of the rectangular staple method for all mu, summed over nu != mu
|
||||
//staple: output staple for each mu, summed over nu != mu (Nd)
|
||||
//U_padded: the gauge link fields padded out using the PaddedCell class
|
||||
//Cell: the padded cell class
|
||||
//gStencil: the stencil
|
||||
static void RectStaplePaddedAll(std::vector<GaugeMat> &staple, const std::vector<GaugeMat> &U_padded, const PaddedCell &Cell, const GeneralLocalStencil &gStencil) {
  double t0 = usecond();
  assert(U_padded.size() == Nd); assert(staple.size() == Nd);
  assert(U_padded[0].Grid() == (GridBase*)Cell.grids.back());
  assert(Cell.depth >= 2);
  GridBase *ggrid = U_padded[0].Grid(); //padded cell grid

  size_t nshift = gStencil._npoints;
  int mu_off_delta = nshift / Nd;

  //Open views to padded gauge links and keep open over mu loop
  typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
  size_t vsize = Nd*sizeof(GaugeViewType);
  GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
  for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
  GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
  acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);

  GaugeMat gStaple(ggrid); //temp staple object on padded grid

  int offset = 0;
  for(int mu=0; mu<Nd; mu++){

    { //view scope
      autoView( gStaple_v , gStaple, AcceleratorWrite);
      auto gStencil_v = gStencil.View();

      accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
        decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
        stencil_ss = Zero();
        int s=offset;
        for(int nu=0;nu<Nd;nu++){
          if(nu != mu){
            //tmp6 = tmp5(x+mu) = U_mu(x+mu)U_nu(x+2mu)U_mu^dag(x+nu+mu) U_mu^dag(x+nu) U_nu^dag(x)
            GeneralStencilEntry const* e = gStencil_v.GetEntry(s++,ss);
            auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            auto U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            auto U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
            e = gStencil_v.GetEntry(s++,ss);
            auto U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);

            stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

            //tmp5 = tmp4(x+mu) = U_mu(x+mu)U^dag_nu(x-nu+2mu)U^dag_mu(x-nu+mu)U^dag_mu(x-nu)U_nu(x-nu)
            e = gStencil_v.GetEntry(s++,ss);
            U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
            e = gStencil_v.GetEntry(s++,ss);
            U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U4 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);

            stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

            //tmp5 = tmp4(x+mu) = U^dag_nu(x-nu+mu)U^dag_mu(x-nu)U^dag_mu(x-mu-nu)U_nu(x-mu-nu)U_mu(x-mu)
            e = gStencil_v.GetEntry(s++,ss);
            U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
            e = gStencil_v.GetEntry(s++,ss);
            U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
            e = gStencil_v.GetEntry(s++,ss);
            U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));

            stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

            //tmp5 = tmp4(x+mu) = U_nu(x+mu)U_mu^dag(x+nu)U_mu^dag(x-mu+nu)U_nu^dag(x-mu)U_mu(x-mu)
            e = gStencil_v.GetEntry(s++,ss);
            U0 = coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd);
            e = gStencil_v.GetEntry(s++,ss);
            U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);

            stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

            //tmp6 = tmp5(x+mu) = U_nu(x+mu)U_nu(x+mu+nu)U_mu^dag(x+2nu)U_nu^dag(x+nu)U_nu^dag(x)
            e = gStencil_v.GetEntry(s++,ss);
            U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U3 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
            e = gStencil_v.GetEntry(s++,ss);
            U4 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);

            stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

            //tmp5 = tmp4(x+mu) = U_nu^dag(x+mu-nu)U_nu^dag(x+mu-2nu)U_mu^dag(x-2nu)U_nu(x-2nu)U_nu(x-nu)
            e = gStencil_v.GetEntry(s++,ss);
            U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
            e = gStencil_v.GetEntry(s++,ss);
            U1 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
            e = gStencil_v.GetEntry(s++,ss);
            U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U3 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
            e = gStencil_v.GetEntry(s++,ss);
            U4 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));

            stencil_ss = stencil_ss + U4*U3*U2*U1*U0;

          }
        }
        coalescedWrite(gStaple_v[ss],stencil_ss);
      }
      );
      offset += mu_off_delta;
    }//kernel/view scope

    staple[mu] = Cell.Extract(gStaple);
  }//mu loop

  for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
  free(Ug_dirs_v_host);
  acceleratorFreeDevice(Ug_dirs_v);

  double t1 = usecond();

  std::cout << GridLogPerformance << "RectStaplePaddedAll timings:" << (t1-t0)/1000 << "ms" << std::endl;
}

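//Rough bookkeeping (an aside, assuming each GetEntry call above consumes one stencil point):
//every (mu, nu!=mu) pair walks six rectangular staple orientations of five links each, so for
//Nd=4 one expects 6*5*(Nd-1) = 90 stencil points per mu, nshift = Nd*90 = 360 in total, and
//hence mu_off_delta = nshift/Nd = 90.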
//A workspace for reusing the PaddedCell and GeneralLocalStencil objects
class StapleAndRectStapleAllWorkspace: public WilsonLoopPaddedWorkspace{
public:
  StapleAndRectStapleAllWorkspace(){
    this->addStencil(new StaplePaddedAllWorkspace);
    this->addStencil(new RectStaplePaddedAllWorkspace);
  }
};

//////////////////////////////////////////////////////
//Compute the 1x1 and 1x2 staples for all orientations
//Stap : Array of staples (Nd)
//RectStap: Array of rectangular staples (Nd)
//U: Gauge links in each direction (Nd)
/////////////////////////////////////////////////////
static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U){
  StapleAndRectStapleAllWorkspace wk;
  StapleAndRectStapleAll(Stap,RectStap,U,wk);
}

//////////////////////////////////////////////////////
//Compute the 1x1 and 1x2 staples for all orientations
//Stap : Array of staples (Nd)
//RectStap: Array of rectangular staples (Nd)
//U: Gauge links in each direction (Nd)
//wk: a workspace containing stored PaddedCell and GeneralLocalStencil objects to maximize reuse
/////////////////////////////////////////////////////
static void StapleAndRectStapleAll(std::vector<GaugeMat> &Stap, std::vector<GaugeMat> &RectStap, const std::vector<GaugeMat> &U, StapleAndRectStapleAllWorkspace &wk){
#if 0
  StapleAll(Stap, U);
  RectStapleAll(RectStap, U);
#else
  double t0 = usecond();

  GridCartesian* unpadded_grid = dynamic_cast<GridCartesian*>(U[0].Grid());
  const PaddedCell &Ghost = wk.getPaddedCell(unpadded_grid);

  CshiftImplGauge<Gimpl> cshift_impl;
  std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
  for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl);
  double t1 = usecond();
  StaplePaddedAll(Stap, U_pad, Ghost, wk.getStencil(0,unpadded_grid) );
  double t2 = usecond();
  RectStaplePaddedAll(RectStap, U_pad, Ghost, wk.getStencil(1,unpadded_grid));
  double t3 = usecond();
  std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
#endif
}
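//A minimal usage sketch (not from this diff; the enclosing WilsonLoops<Gimpl> helper and all
//field/loop names are assumed).  The point is that the workspace is constructed once and kept
//alive, so the PaddedCell exchange buffers and the two GeneralLocalStencil objects are reused
//rather than rebuilt on every call:
//
//  typedef WilsonLoops<PeriodicGimplR> WL;
//  std::vector<LatticeColourMatrix> U(Nd, Umu.Grid());      // Umu: an existing LatticeGaugeField
//  for(int mu=0;mu<Nd;mu++) U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
//
//  std::vector<LatticeColourMatrix> Stap(Nd, Umu.Grid());
//  std::vector<LatticeColourMatrix> RectStap(Nd, Umu.Grid());
//
//  WL::StapleAndRectStapleAllWorkspace wk;                  // keep alive across trajectory steps
//  for(int step=0; step<nsteps; step++){                    // nsteps: hypothetical outer loop
//    WL::StapleAndRectStapleAll(Stap, RectStap, U, wk);     // fills 1x1 and 1x2 staples for all mu
//  }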

//////////////////////////////////////////////////
// Wilson loop of size (R1, R2), oriented in mu,nu plane
//////////////////////////////////////////////////
@ -320,7 +320,7 @@ struct Conj{
|
||||
|
||||
struct TimesMinusI{
|
||||
//Complex single
|
||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||
inline float32x4_t operator()(float32x4_t in){
|
||||
// ar ai br bi -> ai -ar ai -br
|
||||
float32x4_t r0, r1;
|
||||
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||
@ -328,7 +328,7 @@ struct TimesMinusI{
|
||||
return vtrn1q_f32(r1, r0); // ar -ai br -bi
|
||||
}
|
||||
//Complex double
|
||||
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||
inline float64x2_t operator()(float64x2_t in){
|
||||
// a ib -> b -ia
|
||||
float64x2_t tmp;
|
||||
tmp = vnegq_f64(in);
|
||||
@ -338,7 +338,7 @@ struct TimesMinusI{
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
|
||||
inline float32x4_t operator()(float32x4_t in){
|
||||
// ar ai br bi -> -ai ar -bi br
|
||||
float32x4_t r0, r1;
|
||||
r0 = vnegq_f32(in); // -ar -ai -br -bi
|
||||
@ -346,7 +346,7 @@ struct TimesI{
|
||||
return vtrn1q_f32(r1, in); // -ai ar -bi br
|
||||
}
|
||||
//Complex double
|
||||
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
|
||||
inline float64x2_t operator()(float64x2_t in){
|
||||
// a ib -> -b ia
|
||||
float64x2_t tmp;
|
||||
tmp = vnegq_f64(in);
|
||||
|
@ -79,60 +79,60 @@ public:
|
||||
this->_entries.resize(npoints* osites);
|
||||
this->_entries_p = &_entries[0];
|
||||
|
||||
thread_for(site, osites, {
|
||||
Coordinate Coor;
|
||||
Coordinate NbrCoor;
|
||||
|
||||
Coordinate Coor;
|
||||
Coordinate NbrCoor;
|
||||
for(Integer site=0;site<osites;site++){
|
||||
for(Integer ii=0;ii<npoints;ii++){
|
||||
Integer lex = site*npoints+ii;
|
||||
GeneralStencilEntry SE;
|
||||
////////////////////////////////////////////////
|
||||
// Outer index of neighbour Offset calculation
|
||||
////////////////////////////////////////////////
|
||||
grid->oCoorFromOindex(Coor,site);
|
||||
for(int d=0;d<Coor.size();d++){
|
||||
int rd = grid->_rdimensions[d];
|
||||
NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
|
||||
for(Integer ii=0;ii<npoints;ii++){
|
||||
Integer lex = site*npoints+ii;
|
||||
GeneralStencilEntry SE;
|
||||
////////////////////////////////////////////////
|
||||
// Outer index of neighbour Offset calculation
|
||||
////////////////////////////////////////////////
|
||||
grid->oCoorFromOindex(Coor,site);
|
||||
for(int d=0;d<Coor.size();d++){
|
||||
int rd = grid->_rdimensions[d];
|
||||
NbrCoor[d] = (Coor[d] + shifts[ii][d] + rd )%rd;
|
||||
}
|
||||
SE._offset = grid->oIndexReduced(NbrCoor);
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Inner index permute calculation
|
||||
// Simpler version using icoor calculation
|
||||
////////////////////////////////////////////////
|
||||
SE._permute =0;
|
||||
for(int d=0;d<Coor.size();d++){
|
||||
|
||||
int fd = grid->_fdimensions[d];
|
||||
int rd = grid->_rdimensions[d];
|
||||
int ly = grid->_simd_layout[d];
|
||||
|
||||
assert((ly==1)||(ly==2));
|
||||
|
||||
int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1
|
||||
int x = Coor[d]; // x in [0... rd-1] as an oSite
|
||||
|
||||
int permute_dim = grid->PermuteDim(d);
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int num = shift%rd; // Slice within dest osite cell of slice zero
|
||||
int wrap = shift/rd; // Number of osite local volume cells crossed through
|
||||
// x+num < rd dictates whether we are in same permute state as slice 0
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice=(wrap+1)%ly;
|
||||
}
|
||||
if ( permute_slice ) {
|
||||
int ptype =grid->PermuteType(d);
|
||||
uint8_t mask =0x1<<ptype;
|
||||
SE._permute |= mask;
|
||||
}
|
||||
}
|
||||
////////////////////////////////////////////////
|
||||
// Store in look up table
|
||||
////////////////////////////////////////////////
|
||||
this->_entries[lex] = SE;
|
||||
}
|
||||
SE._offset = grid->oIndexReduced(NbrCoor);
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Inner index permute calculation
|
||||
// Simpler version using icoor calculation
|
||||
////////////////////////////////////////////////
|
||||
SE._permute =0;
|
||||
for(int d=0;d<Coor.size();d++){
|
||||
|
||||
int fd = grid->_fdimensions[d];
|
||||
int rd = grid->_rdimensions[d];
|
||||
int ly = grid->_simd_layout[d];
|
||||
|
||||
assert((ly==1)||(ly==2));
|
||||
|
||||
int shift = (shifts[ii][d]+fd)%fd; // make it strictly positive 0.. L-1
|
||||
int x = Coor[d]; // x in [0... rd-1] as an oSite
|
||||
|
||||
int permute_dim = grid->PermuteDim(d);
|
||||
int permute_slice=0;
|
||||
if(permute_dim){
|
||||
int num = shift%rd; // Slice within dest osite cell of slice zero
|
||||
int wrap = shift/rd; // Number of osite local volume cells crossed through
|
||||
// x+num < rd dictates whether we are in same permute state as slice 0
|
||||
if ( x< rd-num ) permute_slice=wrap;
|
||||
else permute_slice=(wrap+1)%ly;
|
||||
}
|
||||
if ( permute_slice ) {
|
||||
int ptype =grid->PermuteType(d);
|
||||
uint8_t mask =grid->Nsimd() >> (ptype + 1);
|
||||
SE._permute |= mask;
|
||||
}
|
||||
}
|
||||
////////////////////////////////////////////////
|
||||
// Store in look up table
|
||||
////////////////////////////////////////////////
|
||||
this->_entries[lex] = SE;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
};
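// Worked example (illustrative only, not part of the diff): take a dimension of global extent
// fd=8 laid out as rd=4 outer sites times ly=2 SIMD lanes, and a shift of +1 site.  Then
//   num  = shift%rd = 1;   wrap = shift/rd = 0;
//   x = 0,1,2 :  x < rd-num        -> permute_slice = wrap        = 0  (neighbour in the same lane)
//   x = 3     :  otherwise         -> permute_slice = (wrap+1)%ly = 1  (neighbour sits in the other
//                                                                       lane, so SE._permute is set)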
|
||||
|
@ -32,6 +32,7 @@
|
||||
|
||||
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
|
||||
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
||||
#include <Grid/stencil/GeneralLocalStencil.h>
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Must not lose sight that goal is to be able to construct really efficient
|
||||
@ -339,8 +340,8 @@ public:
|
||||
// Vectors that live on the symmetric heap in case of SHMEM
|
||||
// These are used; either SHM objects or refs to the above symmetric heap vectors
|
||||
// depending on comms target
|
||||
Vector<cobj *> u_simd_send_buf;
|
||||
Vector<cobj *> u_simd_recv_buf;
|
||||
std::vector<cobj *> u_simd_send_buf;
|
||||
std::vector<cobj *> u_simd_recv_buf;
|
||||
|
||||
int u_comm_offset;
|
||||
int _unified_buffer_size;
|
||||
@ -348,7 +349,7 @@ public:
|
||||
////////////////////////////////////////
|
||||
// Stencil query
|
||||
////////////////////////////////////////
|
||||
#ifdef SHM_FAST_PATH
|
||||
#if 1
|
||||
inline int SameNode(int point) {
|
||||
|
||||
int dimension = this->_directions[point];
|
||||
@ -434,7 +435,6 @@ public:
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
accelerator_barrier();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
@ -443,7 +443,6 @@ public:
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
}
|
||||
_grid->StencilBarrier();// Synch shared memory within a single node
|
||||
}
|
||||
|
||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
@ -452,6 +451,8 @@ public:
|
||||
if ( this->partialDirichlet ) DslashLogPartial();
|
||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
else DslashLogFull();
|
||||
acceleratorCopySynchronise();
|
||||
_grid->StencilBarrier();
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Blocking send and receive. Either sequential or parallel.
|
||||
@ -529,7 +530,6 @@ public:
|
||||
{
|
||||
_grid->StencilBarrier();// Synch shared memory within a single node
|
||||
|
||||
// conformable(source.Grid(),_grid);
|
||||
assert(source.Grid()==_grid);
|
||||
|
||||
u_comm_offset=0;
|
||||
@ -540,6 +540,7 @@ public:
|
||||
compress.Point(point);
|
||||
HaloGatherDir(source,compress,point,face_idx);
|
||||
}
|
||||
accelerator_barrier();
|
||||
face_table_computed=1;
|
||||
assert(u_comm_offset==_unified_buffer_size);
|
||||
|
||||
@ -655,8 +656,8 @@ public:
|
||||
CommsMerge(decompress,Mergers,Decompressions);
|
||||
}
|
||||
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
|
||||
_grid->StencilBarrier();// Synch shared memory within a single node
|
||||
CommsMerge(decompress,MergersSHM,DecompressionsSHM);
|
||||
assert(MergersSHM.size()==0);
|
||||
assert(DecompressionsSHM.size()==0);
|
||||
}
|
||||
|
||||
template<class decompressor>
|
||||
@ -705,6 +706,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << GridLogDebug << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||
}
|
||||
/// Introduce a block structure and switch off comms on boundaries
|
||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||
@ -1366,10 +1368,11 @@ public:
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
int shm_send=0;
|
||||
int shm_recv=0;
|
||||
|
||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
#ifdef SHM_FAST_PATH
|
||||
#warning STENCIL SHM FAST PATH SELECTED
|
||||
int shm_recv=0;
|
||||
// shm == receive pointer if offnode
|
||||
// shm == Translate[send pointer] if on node -- my view of his send pointer
|
||||
cobj *shm = (cobj *) _grid->ShmBufferTranslate(recv_from_rank,sp);
|
||||
@ -1402,7 +1405,6 @@ public:
|
||||
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
|
||||
}
|
||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
|
||||
AddPacket((void *)sp,(void *)rp,
|
||||
xmit_to_rank,do_send,
|
||||
recv_from_rank,do_send,
|
||||
|
@ -73,6 +73,16 @@ vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int
|
||||
return vec;
|
||||
}
|
||||
}
|
||||
//'perm_mask' acts as a bitmask
|
||||
template<class vobj> accelerator_inline
|
||||
vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
|
||||
{
|
||||
auto obj = vec, tmp = vec;
|
||||
for (int d=0;d<nd;d++)
|
||||
if (perm_mask & (0x1 << d)) { permute(obj,tmp,d); tmp=obj;}
|
||||
return obj;
|
||||
}
|
||||
|
||||
template<class vobj> accelerator_inline
|
||||
void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
|
||||
{
|
||||
@ -83,7 +93,7 @@ void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__
|
||||
{
|
||||
vstream(vec, extracted);
|
||||
}
|
||||
#else
|
||||
#else //==GRID_SIMT
|
||||
|
||||
|
||||
//#ifndef GRID_SYCL
|
||||
@ -166,6 +176,14 @@ typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,
|
||||
return extractLane(plane,vec);
|
||||
}
|
||||
template<class vobj> accelerator_inline
|
||||
typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd()))
|
||||
{
|
||||
int plane = lane;
|
||||
for (int d=0;d<nd;d++)
|
||||
plane = (perm_mask & (0x1 << d)) ? plane ^ (vobj::Nsimd() >> (d + 1)) : plane;
|
||||
return extractLane(plane,vec);
|
||||
}
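// Standalone illustration (assumed, not Grid code) of how the SIMT read above remaps the SIMD
// lane when a permute is required: with Nsimd = 8, a permute in dimension d flips lane bit
// (Nsimd >> (d+1)), i.e. d=0 -> ^4, d=1 -> ^2, d=2 -> ^1.
#include <cstdio>
int main(void) {
  const int Nsimd = 8;
  int perm_mask = 0x1 | 0x4;                     // hypothetical: permutes needed in dimensions 0 and 2
  for (int lane = 0; lane < Nsimd; lane++) {
    int plane = lane;
    for (int d = 0; d < 3; d++)
      plane = (perm_mask & (0x1 << d)) ? plane ^ (Nsimd >> (d + 1)) : plane;  // same update as above
    printf("lane %d reads from lane %d\n", lane, plane);
  }
  return 0;
}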
|
||||
template<class vobj> accelerator_inline
|
||||
void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
|
||||
{
|
||||
insertLane(lane,vec,extracted);
|
||||
|
@ -90,10 +90,12 @@ template<class vtype,int N> accelerator_inline iVector<vtype,N> ProjectOnGroup(c
|
||||
template<class vtype,int N, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0 >::type * =nullptr>
|
||||
accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
|
||||
{
|
||||
typedef typename iMatrix<vtype,N>::scalar_type scalar;
|
||||
// need a check for the group type?
|
||||
iMatrix<vtype,N> ret(arg);
|
||||
vtype nrm;
|
||||
vtype inner;
|
||||
scalar one(1.0);
|
||||
for(int c1=0;c1<N;c1++){
|
||||
|
||||
// Normalises row c1
|
||||
@ -102,7 +104,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
|
||||
inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
|
||||
|
||||
nrm = sqrt(inner);
|
||||
nrm = 1.0/nrm;
|
||||
nrm = one/nrm;
|
||||
for(int c2=0;c2<N;c2++)
|
||||
ret._internal[c1][c2]*= nrm;
|
||||
|
||||
@ -127,7 +129,7 @@ accelerator_inline iMatrix<vtype,N> ProjectOnGroup(const iMatrix<vtype,N> &arg)
|
||||
inner += innerProduct(ret._internal[c1][c2],ret._internal[c1][c2]);
|
||||
|
||||
nrm = sqrt(inner);
|
||||
nrm = 1.0/nrm;
|
||||
nrm = one/nrm;
|
||||
for(int c2=0;c2<N;c2++)
|
||||
ret._internal[c1][c2]*= nrm;
|
||||
}
|
||||
|
@ -55,7 +55,7 @@ template<class vtype, int N> accelerator_inline iVector<vtype, N> Exponentiate(c
|
||||
|
||||
|
||||
// Specialisation: Cayley-Hamilton exponential for SU(3)
|
||||
#ifndef GRID_ACCELERATED
|
||||
#if 0
|
||||
template<class vtype, typename std::enable_if< GridTypeMapper<vtype>::TensorLevel == 0>::type * =nullptr>
|
||||
accelerator_inline iMatrix<vtype,3> Exponentiate(const iMatrix<vtype,3> &arg, RealD alpha , Integer Nexp = DEFAULT_MAT_EXP )
|
||||
{
|
||||
|
@ -133,7 +133,6 @@ typename vobj::scalar_object extractLane(int lane, const vobj & __restrict__ vec
|
||||
typedef scalar_type * pointer;
|
||||
|
||||
constexpr int words=sizeof(vobj)/sizeof(vector_type);
|
||||
constexpr int Nsimd=vector_type::Nsimd();
|
||||
|
||||
scalar_object extracted;
|
||||
pointer __restrict__ sp = (pointer)&extracted; // Type pun
|
||||
@ -153,7 +152,6 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob
|
||||
typedef scalar_type * pointer;
|
||||
|
||||
constexpr int words=sizeof(vobj)/sizeof(vector_type);
|
||||
constexpr int Nsimd=vector_type::Nsimd();
|
||||
|
||||
pointer __restrict__ sp = (pointer)&extracted;
|
||||
vector_type *vp = (vector_type *)&vec;
|
||||
@ -178,8 +176,6 @@ void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int off
|
||||
const int s = Nsimd/Nextr;
|
||||
|
||||
vector_type * vp = (vector_type *)&vec;
|
||||
scalar_type vtmp;
|
||||
sobj_scalar_type stmp;
|
||||
for(int w=0;w<words;w++){
|
||||
for(int i=0;i<Nextr;i++){
|
||||
sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
|
||||
@ -205,7 +201,6 @@ void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
|
||||
|
||||
vector_type * vp = (vector_type *)&vec;
|
||||
scalar_type vtmp;
|
||||
sobj_scalar_type stmp;
|
||||
for(int w=0;w<words;w++){
|
||||
for(int i=0;i<Nextr;i++){
|
||||
sobj_scalar_type * pointer = (sobj_scalar_type *)& extracted[i][offset];
|
||||
@ -242,9 +237,6 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
|
||||
typedef oextract_type * opointer;
|
||||
typedef iextract_type * ipointer;
|
||||
|
||||
constexpr int oNsimd=ovector_type::Nsimd();
|
||||
constexpr int iNsimd=ivector_type::Nsimd();
|
||||
|
||||
iscalar_type itmp;
|
||||
oscalar_type otmp;
|
||||
|
||||
|
@ -458,7 +458,8 @@ inline void acceleratorCopySynchronise(void) { hipStreamSynchronize(copyStream);
|
||||
// Common on all GPU targets
|
||||
//////////////////////////////////////////////
|
||||
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
|
||||
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
|
||||
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
|
||||
|
||||
#define accelerator_for( iter, num, nsimd, ... ) \
|
||||
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
|
||||
@ -525,7 +526,7 @@ inline void acceleratorFreeCpu (void *ptr){free(ptr);};
|
||||
//////////////////////////////////////////////
|
||||
|
||||
#ifdef GRID_SYCL
|
||||
inline void acceleratorFenceComputeStream(void){ accelerator_barrier();};
|
||||
inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); };
|
||||
#else
|
||||
// Ordering within a stream guaranteed on Nvidia & AMD
|
||||
inline void acceleratorFenceComputeStream(void){ };
|
||||
|
224
HMC/FTHMC2p1f.cc
Normal file
224
HMC/FTHMC2p1f.cc
Normal file
@ -0,0 +1,224 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Copyright (C) 2023
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
std::cout << std::setprecision(12);
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
int threads = GridThread::GetThreads();
|
||||
// here make a routine to print all the relevant information on the run
|
||||
std::cout << GridLogMessage << "Grid is setup to use " << threads << " threads" << std::endl;
|
||||
|
||||
// Typedefs to simplify notation
|
||||
typedef WilsonImplR FermionImplPolicy;
|
||||
typedef MobiusFermionD FermionAction;
|
||||
typedef typename FermionAction::FermionField FermionField;
|
||||
|
||||
typedef Grid::XmlReader Serialiser;
|
||||
|
||||
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
||||
IntegratorParameters MD;
|
||||
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
|
||||
// MD.name = std::string("Leap Frog");
|
||||
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
|
||||
// MD.name = std::string("Force Gradient");
|
||||
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
|
||||
MD.name = std::string("MinimumNorm2");
|
||||
MD.MDsteps = 12;
|
||||
MD.trajL = 1.0;
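// i.e. a nominal top-level step size of trajL/MDsteps = 1.0/12 ~ 0.083 per MD step.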
|
||||
|
||||
HMCparameters HMCparams;
|
||||
HMCparams.StartTrajectory = 0;
|
||||
HMCparams.Trajectories = 200;
|
||||
HMCparams.NoMetropolisUntil= 20;
|
||||
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
|
||||
HMCparams.StartingType =std::string("HotStart");
|
||||
HMCparams.MD = MD;
|
||||
HMCWrapper TheHMC(HMCparams);
|
||||
|
||||
// Grid from the command line arguments --grid and --mpi
|
||||
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
|
||||
|
||||
CheckpointerParameters CPparams;
|
||||
CPparams.config_prefix = "ckpoint_EODWF_lat";
|
||||
CPparams.smeared_prefix = "ckpoint_EODWF_lat_smr";
|
||||
CPparams.rng_prefix = "ckpoint_EODWF_rng";
|
||||
CPparams.saveInterval = 1;
|
||||
CPparams.saveSmeared = true;
|
||||
CPparams.format = "IEEE64BIG";
|
||||
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
|
||||
|
||||
RNGModuleParameters RNGpar;
|
||||
RNGpar.serial_seeds = "1 2 3 4 5";
|
||||
RNGpar.parallel_seeds = "6 7 8 9 10";
|
||||
TheHMC.Resources.SetRNGSeeds(RNGpar);
|
||||
|
||||
// Construct observables
|
||||
// here there is too much indirection
|
||||
typedef PlaquetteMod<HMCWrapper::ImplPolicy> PlaqObs;
|
||||
TheHMC.Resources.AddObservable<PlaqObs>();
|
||||
//////////////////////////////////////////////
|
||||
|
||||
const int Ls = 16;
|
||||
Real beta = 2.13;
|
||||
Real light_mass = 0.01;
|
||||
Real strange_mass = 0.04;
|
||||
Real pv_mass = 1.0;
|
||||
RealD M5 = 1.8;
|
||||
RealD b = 1.0; // Scale factor two
|
||||
RealD c = 0.0;
|
||||
|
||||
OneFlavourRationalParams OFRp;
|
||||
OFRp.lo = 1.0e-2;
|
||||
OFRp.hi = 64;
|
||||
OFRp.MaxIter = 10000;
|
||||
OFRp.tolerance= 1.0e-10;
|
||||
OFRp.degree = 14;
|
||||
OFRp.precision= 40;
|
||||
|
||||
std::vector<Real> hasenbusch({ 0.1 });
|
||||
|
||||
auto GridPtr = TheHMC.Resources.GetCartesian();
|
||||
auto GridRBPtr = TheHMC.Resources.GetRBCartesian();
|
||||
auto FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,GridPtr);
|
||||
auto FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,GridPtr);
|
||||
|
||||
IwasakiGaugeActionR GaugeAction(beta);
|
||||
|
||||
// temporarily need a gauge field
|
||||
LatticeGaugeField U(GridPtr);
|
||||
LatticeGaugeField Uhot(GridPtr);
|
||||
|
||||
// These lines are unnecessary if BC are all periodic
|
||||
std::vector<Complex> boundary = {1,1,1,-1};
|
||||
FermionAction::ImplParams Params(boundary);
|
||||
|
||||
double StoppingCondition = 1e-10;
|
||||
double MaxCGIterations = 30000;
|
||||
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
|
||||
|
||||
bool ApplySmearing = true;
|
||||
|
||||
////////////////////////////////////
|
||||
// Collect actions
|
||||
////////////////////////////////////
|
||||
ActionLevel<HMCWrapper::Field> Level1(1);
|
||||
ActionLevel<HMCWrapper::Field> Level2(2);
|
||||
ActionLevel<HMCWrapper::Field> Level3(4);
|
||||
|
||||
////////////////////////////////////
|
||||
// Strange action
|
||||
////////////////////////////////////
|
||||
|
||||
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
|
||||
MobiusEOFAFermionD Strange_Op_R (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , pv_mass, strange_mass, pv_mass, -1.0, 1, M5, b, c);
|
||||
ExactOneFlavourRatioPseudoFermionAction<FermionImplPolicy>
|
||||
EOFA(Strange_Op_L, Strange_Op_R,
|
||||
CG,
|
||||
CG, CG,
|
||||
CG, CG,
|
||||
OFRp, false);
|
||||
|
||||
EOFA.is_smeared = ApplySmearing;
|
||||
Level1.push_back(&EOFA);
|
||||
|
||||
////////////////////////////////////
|
||||
// up down action
|
||||
////////////////////////////////////
|
||||
std::vector<Real> light_den;
|
||||
std::vector<Real> light_num;
|
||||
|
||||
int n_hasenbusch = hasenbusch.size();
|
||||
light_den.push_back(light_mass);
|
||||
for(int h=0;h<n_hasenbusch;h++){
|
||||
light_den.push_back(hasenbusch[h]);
|
||||
light_num.push_back(hasenbusch[h]);
|
||||
}
|
||||
light_num.push_back(pv_mass);
|
||||
|
||||
std::vector<FermionAction *> Numerators;
|
||||
std::vector<FermionAction *> Denominators;
|
||||
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
|
||||
|
||||
for(int h=0;h<n_hasenbusch+1;h++){
|
||||
std::cout << GridLogMessage << " 2f quotient Action "<< light_num[h] << " / " << light_den[h]<< std::endl;
|
||||
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, Params));
|
||||
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, Params));
|
||||
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],CG,CG));
|
||||
}
|
||||
|
||||
for(int h=0;h<n_hasenbusch+1;h++){
|
||||
Quotients[h]->is_smeared = ApplySmearing;
|
||||
Level1.push_back(Quotients[h]);
|
||||
}
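// With the parameters above (light_mass=0.01, hasenbusch={0.1}, pv_mass=1.0) the two loops give
// light_den = {0.01, 0.1} and light_num = {0.1, 1.0}, i.e. two Hasenbusch quotient actions pairing
// (num, den) = (0.1, 0.01) and (1.0, 0.1): the mass chain runs from the light quark mass up to the
// Pauli-Villars mass in intermediate steps.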
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// lnDetJacobianAction
|
||||
/////////////////////////////////////////////////////////////
|
||||
double rho = 0.1; // smearing parameter
|
||||
int Nsmear = 1; // number of full smearing sweeps
|
||||
int Nstep = 8*Nsmear; // number of masked smearing steps - must be a multiple of 2Nd
|
||||
Smear_Stout<HMCWrapper::ImplPolicy> Stout(rho);
|
||||
SmearedConfigurationMasked<HMCWrapper::ImplPolicy> SmearingPolicy(GridPtr, Nstep, Stout);
|
||||
JacobianAction<HMCWrapper::ImplPolicy> Jacobian(&SmearingPolicy);
|
||||
if( ApplySmearing ) Level2.push_back(&Jacobian);
|
||||
std::cout << GridLogMessage << " Built the Jacobian "<< std::endl;
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// Gauge action
|
||||
/////////////////////////////////////////////////////////////
|
||||
// GaugeAction.is_smeared = ApplySmearing;
|
||||
GaugeAction.is_smeared = true;
|
||||
Level3.push_back(&GaugeAction);
|
||||
|
||||
std::cout << GridLogMessage << " ************************************************"<< std::endl;
|
||||
std::cout << GridLogMessage << " Action complete -- NO FERMIONS FOR NOW -- FIXME"<< std::endl;
|
||||
std::cout << GridLogMessage << " ************************************************"<< std::endl;
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
|
||||
|
||||
std::cout << GridLogMessage << " Running the FT HMC "<< std::endl;
|
||||
|
||||
TheHMC.TheAction.push_back(Level1);
|
||||
TheHMC.TheAction.push_back(Level2);
|
||||
TheHMC.TheAction.push_back(Level3);
|
||||
|
||||
TheHMC.Run(SmearingPolicy); // for smearing
|
||||
|
||||
Grid_finalize();
|
||||
} // main
|
||||
|
||||
|
||||
|
@ -227,7 +227,7 @@ int main(int argc, char **argv) {
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
|
||||
|
||||
int SP_iters=10000;
|
||||
int SP_iters=9000;
|
||||
|
||||
RationalActionParams OFRp; // Up/down
|
||||
OFRp.lo = 6.0e-5;
|
||||
@ -362,12 +362,12 @@ int main(int argc, char **argv) {
|
||||
|
||||
// Probably dominates the force - back to EOFA.
|
||||
OneFlavourRationalParams SFRp;
|
||||
SFRp.lo = 0.25;
|
||||
SFRp.lo = 0.1;
|
||||
SFRp.hi = 25.0;
|
||||
SFRp.MaxIter = 10000;
|
||||
SFRp.tolerance= 1.0e-5;
|
||||
SFRp.tolerance= 1.0e-8;
|
||||
SFRp.mdtolerance= 2.0e-4;
|
||||
SFRp.degree = 8;
|
||||
SFRp.degree = 12;
|
||||
SFRp.precision= 50;
|
||||
|
||||
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
|
||||
@ -451,7 +451,7 @@ int main(int argc, char **argv) {
|
||||
|
||||
#define MIXED_PRECISION
|
||||
#ifdef MIXED_PRECISION
|
||||
std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy> *> Bdys;
|
||||
std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys;
|
||||
#else
|
||||
std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
|
||||
#endif
|
||||
@ -526,15 +526,13 @@ int main(int argc, char **argv) {
|
||||
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
|
||||
} else {
|
||||
#ifdef MIXED_PRECISION
|
||||
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>(
|
||||
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
|
||||
*Numerators[h],*Denominators[h],
|
||||
*NumeratorsF[h],*DenominatorsF[h],
|
||||
*Numerators[h],*Denominators[h],
|
||||
OFRp, SP_iters) );
|
||||
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>(
|
||||
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF>(
|
||||
*Numerators[h],*Denominators[h],
|
||||
*NumeratorsF[h],*DenominatorsF[h],
|
||||
*Numerators[h],*Denominators[h],
|
||||
OFRp, SP_iters) );
|
||||
#else
|
||||
Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
|
||||
|
@ -329,7 +329,6 @@ int main(int argc, char **argv) {
|
||||
|
||||
|
||||
auto grid4= GridPtr;
|
||||
auto rbgrid4= GridRBPtr;
|
||||
auto rbgrid = StrangeOp.FermionRedBlackGrid();
|
||||
auto grid = StrangeOp.FermionGrid();
|
||||
if(1){
|
||||
|
@ -146,6 +146,8 @@ NAMESPACE_END(Grid);
|
||||
int main(int argc, char **argv) {
|
||||
using namespace Grid;
|
||||
|
||||
std::cout << " Grid Initialise "<<std::endl;
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
|
||||
CartesianCommunicator::BarrierWorld();
|
||||
@ -164,35 +166,30 @@ int main(int argc, char **argv) {
|
||||
typedef MobiusEOFAFermionF FermionEOFAActionF;
|
||||
typedef typename FermionActionF::FermionField FermionFieldF;
|
||||
|
||||
typedef WilsonImplD2 FermionImplPolicyD2;
|
||||
typedef MobiusFermionD2 FermionActionD2;
|
||||
typedef MobiusEOFAFermionD2 FermionEOFAActionD2;
|
||||
typedef typename FermionActionD2::FermionField FermionFieldD2;
|
||||
|
||||
typedef Grid::XmlReader Serialiser;
|
||||
|
||||
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
|
||||
IntegratorParameters MD;
|
||||
// typedef GenericHMCRunner<LeapFrog> HMCWrapper;
|
||||
// MD.name = std::string("Leap Frog");
|
||||
typedef GenericHMCRunner<ForceGradient> HMCWrapper;
|
||||
MD.name = std::string("Force Gradient");
|
||||
//typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
|
||||
// MD.name = std::string("MinimumNorm2");
|
||||
// typedef GenericHMCRunner<ForceGradient> HMCWrapper;
|
||||
// MD.name = std::string("Force Gradient");
|
||||
typedef GenericHMCRunner<MinimumNorm2> HMCWrapper;
|
||||
MD.name = std::string("MinimumNorm2");
|
||||
// TrajL = 2
|
||||
// 4/2 => 0.6 dH
|
||||
// 3/3 => 0.8 dH .. depth 3, slower
|
||||
//MD.MDsteps = 4;
|
||||
MD.MDsteps = 12;
|
||||
MD.MDsteps = 14;
|
||||
MD.trajL = 0.5;
|
||||
|
||||
HMCparameters HMCparams;
|
||||
HMCparams.StartTrajectory = 1077;
|
||||
HMCparams.Trajectories = 1;
|
||||
HMCparams.Trajectories = 20;
|
||||
HMCparams.NoMetropolisUntil= 0;
|
||||
// "[HotStart, ColdStart, TepidStart, CheckpointStart]\n";
|
||||
// HMCparams.StartingType =std::string("ColdStart");
|
||||
HMCparams.StartingType =std::string("CheckpointStart");
|
||||
HMCparams.StartingType =std::string("ColdStart");
|
||||
// HMCparams.StartingType =std::string("CheckpointStart");
|
||||
HMCparams.MD = MD;
|
||||
HMCWrapper TheHMC(HMCparams);
|
||||
|
||||
@ -228,7 +225,7 @@ int main(int argc, char **argv) {
|
||||
Real pv_mass = 1.0;
|
||||
// std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
|
||||
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
|
||||
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 }); // Updated
|
||||
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
|
||||
|
||||
auto GridPtr = TheHMC.Resources.GetCartesian();
|
||||
@ -249,11 +246,6 @@ int main(int argc, char **argv) {
|
||||
Coordinate shm;
|
||||
|
||||
GlobalSharedMemory::GetShmDims(mpi,shm);
|
||||
|
||||
Coordinate CommDim(Nd);
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
|
||||
|
||||
Coordinate NonDirichlet(Nd+1,0);
|
||||
|
||||
//////////////////////////
|
||||
// Fermion Grids
|
||||
@ -272,7 +264,6 @@ int main(int argc, char **argv) {
|
||||
// temporarily need a gauge field
|
||||
LatticeGaugeFieldD U(GridPtr); U=Zero();
|
||||
LatticeGaugeFieldF UF(GridPtrF); UF=Zero();
|
||||
LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero();
|
||||
|
||||
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
|
||||
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
|
||||
@ -283,15 +274,13 @@ int main(int argc, char **argv) {
|
||||
std::vector<Complex> boundary = {1,1,1,-1};
|
||||
FermionAction::ImplParams Params(boundary);
|
||||
FermionActionF::ImplParams ParamsF(boundary);
|
||||
Params.dirichlet=NonDirichlet;
|
||||
ParamsF.dirichlet=NonDirichlet;
|
||||
|
||||
// double StoppingCondition = 1e-14;
|
||||
// double MDStoppingCondition = 1e-9;
|
||||
double StoppingCondition = 1e-8;
|
||||
double MDStoppingCondition = 1e-7;
|
||||
double MDStoppingConditionLoose = 1e-7;
|
||||
double MDStoppingConditionStrange = 1e-7;
|
||||
double StoppingCondition = 1e-9;
|
||||
double MDStoppingCondition = 1e-8;
|
||||
double MDStoppingConditionLoose = 1e-8;
|
||||
double MDStoppingConditionStrange = 1e-8;
|
||||
double MaxCGIterations = 300000;
|
||||
ConjugateGradient<FermionField> CG(StoppingCondition,MaxCGIterations);
|
||||
ConjugateGradient<FermionField> MDCG(MDStoppingCondition,MaxCGIterations);
|
||||
@ -311,12 +300,12 @@ int main(int argc, char **argv) {
|
||||
|
||||
// Probably dominates the force - back to EOFA.
|
||||
OneFlavourRationalParams SFRp;
|
||||
SFRp.lo = 0.25;
|
||||
SFRp.hi = 25.0;
|
||||
SFRp.lo = 0.1;
|
||||
SFRp.hi = 30.0;
|
||||
SFRp.MaxIter = 10000;
|
||||
SFRp.tolerance= 1.0e-5;
|
||||
SFRp.mdtolerance= 2.0e-4;
|
||||
SFRp.degree = 8;
|
||||
SFRp.tolerance= 1.0e-8;
|
||||
SFRp.mdtolerance= 2.0e-6;
|
||||
SFRp.degree = 10;
|
||||
SFRp.precision= 50;
|
||||
|
||||
MobiusEOFAFermionD Strange_Op_L (U , *FGrid , *FrbGrid , *GridPtr , *GridRBPtr , strange_mass, strange_mass, pv_mass, 0.0, -1, M5, b, c);
|
||||
@ -376,33 +365,29 @@ int main(int argc, char **argv) {
|
||||
////////////////////////////////////
|
||||
std::vector<Real> light_den;
|
||||
std::vector<Real> light_num;
|
||||
std::vector<int> dirichlet_den;
|
||||
std::vector<int> dirichlet_num;
|
||||
|
||||
int n_hasenbusch = hasenbusch.size();
|
||||
light_den.push_back(light_mass); dirichlet_den.push_back(0);
|
||||
light_den.push_back(light_mass);
|
||||
for(int h=0;h<n_hasenbusch;h++){
|
||||
light_den.push_back(hasenbusch[h]); dirichlet_den.push_back(0);
|
||||
light_den.push_back(hasenbusch[h]);
|
||||
}
|
||||
|
||||
for(int h=0;h<n_hasenbusch;h++){
|
||||
light_num.push_back(hasenbusch[h]); dirichlet_num.push_back(0);
|
||||
light_num.push_back(hasenbusch[h]);
|
||||
}
|
||||
light_num.push_back(pv_mass); dirichlet_num.push_back(0);
|
||||
light_num.push_back(pv_mass);
|
||||
|
||||
std::vector<FermionAction *> Numerators;
|
||||
std::vector<FermionAction *> Denominators;
|
||||
std::vector<FermionActionF *> NumeratorsF;
|
||||
std::vector<FermionActionF *> DenominatorsF;
|
||||
std::vector<FermionActionD2 *> NumeratorsD2;
|
||||
std::vector<FermionActionD2 *> DenominatorsD2;
|
||||
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
|
||||
std::vector<MxPCG *> ActionMPCG;
|
||||
std::vector<MxPCG *> MPCG;
|
||||
|
||||
#define MIXED_PRECISION
|
||||
#ifdef MIXED_PRECISION
|
||||
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
|
||||
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF> *> Bdys;
|
||||
#else
|
||||
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
|
||||
#endif
|
||||
@ -416,9 +401,7 @@ int main(int argc, char **argv) {
|
||||
std::cout << GridLogMessage
|
||||
<< " 2f quotient Action ";
|
||||
std::cout << "det D("<<light_den[h]<<")";
|
||||
if ( dirichlet_den[h] ) std::cout << "^dirichlet ";
|
||||
std::cout << "/ det D("<<light_num[h]<<")";
|
||||
if ( dirichlet_num[h] ) std::cout << "^dirichlet ";
|
||||
std::cout << std::endl;
|
||||
|
||||
FermionAction::ImplParams ParamsNum(boundary);
|
||||
@ -426,21 +409,11 @@ int main(int argc, char **argv) {
|
||||
FermionActionF::ImplParams ParamsDenF(boundary);
|
||||
FermionActionF::ImplParams ParamsNumF(boundary);
|
||||
|
||||
ParamsNum.dirichlet = NonDirichlet;
|
||||
ParamsDen.dirichlet = NonDirichlet;
|
||||
|
||||
ParamsNum.partialDirichlet = 0;
|
||||
ParamsDen.partialDirichlet = 0;
|
||||
|
||||
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
|
||||
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
|
||||
|
||||
ParamsDenF.dirichlet = ParamsDen.dirichlet;
|
||||
ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
|
||||
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
|
||||
|
||||
ParamsNumF.dirichlet = ParamsNum.dirichlet;
|
||||
ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
|
||||
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
|
||||
|
||||
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
|
||||
@ -477,7 +450,6 @@ int main(int argc, char **argv) {
|
||||
// Gauge action
|
||||
/////////////////////////////////////////////////////////////
|
||||
Level3.push_back(&GaugeAction);
|
||||
// TheHMC.TheAction.push_back(Level1);
|
||||
TheHMC.TheAction.push_back(Level2);
|
||||
TheHMC.TheAction.push_back(Level3);
|
||||
std::cout << GridLogMessage << " Action complete "<< std::endl;
|
||||
|
@ -1,7 +1,8 @@
|
||||
# Grid [),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=GridBasedSoftware_Grid&tab=projectOverview)
|
||||
|
||||
# Grid
|
||||
**Data parallel C++ mathematical object library.**
|
||||
|
||||
[),branch:default:true)/statusIcon.svg)](https://ci.dev.dirac.ed.ac.uk/project/GridBasedSoftware_Grid?mode=builds)
|
||||
|
||||
License: GPL v2.
|
||||
|
||||
Last update June 2017.
|
||||
|
@ -425,7 +425,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
|
||||
|
||||
err = r_eo-result;
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
|
||||
assert(n2e<1.0e-4);
|
||||
|
||||
pickCheckerboard(Even,src_e,err);
|
||||
|
387
benchmarks/Benchmark_dwf_fp32_paranoid.cc
Normal file
387
benchmarks/Benchmark_dwf_fp32_paranoid.cc
Normal file
@ -0,0 +1,387 @@
|
||||
/*************************************************************************************
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
Source file: ./benchmarks/Benchmark_dwf.cc
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#ifdef GRID_CUDA
|
||||
#define CUDA_PROFILE
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_PROFILE
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
template<class d>
|
||||
struct scal {
|
||||
d internal;
|
||||
};
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
int Ls=16;
|
||||
for(int i=0;i<argc;i++)
|
||||
if(std::string(argv[i]) == "-Ls"){
|
||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||
}
|
||||
|
||||
GridLogLayout();
|
||||
|
||||
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
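// e.g. Nc=3: 8*3*(7+16*3) = 1320 flops per lattice site per Dhop application.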
|
||||
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
|
||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
LatticeFermionF src (FGrid); random(RNG5,src);
|
||||
LatticeFermionF src1 (FGrid); random(RNG5,src1);
|
||||
#if 0
|
||||
src = Zero();
|
||||
{
|
||||
Coordinate origin({0,0,0,latt4[2]-1,0});
|
||||
SpinColourVectorF tmp;
|
||||
tmp=Zero();
|
||||
tmp()(0)(0)=Complex(-2.0,0.0);
|
||||
std::cout << " source site 0 " << tmp<<std::endl;
|
||||
pokeSite(tmp,src,origin);
|
||||
}
|
||||
#else
|
||||
RealD N2 = 1.0/::sqrt(norm2(src));
|
||||
src = src*N2;
|
||||
#endif
|
||||
|
||||
|
||||
LatticeFermionF result(FGrid); result=Zero();
|
||||
LatticeFermionF ref(FGrid); ref=Zero();
|
||||
LatticeFermionF tmp(FGrid);
|
||||
LatticeFermionF err(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||
LatticeGaugeFieldF Umu(UGrid);
|
||||
SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||
#if 0
|
||||
Umu=1.0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
LatticeColourMatrixF ttmp(UGrid);
|
||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
// if (mu !=2 ) ttmp = 0;
|
||||
// ttmp = ttmp* pow(10.0,mu);
|
||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
||||
#endif
|
||||
|
||||
////////////////////////////////////
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
// replicate across fifth dimension
|
||||
// LatticeGaugeFieldF Umu5d(FGrid);
|
||||
std::vector<LatticeColourMatrixF> U(4,UGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||
|
||||
if (1)
|
||||
{
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
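// The reference above is the standard hopping term, applied s-slice by s-slice:
//   Dhop psi(x) = -1/2 sum_mu [ (1 - gamma_mu) U_mu(x) psi(x+mu) + (1 + gamma_mu) U_mu^dag(x-mu) psi(x-mu) ]
// which is what Dw.Dhop is checked against below.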
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =1.8;
|
||||
|
||||
RealD NP = UGrid->_Nprocessors;
|
||||
RealD NN = UGrid->NodeCount();
|
||||
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
|
||||
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
|
||||
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
#endif
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
|
||||
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||
int ncall =100;
|
||||
|
||||
if (1) {
|
||||
FGrid->Barrier();
|
||||
Dw.Dhop(src,result,0);
|
||||
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.Dhop(src1,result,0);
|
||||
Dw.Dhop(src,result,0);
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
assert (norm2(err)< 1.0e-4 );
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=single_site_flops*volume*ncall;
|
||||
|
||||
auto nsimd = vComplex::Nsimd();
|
||||
auto simdwidth = sizeof(vComplex);
|
||||
|
||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
||||
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
|
||||
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
|
||||
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
//exit(0);
|
||||
|
||||
if(( norm2(err)>1.0e-4) ) {
|
||||
|
||||
/*
|
||||
std::cout << "RESULT\n " << result<<std::endl;
|
||||
std::cout << "REF \n " << ref <<std::endl;
|
||||
std::cout << "ERR \n " << err <<std::endl;
|
||||
*/
|
||||
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
|
||||
FGrid->Barrier();
|
||||
exit(-1);
|
||||
}
|
||||
assert (norm2(err)< 1.0e-4 );
|
||||
}
|
||||
|
||||
if (1)
|
||||
{ // Naive wilson dag implementation
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
autoView( tmp_v, tmp, CpuRead);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
int i=s+Ls*ss;
|
||||
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
// tmp =adj(U[mu])*src;
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
autoView( tmp_v, tmp, CpuRead);
|
||||
for(int i=0;i<ref_v.size();i++){
|
||||
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||
}
|
||||
}
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
// dump=1;
Dw.Dhop(src,result,1);

std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref    "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff   "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
  /*
  std::cout<< "DAG RESULT\n "  <<ref    << std::endl;
  std::cout<< "DAG sRESULT\n " <<result << std::endl;
  std::cout<< "DAG ERR \n "    << err   <<std::endl;
  */
}

LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e   (FrbGrid);
LatticeFermionF r_o   (FrbGrid);
LatticeFermionF r_eo  (FGrid);

std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
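// Red-black check: split src by site parity, apply the even<->odd hopping terms separately,
// then reassemble with setCheckerboard.  Since the hopping term only connects opposite
// parities, Deo (odd->even) plus Doe (even->odd) must reproduce the full unpreconditioned Dhop.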
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);

std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;

// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO                "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
  FGrid->Barrier();
  Dw.DhopEO(src_o,r_e,DaggerNo);
  double t0=usecond();
  for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
    if(i==10) cudaProfilerStart();
#endif
    Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
    if(i==20) cudaProfilerStop();
#endif
  }
  double t1=usecond();
  FGrid->Barrier();

  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
  double flops=(single_site_flops*volume*ncall)/2.0;
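  // Each checkerboarded application touches only half of the sites, hence flops/2 relative to the full Dhop above.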
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
|
||||
}
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
Dw.DhopOE(src_e,r_o,DaggerNo);
|
||||
Dw.Dhop (src ,result,DaggerNo);
|
||||
|
||||
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
|
||||
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
|
||||
|
||||
setCheckerboard(r_eo,r_o);
|
||||
setCheckerboard(r_eo,r_e);
|
||||
|
||||
err = r_eo-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
if((norm2(err)>1.0e-4)){
|
||||
/*
|
||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
|
||||
pickCheckerboard(Even,src_e,err);
|
||||
pickCheckerboard(Odd,src_o,err);
|
||||
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
|
||||
|
||||
assert(norm2(src_e)<1.0e-4);
|
||||
assert(norm2(src_o)<1.0e-4);
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
90 documentation/David_notes.txt Normal file
@ -0,0 +1,90 @@
Branch: develop

Files:

Grid/lattice/PaddedCell.h -- Halo exchange
tests/Test_general_stencil.cc -- test local off axis stencil addressing
tests/debug/Test_padded_cell.cc -- test PaddedCell halo exchange and the General local stencil by computing ALL plaquettes on lattice

Functionality:

-- extend a lattice field:
Grid/lattice/PaddedCell.h

   // Constructor
   PaddedCell(int _depth,GridCartesian *_grid)

   // Expand a field "in" to depth "d"
   template<class vobj>
   inline Lattice<vobj> Exchange(Lattice<vobj> &in)

   // Take the "apple core" of in to a smaller local volume
   template<class vobj>
   inline Lattice<vobj> Extract(Lattice<vobj> &in)
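A minimal round-trip sketch of the API above (variable names are illustrative, not from the test;
it assumes a GridCartesian "GRID" and a gauge field "Umu" as in the plaquette test below):

   PaddedCell Ghost(1,&GRID);                        // depth-1 halo
   LatticeGaugeField Ughost = Ghost.Exchange(Umu);   // enlarged local volume, halo populated
   // ... purely local stencil work on Ughost ...
   LatticeGaugeField Uback  = Ghost.Extract(Ughost); // back to the original local volume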
-- Plaquette test:
tests/debug/Test_padded_cell.cc

   /////////////////////////////////////////////////
   // Create a padded cell of extra padding depth=1
   /////////////////////////////////////////////////
   int depth = 1;
   PaddedCell Ghost(depth,&GRID);
   LatticeGaugeField Ughost = Ghost.Exchange(Umu);

   ///// Array for the site plaquette
   GridBase *GhostGrid = Ughost.Grid();
   LatticeComplex gplaq(GhostGrid);

   std::vector<Coordinate> shifts;
   for(int mu=0;mu<Nd;mu++){
     for(int nu=mu+1;nu<Nd;nu++){

       //  Umu(x) Unu(x+mu) Umu^dag(x+nu) Unu^dag(x)
       Coordinate shift_0(Nd,0);
       Coordinate shift_mu(Nd,0); shift_mu[mu]=1;
       Coordinate shift_nu(Nd,0); shift_nu[nu]=1;
       shifts.push_back(shift_0);
       shifts.push_back(shift_mu);
       shifts.push_back(shift_nu);
       shifts.push_back(shift_0);
     }
   }
   GeneralLocalStencil gStencil(GhostGrid,shifts);
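   // Entry layout assumed by the loop below: the shifts were pushed in groups of four per (mu,nu)
   // pair, so entries s+0..s+3 at site ss address the four link positions of the plaquette
   //   U_mu(x) U_nu(x+mu) U_mu^dag(x+nu) U_nu^dag(x),
   // and s advances by 4 after each pair.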
   gplaq=Zero();
   {
     autoView( gp_v , gplaq, CpuWrite);
     autoView( t_v , trplaq, CpuRead);
     autoView( U_v , Ughost, CpuRead);
     for(int ss=0;ss<gp_v.size();ss++){
       int s=0;
       for(int mu=0;mu<Nd;mu++){
         for(int nu=mu+1;nu<Nd;nu++){

           auto SE0 = gStencil.GetEntry(s+0,ss);
           auto SE1 = gStencil.GetEntry(s+1,ss);
           auto SE2 = gStencil.GetEntry(s+2,ss);
           auto SE3 = gStencil.GetEntry(s+3,ss);

           int o0 = SE0->_offset;
           int o1 = SE1->_offset;
           int o2 = SE2->_offset;
           int o3 = SE3->_offset;

           auto U0 = U_v[o0](mu);
           auto U1 = U_v[o1](nu);
           auto U2 = adj(U_v[o2](mu));
           auto U3 = adj(U_v[o3](nu));

           gpermute(U0,SE0->_permute);
           gpermute(U1,SE1->_permute);
           gpermute(U2,SE2->_permute);
           gpermute(U3,SE3->_permute);

           gp_v[ss]() = gp_v[ss]() + trace( U0*U1*U2*U3 );
           s=s+4;
         }
       }
     }
   }
   cplaq = Ghost.Extract(gplaq);
@ -10,9 +10,8 @@ For first time setup of the Xcode and Grid build environment on Mac OS, you will

1. Install Xcode and the Xcode command-line utilities
2. Set Grid environment variables
3. Install and build Open MPI ***optional***
4. Install and build Grid pre-requisites
5. Install, Configure and Build Grid
3. Install and build Grid pre-requisites
4. Install, Configure and Build Grid

Apple's [Xcode website][Xcode] is the go-to reference for 1, and the definitive reference for 4 and 5 is the [Grid Documentation][GridDoc].

@ -92,60 +91,33 @@ launchctl setenv GridPkg /opt/local</string>
</plist>
```

## 3. Install and build Open MPI -- ***optional***

Download the latest version of [Open MPI][OMPI] version 3.1 (I used 3.1.5) and build it like so:

[OMPI]: https://www.open-mpi.org/software/ompi/v3.1/

    ../configure CC=clang CXX=clang++ CXXFLAGS=-g --prefix=$GridPre/bin
    make -j 4 all install

***Note the `/bin` at the end of the prefix - this is required. As a quirk of the OpenMPI installer, `--prefix` must point to the `bin` subdirectory, with other files installed in `$GridPre/include`, `$GridPre/lib`, `$GridPre/share`, etc.***

Grid does not have any dependencies on fortran, however many standard scientific packages do, so you may wish to download GNU fortran (e.g. MacPorts ``gfortran`` package) and add the following to your configure invocation:

    F77=gfortran FC=gfortran

## 4. Install and build Grid pre-requisites
## 3. Install and build Grid pre-requisites

To simplify the installation of **Grid pre-requisites**, you can use your favourite package manager, e.g.:

### 1. [MacPorts][MacPorts]
### 3.1. [MacPorts][MacPorts]

[MacPorts]: https://www.macports.org "MacPorts package manager"

Install [MacPorts][MacPorts] if you haven't done so already, and then install packages with:

    sudo port install <portname>
    sudo port install openmpi git-flow-avh gmp hdf5 mpfr fftw-3-single lapack wget autoconf automake bison cmake gawk libomp

These are the `portname`s for mandatory Grid libraries:
On a Mac without GPUs:

* git-flow-avh
* gmp
* hdf5
* mpfr
    sudo port install OpenBLAS +native

and these are the `portname`s for optional Grid libraries:
To use `Gnu sha256sum`:

* fftw-3-single
* lapack
* doxygen
* OpenBLAS
    pushd /opt/local/bin; sudo ln -s gsha256sum sha256sum; popd

***Please update this list with any packages I've missed! ... and double-check whether OpenBLAS is really for Grid. NB: lapack doesn't seem to work. Should it be scalapack?***
These `port`s are not strictly necessary, but they are helpful:

### 2. [Homebrew][Homebrew]
    sudo port install gnuplot gsl h5utils nasm rclone texinfo tree xorg-server

[Homebrew]: https://brew.sh "Homebrew package manager"
***Please update this list with any packages I've missed!***

Install [Homebrew][Homebrew] if you haven't done so already, and then install packages with:

    sudo brew install <packagename>

The same packages are available as from MacPorts.

### Install LIME ***optional***
#### Install LIME

There isn't currently a port for [C-LIME][C-LIME], so download the source and then build it:

@ -154,9 +126,19 @@ There isn't currently a port for [C-LIME][C-LIME], so download the source and th
    ../configure CC=clang --prefix=$GridPre
    make -j 4 all install

## 5. Install, Configure and Build Grid
### 3.2. [Homebrew][Homebrew]

### 5.1 Install Grid
[Homebrew]: https://brew.sh "Homebrew package manager"

Install [Homebrew][Homebrew] if you haven't done so already, and then install packages with:

    sudo brew install <packagename>

I don't use Homebrew, so I'm not sure what the Brew package name equivalents are. ** Please update if you know **

## 4. Install, Configure and Build Grid

### 4.1 Install Grid

[Grid]: https://github.com/paboyle/Grid

@ -174,7 +156,7 @@ or

depending on how many times you like to enter your password.

### 5.2 Configure Grid
### 4.2 Configure Grid

The Xcode build system supports multiple configurations for each project, by default: `Debug` and `Release`, but more configurations can be defined. We will create separate Grid build directories for each configuration, using the Grid **Autoconf** build system to make each configuration. NB: it is **not** necessary to run `make install` on them once they are built (IDE features such as *jump to definition* will work better if you don't).

@ -198,7 +180,7 @@ Debug configuration with MPI:

    ../configure CXX=clang++ CXXFLAGS="-I$GridPkg/include/libomp -Xpreprocessor -fopenmp -std=c++11" LDFLAGS="-L$GridPkg/lib/libomp" LIBS="-lomp" --with-hdf5=$GridPkg --with-gmp=$GridPkg --with-mpfr=$GridPkg --with-fftw=$GridPkg --with-lime=$GridPre --enable-simd=GEN --enable-comms=mpi-auto MPICXX=$GridPre/bin/mpicxx --prefix=$GridPre/MPIDebug

### 5.3 Build Grid
### 4.3 Build Grid

Each configuration must be built before it can be used. You can either:
133 examples/socket_grid.cc Normal file
@ -0,0 +1,133 @@
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
#include <stdio.h>
#include <err.h>
#include <fcntl.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>

static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];

class UnixSockets {
public:
  static void Open(int rank)
  {
    int errnum;

    sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
    printf("allocated socket %d\n",sock);

    struct sockaddr_un sa_un = { 0 };
    sa_un.sun_family = AF_UNIX;
    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
    unlink(sa_un.sun_path);
    if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
      perror("bind failure");
      exit(EXIT_FAILURE);
    }
    printf("bound socket %d to %s\n",sock,sa_un.sun_path);
  }

  static int RecvFileDescriptor(void)
  {
    int n;
    int fd;
    char buf[1];
    struct iovec iov;
    struct msghdr msg;
    struct cmsghdr *cmsg;
    char cms[CMSG_SPACE(sizeof(int))];

    iov.iov_base = buf;
    iov.iov_len = 1;

    memset(&msg, 0, sizeof msg);
    msg.msg_name = 0;
    msg.msg_namelen = 0;
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;

    msg.msg_control = (caddr_t)cms;
    msg.msg_controllen = sizeof cms;

    if((n=recvmsg(sock, &msg, 0)) < 0) {
      perror("recvmsg failed");
      return -1;
    }
    if(n == 0){
      perror("recvmsg returned 0");
      return -1;
    }
    cmsg = CMSG_FIRSTHDR(&msg);
    memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
    printf("received fd %d from socket %d\n",fd,sock);
    return fd;
  }

  static void SendFileDescriptor(int fildes,int xmit_to_rank)
  {
    struct msghdr msg;
    struct iovec iov;
    struct cmsghdr *cmsg = NULL;
    char ctrl[CMSG_SPACE(sizeof(int))];
    char data = ' ';

    memset(&msg, 0, sizeof(struct msghdr));
    memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
    iov.iov_base = &data;
    iov.iov_len = sizeof(data);

    sprintf(sock_path,sock_path_fmt,xmit_to_rank);
    printf("sending FD %d over socket %d to rank %d AF_UNIX path %s\n",fildes,sock,xmit_to_rank,sock_path);fflush(stdout);

    struct sockaddr_un sa_un = { 0 };
    sa_un.sun_family = AF_UNIX;
    snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);

    msg.msg_name = (void *)&sa_un;
    msg.msg_namelen = sizeof(sa_un);
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_controllen = CMSG_SPACE(sizeof(int));
    msg.msg_control = ctrl;

    cmsg = CMSG_FIRSTHDR(&msg);
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(int));

    *((int *) CMSG_DATA(cmsg)) = fildes;

    if ( sendmsg(sock, &msg, 0) == -1 ) perror("sendmsg failed");
  };
};

int main(int argc, char **argv)
{
  int me = fork()?0:1;

  UnixSockets::Open(me);

  // need MPI barrier
  sleep(10);
  const char * message = "Hello, World\n";
  if( me ) {
    int fd = open("foo",O_RDWR|O_CREAT,0666);
    if ( fd < 0 ) {
      perror("failed to open file");
      exit(EXIT_FAILURE);
    }
    // rank 1 sends to rank 0
    UnixSockets::SendFileDescriptor(fd,0);
    close(fd);
  } else {
    // rank 0 receives from rank 1
    int fd = UnixSockets::RecvFileDescriptor();
    write(fd,(const void *)message,strlen(message));
    close(fd);
  }
}
@ -60,7 +60,7 @@ while test $# -gt 0; do
      ;;

    --cxxflags)
      echo @GRID_CXXFLAGS@
      echo @GRID_CXXFLAGS@ -I@prefix@/include
      ;;

    --cxx)
@ -72,11 +72,11 @@
      ;;

    --ldflags)
      echo @GRID_LDFLAGS@
      echo @GRID_LDFLAGS@ -L@prefix@/lib
      ;;

    --libs)
      echo @GRID_LIBS@
      echo @GRID_LIBS@ -lGrid
      ;;

    --summary)
@ -1,7 +1,7 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
    --with-lime=$CLIME \
    --enable-unified=yes \
    --enable-unified=no \
    --enable-shm=nvlink \
    --enable-tracing=timer \
    --enable-accelerator=hip \
@ -5,8 +5,8 @@ module load emacs
#module load gperftools
module load PrgEnv-gnu
module load rocm/5.3.0
module load cray-mpich/8.1.16
#module load cray-mpich/8.1.17
#module load cray-mpich/8.1.16
module load cray-mpich/8.1.17
module load gmp
module load cray-fftw
module load craype-accel-amd-gfx90a
44 systems/Lumi/benchmarks/bench2.slurm Executable file
@ -0,0 +1,44 @@
#!/bin/bash -l
#SBATCH --job-name=bench_lehner
#SBATCH --partition=small-g
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=7
#SBATCH --gpus-per-node=8
#SBATCH --time=00:10:00
#SBATCH --account=project_465000546
#SBATCH --gpu-bind=none
#SBATCH --exclusive
#SBATCH --mem=0

CPU_BIND="map_cpu:48,56,32,40,16,24,1,8"
echo $CPU_BIND

cat << EOF > select_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3 4 5 6 7)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec \$*
EOF

chmod +x ./select_gpu

root=/scratch/project_465000546/boylepet/Grid/systems/Lumi
source ${root}/sourceme.sh

export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM

for vol in 16.16.16.64 32.32.32.64 32.32.32.128
do
srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol

srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
#srun --cpu-bind=${CPU_BIND} ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
done
30 systems/Lumi/config-command Normal file
@ -0,0 +1,30 @@
spack load c-lime
spack load gmp
spack load mpfr
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
echo clime X$CLIME
echo gmp X$GMP
echo mpfr X$MPFR

../../configure \
    --enable-comms=mpi-auto \
    --with-lime=$CLIME \
    --enable-unified=no \
    --enable-shm=nvlink \
    --enable-accelerator=hip \
    --enable-gen-simd-width=64 \
    --enable-simd=GPU \
    --enable-accelerator-cshift \
    --with-gmp=$GMP \
    --with-mpfr=$MPFR \
    --with-fftw=$FFTW_DIR/.. \
    --disable-fermion-reps \
    --disable-gparity \
    CXX=hipcc MPICXX=mpicxx \
    CXXFLAGS="-fPIC --offload-arch=gfx90a -I/opt/rocm/include/ -std=c++14 -I/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/include" \
    LDFLAGS="-L/opt/cray/pe/mpich/8.1.23/ofi/gnu/9.1/lib -lmpi -L/opt/cray/pe/mpich/8.1.23/gtl/lib -lmpi_gtl_hsa -lamdhip64 -fopenmp"
5 systems/Lumi/sourceme.sh Normal file
@ -0,0 +1,5 @@
source ~/spack/share/spack/setup-env.sh
module load CrayEnv LUMI/22.12 partition/G cray-fftw/3.3.10.1 rocm
spack load c-lime
spack load gmp
spack load mpfr
53 systems/OEM/README Normal file
@ -0,0 +1,53 @@
1. Prerequisites:
===================
Make sure you have the latest Intel icpx release loaded (via modules or similar)
Make sure you have SYCL aware MPICH or Intel MPI loaded (assumed as mpicxx)

2. Obtain Grid:
===================

bash$
git clone https://github.com/paboyle/Grid
cd Grid
./bootstrap.sh
cd systems/OEM

3. Build Grid:
===================

Here, the configure command is stored in the file config-command:

bash$
../../configure \
    --enable-simd=GPU \
    --enable-gen-simd-width=64 \
    --enable-comms=mpi-auto \
    --enable-accelerator-cshift \
    --disable-gparity \
    --disable-fermion-reps \
    --enable-shm=nvlink \
    --enable-accelerator=sycl \
    --enable-unified=no \
    MPICXX=mpicxx \
    CXX=icpx \
    LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
    CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "

make all

4. Run a benchmark:
===================

*** Assumes interactive access to node. ***

run Benchmark_dwf_fp32 using benchmarks/bench.sh

bash$
cd benchmarks
./bench.sh
18 systems/OEM/benchmarks/bench.sh Executable file
@ -0,0 +1,18 @@
#!/bin/bash

export EnableImplicitScaling=0
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
export ZE_AFFINITY_MASK=$gpu_id.$tile_id
export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1

mpiexec -launcher ssh -n 1 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 1tile.log
mpiexec -launcher ssh -n 2 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads 16 --shm-mpi 1 --shm 2048 --device-mem 32768 | tee 2tile.log

#mpiexec -launcher ssh -n 4 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.2 --grid 16.16.64.64 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 4tile.log
#mpiexec -launcher ssh -n 8 -host localhost ./select_gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.2.4 --grid 16.16.64.128 --accelerator-threads 16 --shm-mpi 0 --shm 2048 --device-mem 32768 | tee 8tile.log
13 systems/OEM/benchmarks/select_gpu.sh Executable file
@ -0,0 +1,13 @@
#!/bin/bash

num_tile=2

gpu_id=$(( (MPI_LOCALRANKID % num_tile ) ))
tile_id=$((MPI_LOCALRANKID / num_tile))

export ZE_AFFINITY_MASK=$gpu_id.$tile_id

echo "local rank $MPI_LOCALRANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK"

"$@"
15 systems/OEM/config-command Normal file
@ -0,0 +1,15 @@
../../configure \
    --enable-simd=GPU \
    --enable-gen-simd-width=64 \
    --enable-comms=mpi-auto \
    --enable-accelerator-cshift \
    --disable-gparity \
    --disable-fermion-reps \
    --enable-shm=nvlink \
    --enable-accelerator=sycl \
    --enable-unified=no \
    MPICXX=mpicxx \
    CXX=icpx \
    LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader " \
    CXXFLAGS="-fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare "
3 systems/OEM/setup.sh Normal file
@ -0,0 +1,3 @@
export https_proxy=http://proxy-chain.intel.com:911
module load intel-release
module load intel/mpich
@ -4,7 +4,7 @@
#SBATCH -p QZ1J-ICX-PVC
##SBATCH -p QZ1J-SPR-PVC-2C

source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh

export NT=8

@ -4,7 +4,7 @@

#SBATCH -p QZ1J-ICX-PVC

source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh

export NT=16

@ -19,11 +19,15 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0

#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log
for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
do
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i
done

mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0
Some files were not shown because too many files have changed in this diff.