mirror of https://github.com/paboyle/Grid.git synced 2025-06-22 09:42:02 +01:00

Compare commits


56 Commits

SHA1 Message Date
12d20d8e15 Merge branch 'release/0.10.0' 2023-03-29 16:35:33 -04:00
10e6d7c6ce Merge branch 'feature/dirichlet' into develop 2023-03-29 16:26:47 -04:00
c42e25e5b8 Dirichlet remove 2023-03-29 16:25:52 -04:00
25777e5967 Merge branch 'release/0.9.0' 2023-03-29 15:27:58 -04:00
a00ae981e0 Fence propagation from SYCL 2023-03-29 15:00:40 -04:00
58e020b62a Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-29 14:37:40 -04:00
a7e1aceeca Compile fix on Nvidia 2023-03-29 14:36:50 -04:00
7212432f43 More careful fencing 2023-03-28 20:10:22 -07:00
4a261fab30 Changes premerge to develop 2023-03-28 20:04:21 -07:00
6af97069b9 Preparing for close of feature/dirichlet
Initial code change review complete
2023-03-28 13:39:44 -07:00
5068413cdb Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 08:35:38 -07:00
71c6960eea Commet 2023-03-28 08:34:24 -07:00
ddf6d5c9e3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-28 11:33:05 -04:00
900e01f49b Temporary 2023-03-27 21:35:06 -07:00
2376156fbc Merge branch 'develop' into feature/dirichlet 2023-03-27 21:33:50 -07:00
3f2fd49db4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2023-03-27 17:29:54 -07:00
0efa107cb6 Script update 2023-03-27 17:29:43 -07:00
8feedb4f6f Include files moved 2023-03-27 17:29:21 -07:00
05e562e3d7 Move the copy synch out to stencil and do one per call instead of one per packet 2023-03-27 17:28:38 -07:00
dd3bbb8fa2 MOve the synchronise out to the stencil so one call instead of one call per packet 2023-03-27 17:27:45 -07:00
2fbcf13c46 SYCL fix 2023-03-27 14:25:14 -07:00
4ea48ef0c4 Merge pull request #419 from lehner/feature/gpt
Separate rankSum from sum
2023-03-24 15:42:16 -04:00
5c85774ee3 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-24 15:40:57 -04:00
d8a9a745d8 stream synchronise 2023-03-24 15:40:30 -04:00
dcf172da3b Merge pull request #415 from paboyle/feature/block_lanczos22
Feature/block lanczos22
2023-03-24 12:08:16 -04:00
d57ed25071 Merge branch 'feature/dirichlet' into feature/block_lanczos22 2023-03-24 12:08:09 -04:00
546be724e7 Merge pull request #421 from UniOfLeicester/feature/accel_Copy_plane
Populate the Cshift_table in the GPU
2023-03-24 12:04:06 -04:00
8a1b9073f9 Mshift update 2023-03-23 15:39:30 -04:00
1a7114d4b9 Temporary algorithm while sorting out mixed prec 2023-03-23 15:38:35 -04:00
3f385f717c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet
Conflicts:
	systems/PVC/benchmarks/run-2tile-mpi.sh
	systems/PVC/config-command
2023-03-23 14:52:53 -04:00
481bbaf1fc Interface to query memory use 2023-03-23 12:55:31 -04:00
281488611a WriteDiscard on construct 2023-03-23 10:28:50 -04:00
c180a52518 Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-23 10:28:01 -04:00
90130e25e9 TODO list 2023-03-23 10:27:02 -04:00
7db8dd7a95 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:04:27 -04:00
8b43be39c0 Config command 2023-03-21 16:00:52 -04:00
f17f879206 Test update 2023-03-21 15:59:29 -04:00
68428fceab Integrator update 2023-03-21 15:58:49 -04:00
4135f2dcd1 Compressor 2023-03-21 15:41:41 -04:00
c5bdf61215 AUdit fix 2023-03-21 15:38:39 -04:00
88e218e8ee Stencil updates 2023-03-21 15:37:58 -04:00
0f2b786436 Vector -> vector 2023-03-21 15:36:11 -04:00
bae0f8ea99 Merge pull request #425 from rrhodgson/feature/CacheLogging
Huge Cache
2023-03-21 08:59:08 -04:00
bbbcd36ae5 Merge pull request #426 from rrhodgson/feature/LCDeflation
Batched Local Coherence Tools
2023-03-21 08:58:40 -04:00
a3e935c902 Batched block project/promote size checks 2023-02-27 11:38:16 +00:00
7731c7db8e Add huge cache type and allow Ncache==0 2023-02-26 14:15:28 +00:00
ff97340324 Expose cached bytes 2023-02-26 12:22:45 +00:00
920a51438d Added batched Mixed precision CG 2023-02-14 17:04:13 +00:00
be528b6d27 Add batched block project/promote functions 2023-02-14 14:37:10 +00:00
7d62f1d6d2 Populate the Cshift_table in the GPU
Cshift is allocated in Unified memory and used
in the LambdaApply kernels but also populated
from the host. This creates a lot of Unified HtoD
and DtoH mem operations and has a negative effect
in performance. With this commit we populate the
Cshift table in the device with the
populate_Cshift_table() kernel.
2023-01-11 21:26:25 +00:00
458c943987 merged upstream 2022-12-31 11:16:21 +02:00
88015b0858 Split sum in rankSum and GlobalSum 2022-12-26 10:01:32 +01:00
dc747c54be Merge branch 'develop' into feature/dirichlet
Conflicts:
	Grid/qcd/action/fermion/WilsonCompressor.h
	Grid/stencil/Stencil.h
2022-12-13 08:24:58 -05:00
dc6a38f177 Minor cleanup 2022-11-30 17:13:12 -05:00
82c1ecf60f Block lanczos added 2022-11-30 16:08:40 -05:00
deab11e68b Flop cout matches DiRAC-ITT-2020 2020-11-16 17:15:34 +01:00
61 changed files with 4418 additions and 208 deletions

View File

@@ -55,6 +55,7 @@ NAMESPACE_CHECK(BiCGSTAB);
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
 #include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
+#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
 #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
 #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
 #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>

View File

@@ -191,7 +191,7 @@ public:
 std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
 std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
-std::cout << GridLogMessage << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
+std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
 if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

View File

@@ -0,0 +1,213 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif
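The batched solver above shares one restart schedule across all right-hand sides: each outer iteration recomputes every residual in double precision, runs one single-precision CG per source, and only then decides whether to restart. A minimal usage sketch, assuming double/single-precision Hermitian operators HermOpD/HermOpF, the two grids, and fermion field types already exist (these names are illustrative, not part of the diff):

// Sketch only: solve NBatch double-precision systems with single-precision
// inner solves and a shared restart schedule.
MixedPrecisionConjugateGradientBatched<LatticeFermionD,LatticeFermionF>
  mCG(1.0e-8,          // Tolerance (outer)
      10000,           // MaxInnerIterations per restart
      50,              // MaxOuterIterations (restarts)
      10000,           // MaxPatchupIterations (final double-precision CG)
      SinglePrecGrid, HermOpF, HermOpD);
std::vector<LatticeFermionD> srcs(NBatch, DoublePrecGrid);
std::vector<LatticeFermionD> sols(NBatch, DoublePrecGrid);
for (auto &s : sols) s = Zero();   // zero initial guesses
mCG(srcs, sols);                   // per-solve iteration counts are printed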

View File

@@ -0,0 +1,373 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety a final regular CG is applied to clean up if necessary
//PB Pure single, then double fixup
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
psi_f[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChangeFast(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChangeFast(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChangeFast(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChangeFast(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_f[s],a,ps_f[s],r_f);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
PrecChangeTimer.Start();
precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update single precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
}
}
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChangeFast(psi_d[s],psi_f[s]);
}
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
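The cleanup variant runs the multi-shift body almost entirely in single precision and falls back to a per-shift MixedPrecisionConjugateGradient only for shifts whose true residual misses the target. A hedged usage sketch (MDPoles, HermOpD/HermOpF, the grids, and the source field are assumed to exist; none of these names come from the diff):

// Sketch only: one solution per pole of the rational approximation.
ConjugateGradientMultiShiftMixedPrecCleanup<LatticeFermionD,LatticeFermionF>
  msCG(15000,          // MaxIterationsMshift
       MDPoles,        // MultiShiftFunction: poles and per-shift tolerances
       SinglePrecGrid, // grid for the single-precision work fields
       HermOpF,        // single-precision operator
       50);            // ReliableUpdateFreq
std::vector<LatticeFermionD> psi(MDPoles.order, DoublePrecGrid);
msCG(HermOpD, src, psi);   // cleanup CG fires per shift only if needed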

View File

@@ -81,6 +81,7 @@ public:
 using OperatorFunction<FieldD>::operator();
 RealD Tolerance;
+Integer MaxIterationsMshift;
 Integer MaxIterations;
 Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
 std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@@ -95,9 +96,9 @@ public:
 ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
 GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
-int _ReliableUpdateFreq
-) :
-MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
+int _ReliableUpdateFreq) :
+MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
+MaxIterations(20000)
 {
 verbose=1;
 IterationsToCompleteShift.resize(_shifts.order);
@@ -247,7 +248,7 @@ public:
 // Iteration loop
 int k;
-for (k=1;k<=MaxIterations;k++){
+for (k=1;k<=MaxIterationsMshift;k++){
 a = c /cp;
 AXPYTimer.Start();
@@ -353,11 +354,16 @@ public:
 }
 }
-if ( all_converged ){
+if ( all_converged || k == MaxIterationsMshift-1){
 SolverTimer.Stop();
+if ( all_converged ){
 std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
 std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
+} else {
+std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
+}
 // Check answers
 for(int s=0; s < nshift; s++) {
@@ -400,11 +406,9 @@ public:
 return;
 }
 }
-// ugly hack
 std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-// assert(0);
+assert(0);
 }
 };

File diff suppressed because it is too large.

View File

@@ -4,11 +4,14 @@ NAMESPACE_BEGIN(Grid);
 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu (0)
-#define CpuSmall (1)
-#define Acc (2)
-#define AccSmall (3)
-#define Shared (4)
-#define SharedSmall (5)
+#define CpuHuge (1)
+#define CpuSmall (2)
+#define Acc (3)
+#define AccHuge (4)
+#define AccSmall (5)
+#define Shared (6)
+#define SharedHuge (7)
+#define SharedSmall (8)
 #undef GRID_MM_VERBOSE
 uint64_t total_shared;
 uint64_t total_device;
@@ -35,12 +38,15 @@ void MemoryManager::PrintBytes(void)
 }
+uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
+uint64_t MemoryManager::HostCacheBytes() { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pointer caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
 uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@@ -170,6 +176,16 @@ void MemoryManager::Init(void)
 }
 }
+str= getenv("GRID_ALLOC_NCACHE_HUGE");
+if ( str ) {
+Nc = atoi(str);
+if ( (Nc>=0) && (Nc < NallocCacheMax)) {
+Ncache[CpuHuge]=Nc;
+Ncache[AccHuge]=Nc;
+Ncache[SharedHuge]=Nc;
+}
+}
 str= getenv("GRID_ALLOC_NCACHE_SMALL");
 if ( str ) {
 Nc = atoi(str);
@@ -190,7 +206,9 @@ void MemoryManager::InitMessage(void) {
 std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
+std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
+std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
+std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
 #endif
 #ifdef GRID_UVM
@@ -222,8 +240,11 @@ void MemoryManager::InitMessage(void) {
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-int cache = type + small;
+int cache;
+if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+else cache = type;
 return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
 #else
 return ptr;
@@ -232,11 +253,12 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
 {
-assert(ncache>0);
 #ifdef GRID_OMP
 assert(omp_in_parallel()==0);
 #endif
+if (ncache == 0) return ptr;
 void * ret = NULL;
 int v = -1;
@@ -271,8 +293,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
-int cache = type+small;
+int cache;
+if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
+else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
+else cache = type;
 return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
 return NULL;
@@ -281,7 +306,6 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
 {
-assert(ncache>0);
 #ifdef GRID_OMP
 assert(omp_in_parallel()==0);
 #endif
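With the three-tier pools, Insert and Lookup route each request by size: below GRID_ALLOC_SMALL_LIMIT it goes to the Small pool (type+2), at or above GRID_ALLOC_HUGE_LIMIT to the Huge pool (type+1), and anything between stays in the base pool. A plain-C++ illustration of that selection arithmetic (hypothetical values, not library code):

// Acc==3, AccHuge==4, AccSmall==5 per the new #defines above.
auto pick = [](size_t bytes, int type) {
  if      (bytes <  GRID_ALLOC_SMALL_LIMIT) return type + 2;
  else if (bytes >= GRID_ALLOC_HUGE_LIMIT)  return type + 1;
  else                                      return type;
};
assert(pick(size_t(4)<<30,  Acc) == AccHuge);   // 4 GiB >= the 2 GiB limit
assert(pick(1024,           Acc) == AccSmall);  // 1 KiB < 4096
assert(pick(size_t(64)<<20, Acc) == Acc);       // 64 MiB stays in the base pool

Note the default Ncache table gives the huge pools zero entries, so they are pass-through until GRID_ALLOC_NCACHE_HUGE is set in the environment.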

View File

@@ -35,6 +35,7 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 #define GRID_ALLOC_SMALL_LIMIT (4096)
+#define GRID_ALLOC_HUGE_LIMIT (2147483648)
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
@@ -70,6 +71,21 @@ enum ViewMode {
 CpuWriteDiscard = 0x10 // same for now
 };
+struct MemoryStatus {
+uint64_t DeviceBytes;
+uint64_t DeviceLRUBytes;
+uint64_t DeviceMaxBytes;
+uint64_t HostToDeviceBytes;
+uint64_t DeviceToHostBytes;
+uint64_t HostToDeviceXfer;
+uint64_t DeviceToHostXfer;
+uint64_t DeviceEvictions;
+uint64_t DeviceDestroy;
+uint64_t DeviceAllocCacheBytes;
+uint64_t HostAllocCacheBytes;
+};
 class MemoryManager {
 private:
@@ -83,7 +99,7 @@ private:
 } AllocationCacheEntry;
 static const int NallocCacheMax=128;
-static const int NallocType=6;
+static const int NallocType=9;
 static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
 static int Victim[NallocType];
 static int Ncache[NallocType];
@@ -122,6 +138,25 @@ private:
 static uint64_t DeviceEvictions;
 static uint64_t DeviceDestroy;
+static uint64_t DeviceCacheBytes();
+static uint64_t HostCacheBytes();
+static MemoryStatus GetFootprint(void) {
+MemoryStatus stat;
+stat.DeviceBytes = DeviceBytes;
+stat.DeviceLRUBytes = DeviceLRUBytes;
+stat.DeviceMaxBytes = DeviceMaxBytes;
+stat.HostToDeviceBytes = HostToDeviceBytes;
+stat.DeviceToHostBytes = DeviceToHostBytes;
+stat.HostToDeviceXfer = HostToDeviceXfer;
+stat.DeviceToHostXfer = DeviceToHostXfer;
+stat.DeviceEvictions = DeviceEvictions;
+stat.DeviceDestroy = DeviceDestroy;
+stat.DeviceAllocCacheBytes = DeviceCacheBytes();
+stat.HostAllocCacheBytes = HostCacheBytes();
+return stat;
+};
 private:
 #ifndef GRID_UVM
 //////////////////////////////////////////////////////////////////////
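GetFootprint packages the existing counters plus the two new cache-byte accessors into one snapshot. A hedged example of reading it (assuming the accessor is reachable from user code):

// Illustrative only: print the allocator footprint at a checkpoint.
MemoryStatus stat = MemoryManager::GetFootprint();
std::cout << GridLogMessage << "Device resident " << stat.DeviceBytes
          << " peak "        << stat.DeviceMaxBytes
          << " alloc-cache " << stat.DeviceAllocCacheBytes
          << " evictions "   << stat.DeviceEvictions << std::endl;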

View File

@@ -400,9 +400,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
-acceleratorCopySynchronise();
-StencilBarrier();// Synch shared memory on a single nodes
 int nreq=list.size();
 if (nreq==0) return;

View File

@@ -37,10 +37,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef GRID_HIP
 #include <hip/hip_runtime_api.h>
 #endif
-#ifdef GRID_SYCl
+#ifdef GRID_SYCL
+#define GRID_SYCL_LEVEL_ZERO_IPC
 #endif
 NAMESPACE_BEGIN(Grid);
 #define header "SharedMemoryMpi: "
 /*Construct from an MPI communicator*/

View File

@@ -297,6 +297,30 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 }
 }
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+template <typename T>
+T iDivUp(T a, T b) // Round a / b to nearest higher integer value
+{ return (a % b != 0) ? (a / b + 1) : (a / b); }
+template <typename T>
+__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
+{
+int idx = blockIdx.x*blockDim.x + threadIdx.x;
+if (idx >= e1*e2) return;
+int n, b, o;
+n = idx / e2;
+b = idx % e2;
+o = n*stride + b;
+vector[2*idx + 0] = lo + o;
+vector[2*idx + 1] = ro + o;
+}
+#endif
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
@@ -321,12 +345,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 int ent=0;
 if(cbmask == 0x3 ){
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+ent = e1*e2;
+dim3 blockSize(acceleratorThreads());
+dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+accelerator_barrier();
+#else
 for(int n=0;n<e1;n++){
 for(int b=0;b<e2;b++){
 int o =n*stride+b;
 Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
 }
 }
+#endif
 } else {
 for(int n=0;n<e1;n++){
 for(int b=0;b<e2;b++){
@@ -377,11 +409,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
 int ent=0;
 if ( cbmask == 0x3 ) {
+#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
+ent = e1*e2;
+dim3 blockSize(acceleratorThreads());
+dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
+populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
+accelerator_barrier();
+#else
 for(int n=0;n<e1;n++){
 for(int b=0;b<e2;b++){
 int o =n*stride;
 Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
 }}
+#endif
 } else {
 for(int n=0;n<e1;n++){
 for(int b=0;b<e2;b++){
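The kernel replaces the host-side double loop one-for-one, so the table is written where it is consumed. A quick host-side check of the index and launch arithmetic, with illustrative numbers rather than anything taken from the diff:

// Sketch in plain C++ of the math the kernel performs (no GPU needed):
int e1 = 16, e2 = 128, stride = 256;
int ent = e1 * e2;                                       // 2048 table entries
int blocks = (ent % 256 != 0) ? ent/256 + 1 : ent/256;   // iDivUp(ent,256) == 8
for (int idx = 0; idx < ent; idx++) {                    // one GPU thread each
  int n = idx / e2, b = idx % e2, o = n * stride + b;
  // kernel writes: vector[2*idx+0] = lo + o; vector[2*idx+1] = ro + o;
}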

View File

@@ -153,33 +153,44 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
 }
 template<class vobj>
-inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
 {
 Integer osites = arg.Grid()->oSites();
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
-typename vobj::scalar_object ssum;
 autoView( arg_v, arg, AcceleratorRead);
-ssum= sum_gpu(&arg_v[0],osites);
+return sum_gpu(&arg_v[0],osites);
 #else
 autoView(arg_v, arg, CpuRead);
-auto ssum= sum_cpu(&arg_v[0],osites);
+return sum_cpu(&arg_v[0],osites);
 #endif
+}
+template<class vobj>
+inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
+{
+auto ssum = rankSum(arg);
 arg.Grid()->GlobalSum(ssum);
 return ssum;
 }
 template<class vobj>
-inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
 {
 #if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
 autoView( arg_v, arg, AcceleratorRead);
 Integer osites = arg.Grid()->oSites();
-auto ssum= sum_gpu_large(&arg_v[0],osites);
+return sum_gpu_large(&arg_v[0],osites);
 #else
 autoView(arg_v, arg, CpuRead);
 Integer osites = arg.Grid()->oSites();
-auto ssum= sum_cpu(&arg_v[0],osites);
+return sum_cpu(&arg_v[0],osites);
 #endif
+}
+template<class vobj>
+inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
+{
+auto ssum = rankSumLarge(arg);
 arg.Grid()->GlobalSum(ssum);
 return ssum;
 }
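Splitting the per-rank reduction from the collective lets callers (GPT, per PR #419) defer or batch the MPI sum. The two paths agree up to reduction-order rounding; a sketch with assumed grid and RNG names:

// Illustrative only:
LatticeComplexD f(grid);        // 'grid' assumed initialised elsewhere
random(pRNG, f);                // 'pRNG' assumed as well
auto partial = rankSum(f);      // this rank's contribution, no communication
grid->GlobalSum(partial);       // explicit collective when the caller chooses
auto total   = sum(f);          // fused equivalent of the two lines above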

View File

@@ -211,25 +211,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
 assert(ok);
 Integer smemSize = numThreads * sizeof(sobj);
-// UVM seems to be buggy under later CUDA drivers
-// This fails on A100 and driver 5.30.02 / CUDA 12.1
-// Fails with multiple NVCC versions back to 11.4,
-// which worked with earlier drivers.
-// Not sure which driver had first fail and this bears checking
-// Is awkward as must install multiple driver versions
+// Move out of UVM
+// Turns out I had messed up the synchronise after move to compute stream
+// as running this on the default stream fools the synchronise
 #undef UVM_BLOCK_BUFFER
 #ifndef UVM_BLOCK_BUFFER
 commVector<sobj> buffer(numBlocks);
 sobj *buffer_v = &buffer[0];
 sobj result;
-reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
+reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
 accelerator_barrier();
 acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
 #else
 Vector<sobj> buffer(numBlocks);
 sobj *buffer_v = &buffer[0];
 sobj result;
-reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
+reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
 accelerator_barrier();
 result = *buffer_v;
 #endif

View File

@@ -440,6 +440,7 @@ public:
 _grid->GlobalCoorToGlobalIndex(gcoor,gidx);
 _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
 assert(rank == _grid->ThisRank() );
+
 int l_idx=generator_idx(o_idx,i_idx);

View File

@@ -288,7 +288,36 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
 blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
 }
 }
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+const std::vector<Lattice<vobj>> &fineData,
+const VLattice &Basis)
+{
+int NBatch = fineData.size();
+assert(coarseData.size() == NBatch);
+GridBase * fine = fineData[0].Grid();
+GridBase * coarse= coarseData[0].Grid();
+Lattice<iScalar<CComplex>> ip(coarse);
+std::vector<Lattice<vobj>> fineDataCopy = fineData;
+autoView(ip_, ip, AcceleratorWrite);
+for(int v=0;v<nbasis;v++) {
+for (int k=0; k<NBatch; k++) {
+autoView( coarseData_ , coarseData[k], AcceleratorWrite);
+blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
+accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
+convertType(coarseData_[sc](v),ip_[sc]);
+});
+// improve numerical stability of projection
+// |fine> = |fine> - <basis|fine> |basis>
+ip=-ip;
+blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
+}
+}
+}
 template<class vobj,class vobj2,class CComplex>
 inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -590,6 +619,26 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
 }
 #endif
+template<class vobj,class CComplex,int nbasis,class VLattice>
+inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
+std::vector<Lattice<vobj>> &fineData,
+const VLattice &Basis)
+{
+int NBatch = coarseData.size();
+assert(fineData.size() == NBatch);
+GridBase * fine = fineData[0].Grid();
+GridBase * coarse = coarseData[0].Grid();
+for (int k=0; k<NBatch; k++)
+fineData[k]=Zero();
+for (int i=0;i<nbasis;i++) {
+for (int k=0; k<NBatch; k++) {
+Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
+blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
+}
+}
+}
 // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
 // Simd layouts need not match since we use peek/poke Local
 template<class vobj,class vvobj>
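A round-trip sketch of the batched coarsening helpers (the typedef, grids, and basis are assumed, not taken from the diff):

// Illustrative only: project NBatch fine fields onto an nbasis coarse basis,
// then reconstruct them.
typedef Lattice<iVector<vTComplex,nbasis>> CoarseVector;  // assumed typedef
std::vector<CoarseVector>    coarseData(NBatch, CoarseGrid);
std::vector<LatticeFermionD> fineData(NBatch, FineGrid);
batchBlockProject(coarseData, fineData, Basis);  // coarse[k](v) = <Basis[v]|fine[k]>
batchBlockPromote(coarseData, fineData, Basis);  // fine[k] = sum_v coarse[k](v)*Basis[v]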

View File

@@ -36,7 +36,7 @@ NAMESPACE_BEGIN(Grid);
 // Wilson compressor will need FaceGather policies for:
 // Periodic, Dirichlet, and partial Dirichlet for DWF
 ///////////////////////////////////////////////////////////////
-const int dwf_compressor_depth=1;
+const int dwf_compressor_depth=2;
 #define DWF_COMPRESS
 class FaceGatherPartialDWF
 {

View File

@@ -463,11 +463,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 if( interior && exterior ) {
 if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
-#ifdef SYCL_HACK
-if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteSycl); return; }
-#else
 if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
-#endif
 #ifndef GRID_CUDA
 if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
 #endif
@@ -478,6 +474,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
 #endif
 } else if( exterior ) {
+acceleratorFenceComputeStream();
 if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
 if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
 #ifndef GRID_CUDA
@@ -502,10 +499,9 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 #ifndef GRID_CUDA
 if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
 #endif
-acceleratorFenceComputeStream();
 } else if( interior ) {
-if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
-if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
+if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
+if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
 #ifndef GRID_CUDA
 if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
 #endif
@@ -516,7 +512,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
 #ifndef GRID_CUDA
 if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
 #endif
-acceleratorFenceComputeStream();
 }
 assert(0 && " Kernel optimisation case not covered ");
 }

View File

@@ -1 +0,0 @@
-../CayleyFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../ContinuedFractionFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../DomainWallEOFAFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../MobiusEOFAFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../PartialFractionFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonCloverFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonTMFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-#define IMPLEMENTATION WilsonImplD2

View File

@@ -1 +0,0 @@
-../CayleyFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../ContinuedFractionFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../DomainWallEOFAFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../MobiusEOFAFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../PartialFractionFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonFermion5DInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master

View File

@@ -1 +0,0 @@
-#define IMPLEMENTATION ZWilsonImplD2

View File

@@ -127,6 +127,8 @@ NAMESPACE_BEGIN(Grid);
 ApproxNegPowerAction.tolerances[i] = action_tolerance[i];
 ApproxHalfPowerAction.tolerances[i] = action_tolerance[i];
 ApproxNegHalfPowerAction.tolerances[i]= action_tolerance[i];
+}
+for(int i=0;i<ApproxPowerMD.tolerances.size();i++){
 ApproxPowerMD.tolerances[i] = md_tolerance[i];
 ApproxNegPowerMD.tolerances[i] = md_tolerance[i];
 ApproxHalfPowerMD.tolerances[i] = md_tolerance[i];

View File

@@ -29,6 +29,8 @@
 #ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
 #define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
+
+#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h>
 NAMESPACE_BEGIN(Grid);
 /////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -58,7 +60,7 @@ NAMESPACE_BEGIN(Grid);
 //Allow derived classes to override the multishift CG
 virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
 #if 0
-SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOp : DenOp);
+SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
 ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
 msCG(schurOp,in, out);
 #else
@@ -67,6 +69,7 @@ NAMESPACE_BEGIN(Grid);
 FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
 FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
+// Action better with higher precision?
 ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 precisionChange(inD2,in);
 std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
@@ -76,12 +79,12 @@ NAMESPACE_BEGIN(Grid);
 }
 virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
 SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
-SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
+SchurDifferentiableOperator<ImplF> schurOpF (numerator ? NumOpF : DenOpF);
 FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
 FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
 std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
-ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
+ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
 precisionChange(inD2,in);
 std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
 msCG(schurOpD2, inD2, out_elemsD2, outD2);

View File

@@ -112,40 +112,27 @@ NAMESPACE_BEGIN(Grid);
 // NumOp == V
 // DenOp == M
 //
-AUDIT();
 FermionField etaOdd (NumOp.FermionRedBlackGrid());
 FermionField etaEven(NumOp.FermionRedBlackGrid());
 FermionField tmp (NumOp.FermionRedBlackGrid());
-AUDIT();
 pickCheckerboard(Even,etaEven,eta);
-AUDIT();
 pickCheckerboard(Odd,etaOdd,eta);
-AUDIT();
 NumOp.ImportGauge(U);
-AUDIT();
 DenOp.ImportGauge(U);
 std::cout << " TwoFlavourRefresh: Imported gauge "<<std::endl;
-AUDIT();
 SchurDifferentiableOperator<Impl> Mpc(DenOp);
-AUDIT();
 SchurDifferentiableOperator<Impl> Vpc(NumOp);
-AUDIT();
 std::cout << " TwoFlavourRefresh: Diff ops "<<std::endl;
-AUDIT();
 // Odd det factors
 Mpc.MpcDag(etaOdd,PhiOdd);
-AUDIT();
 std::cout << " TwoFlavourRefresh: MpcDag "<<std::endl;
 tmp=Zero();
-AUDIT();
 std::cout << " TwoFlavourRefresh: Zero() guess "<<std::endl;
-AUDIT();
 HeatbathSolver(Vpc,PhiOdd,tmp);
-AUDIT();
 std::cout << " TwoFlavourRefresh: Heatbath solver "<<std::endl;
 Vpc.Mpc(tmp,PhiOdd);
 std::cout << " TwoFlavourRefresh: Mpc "<<std::endl;

View File

@@ -134,14 +134,12 @@ protected:
double start_force = usecond();
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] before"<<std::endl;
-AUDIT();
as[level].actions.at(a)->deriv_timer_start();
as[level].actions.at(a)->deriv(Us, force); // deriv should NOT include Ta
as[level].actions.at(a)->deriv_timer_stop();
std::cout << GridLogMessage << "AuditForce["<<level<<"]["<<a<<"] after"<<std::endl;
-AUDIT();
std::cout << GridLogIntegrator << "Smearing (on/off): " << as[level].actions.at(a)->is_smeared << std::endl;
auto name = as[level].actions.at(a)->action_name();
@@ -284,6 +282,15 @@ public:
<< as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl;
}
}
+std::cout << GridLogMessage << "--------------------------- "<<std::endl;
+std::cout << GridLogMessage << " Dslash counts "<<std::endl;
+std::cout << GridLogMessage << "------------------------- "<<std::endl;
+uint64_t full, partial, dirichlet;
+DslashGetCounts(dirichlet,partial,full);
+std::cout << GridLogMessage << " Full BCs : "<<full<<std::endl;
+std::cout << GridLogMessage << " Partial dirichlet BCs : "<<partial<<std::endl;
+std::cout << GridLogMessage << " Dirichlet BCs : "<<dirichlet<<std::endl;
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
std::cout << GridLogMessage << " Force average size "<<std::endl;
std::cout << GridLogMessage << "------------------------- "<<std::endl;
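The printout added above reports, per trajectory, how the halo exchanges were classified: Full BCs communicated in every direction, Dirichlet BCs were cut entirely on a domain boundary, and Partial dirichlet BCs are the intermediate case which, judging by the Partial Dirichlet depth message in the HMC test below, still communicates a thin boundary slab. The counters themselves are defined and incremented in the stencil changes further down.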
@@ -373,12 +380,12 @@ public:
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] before"<<std::endl;
-AUDIT();
as[level].actions.at(actionID)->refresh_timer_start();
as[level].actions.at(actionID)->refresh(Us, sRNG, pRNG);
as[level].actions.at(actionID)->refresh_timer_stop();
std::cout << GridLogMessage << "AuditRefresh["<<level<<"]["<<actionID<<"] after"<<std::endl;
-AUDIT();
}
// Refresh the higher representation actions
@@ -415,7 +422,7 @@ public:
// Actions
for (int level = 0; level < as.size(); ++level) {
for (int actionID = 0; actionID < as[level].actions.size(); ++actionID) {
-AUDIT();
// get gauge field from the SmearingPolicy and
// based on the boolean is_smeared in actionID
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
@@ -425,7 +432,7 @@ public:
as[level].actions.at(actionID)->S_timer_stop();
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
H += Hterm;
-AUDIT();
}
as[level].apply(S_hireps, Representations, level, H);
}
@@ -438,9 +445,9 @@ public:
void operator()(std::vector<Action<FieldType>*> repr_set, Repr& Rep, int level, RealD& H) {
for (int a = 0; a < repr_set.size(); ++a) {
-AUDIT();
RealD Hterm = repr_set.at(a)->Sinitial(Rep.U);
-AUDIT();
std::cout << GridLogMessage << "Sinitial Level " << level << " term " << a << " H Hirep = " << Hterm << std::endl;
H += Hterm;
@@ -465,10 +472,10 @@ public:
Field& Us = Smearer.get_U(as[level].actions.at(actionID)->is_smeared);
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] action eval " << std::endl;
as[level].actions.at(actionID)->S_timer_start();
-AUDIT();
Hterm = as[level].actions.at(actionID)->Sinitial(Us);
as[level].actions.at(actionID)->S_timer_stop();
-AUDIT();
std::cout << GridLogMessage << "S [" << level << "][" << actionID << "] H = " << Hterm << std::endl;
H += Hterm;
}
@@ -481,7 +488,6 @@ public:
void integrate(Field& U)
{
-AUDIT();
// reset the clocks
t_U = 0;
for (int level = 0; level < as.size(); ++level) {
@@ -499,10 +505,8 @@ public:
assert(fabs(t_U - t_P[level]) < 1.0e-6); // must be the same
std::cout << GridLogIntegrator << " times[" << level << "]= " << t_P[level] << " " << t_U << std::endl;
}
-AUDIT();
FieldImplementation::Project(U);
-AUDIT();
// and that we indeed got to the end of the trajectory
assert(fabs(t_U - Params.trajL) < 1.0e-6);

View File

@@ -29,6 +29,27 @@
NAMESPACE_BEGIN(Grid);
+uint64_t DslashFullCount;
+uint64_t DslashPartialCount;
+uint64_t DslashDirichletCount;
+void DslashResetCounts(void)
+{
+  DslashFullCount=0;
+  DslashPartialCount=0;
+  DslashDirichletCount=0;
+}
+void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
+{
+  dirichlet = DslashDirichletCount;
+  partial   = DslashPartialCount;
+  full      = DslashFullCount;
+}
+void DslashLogFull(void)     { DslashFullCount++;}
+void DslashLogPartial(void)  { DslashPartialCount++;}
+void DslashLogDirichlet(void){ DslashDirichletCount++;}
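A minimal usage sketch for the counters defined above (a fragment, assuming the declarations exported in the stencil header below are in scope):

    uint64_t dirichlet, partial, full;
    DslashResetCounts();                       // zero all three tallies
    // ... apply Dhop / run a trajectory ...
    DslashGetCounts(dirichlet, partial, full);
    std::cout << "full=" << full << " partial=" << partial
              << " dirichlet=" << dirichlet << std::endl;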
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
                                 int off,std::vector<std::pair<int,int> > & table)
{

View File

@@ -120,6 +120,12 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
}
*/
+void DslashResetCounts(void);
+void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
+void DslashLogFull(void);
+void DslashLogPartial(void);
+void DslashLogDirichlet(void);
struct StencilEntry {
#ifdef GRID_CUDA
uint64_t _byte_offset; // 8 bytes
@@ -312,6 +318,7 @@ public:
int face_table_computed;
int partialDirichlet;
+int fullDirichlet;
std::vector<commVector<std::pair<int,int> > > face_table ;
Vector<int> surface_list;
@@ -427,7 +434,6 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
-accelerator_barrier();
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
@@ -436,12 +442,17 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
+_grid->StencilBarrier();// Synch shared memory on a single nodes
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
_grid->StencilSendToRecvFromComplete(MpiReqs,0);
+if      ( this->partialDirichlet ) DslashLogPartial();
+else if ( this->fullDirichlet    ) DslashLogDirichlet();
+else DslashLogFull();
+acceleratorCopySynchronise();
+// Everyone agrees we are all done
+_grid->StencilBarrier();
}
////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel.
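The ordering in CommunicateComplete is deliberate: the MPI requests are drained first, the call is classified for the Dslash counters, acceleratorCopySynchronise() then waits for the asynchronous device copies, and only after that does StencilBarrier() let node-local peers read the shared-memory buffers. CommsMergeSHM in the next hunk is correspondingly reduced to assertions, since nothing remains to merge once the barrier has moved here.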
@@ -519,7 +530,6 @@ public:
{
_grid->StencilBarrier();// Synch shared memory on a single nodes
-// conformable(source.Grid(),_grid);
assert(source.Grid()==_grid);
u_comm_offset=0;
@@ -645,8 +655,8 @@ public:
CommsMerge(decompress,Mergers,Decompressions);
}
template<class decompressor> void CommsMergeSHM(decompressor decompress) {
-_grid->StencilBarrier();// Synch shared memory on a single nodes
-CommsMerge(decompress,MergersSHM,DecompressionsSHM);
+assert(MergersSHM.size()==0);
+assert(DecompressionsSHM.size()==0);
}
template<class decompressor>
@@ -655,9 +665,11 @@ public:
for(int i=0;i<mm.size();i++){
decompressor::MergeFace(decompress,mm[i]);
}
+if ( mm.size() ) acceleratorFenceComputeStream();
for(int i=0;i<dd.size();i++){
decompressor::DecompressFace(decompress,dd[i]);
}
+if ( dd.size() ) acceleratorFenceComputeStream();
}
////////////////////////////////////////
// Set up routines
@@ -770,6 +782,10 @@ public:
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
partialDirichlet = p.partialDirichlet;
DirichletBlock(p.dirichlet); // comms send/recv set up
+fullDirichlet=0;
+for(int d=0;d<p.dirichlet.size();d++){
+  if (p.dirichlet[d]) fullDirichlet=1;
+}
_unified_buffer_size=0;
surface_list.resize(0);
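fullDirichlet simply records whether any dimension carries a nonzero Dirichlet block; together with partialDirichlet it drives the three-way classification (partial, Dirichlet, full) in CommunicateComplete above.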

View File

@@ -167,14 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
return;
}
-void GridCmdOptionFloat(std::string &str,float & val)
+void GridCmdOptionFloat(std::string &str,double & val)
{
std::stringstream ss(str);
ss>>val;
return;
}
void GridParseLayout(char **argv,int argc,
                     Coordinate &latt_c,
                     Coordinate &mpi_c)
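The only change in GridCmdOptionFloat is the widened out-parameter: callers now receive a double. A small usage sketch with a hypothetical option string:

    std::string arg("0.125");
    double val;                     // was float before this change
    GridCmdOptionFloat(arg, val);   // parses via std::stringstream, so val == 0.125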

View File

@@ -57,7 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
template<class VectorInt>
void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
void GridCmdOptionInt(std::string &str,int & val);
-void GridCmdOptionFloat(std::string &str,float & val);
+void GridCmdOptionFloat(std::string &str,double & val);
void GridParseLayout(char **argv,int argc,

View File

@@ -164,11 +164,6 @@ int main(int argc, char **argv) {
typedef MobiusEOFAFermionF FermionEOFAActionF;
typedef typename FermionActionF::FermionField FermionFieldF;
-typedef WilsonImplD2 FermionImplPolicyD2;
-typedef MobiusFermionD2 FermionActionD2;
-typedef MobiusEOFAFermionD2 FermionEOFAActionD2;
-typedef typename FermionActionD2::FermionField FermionFieldD2;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
@@ -232,31 +227,34 @@ int main(int argc, char **argv) {
// std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
-OneFlavourRationalParams OFRp; // Up/down
-OFRp.lo = 4.0e-5;
+int SP_iters=10000;
+RationalActionParams OFRp; // Up/down
+OFRp.lo = 6.0e-5;
OFRp.hi = 90.0;
-OFRp.MaxIter = 60000;
-OFRp.tolerance= 1.0e-5;
-OFRp.mdtolerance= 1.0e-3;
+OFRp.inv_pow = 2;
+OFRp.MaxIter = SP_iters; // get most shifts by 2000, stop sharing space
+OFRp.action_tolerance= 1.0e-8;
+OFRp.action_degree = 18;
+OFRp.md_tolerance= 1.0e-5;
+OFRp.md_degree = 14;
// OFRp.degree = 20; converges
// OFRp.degree = 16;
-OFRp.degree = 18;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
std::vector<RealD> ActionTolByPole({
-1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+1.0e-7,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8
});
std::vector<RealD> MDTolByPole({
-1.0e-5,5.0e-6,1.0e-6,1.0e-7, // soften convergence more more
+1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
// 1.0e-6,3.0e-7,1.0e-7,1.0e-7,
// 3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8
});
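These per-pole vectors give every shift of the rational approximation its own stopping condition, loosest on the first (lightest) pole and tight elsewhere; this is the pole-dependent residual feature marked DONE in the TODO file below.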
@@ -265,10 +263,8 @@ int main(int argc, char **argv) {
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
-typedef SchurDiagMooeeOperator<FermionActionD2,FermionFieldD2 > LinearOperatorD2;
typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
-typedef SchurDiagMooeeOperator<FermionEOFAActionD2,FermionFieldD2 > LinearOperatorEOFAD2;
typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
@@ -321,7 +317,6 @@ int main(int argc, char **argv) {
// temporarily need a gauge field
LatticeGaugeFieldD U(GridPtr); U=Zero();
LatticeGaugeFieldF UF(GridPtrF); UF=Zero();
-LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero();
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
@@ -340,6 +335,7 @@ int main(int argc, char **argv) {
ParamsDirF.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=1;
ParamsDirF.partialDirichlet=1;
+std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@@ -424,7 +420,7 @@ int main(int argc, char **argv) {
ActionCGL, ActionCGR,
DerivativeCGL, DerivativeCGR,
SFRp, true);
-// Level2.push_back(&EOFA);
+Level2.push_back(&EOFA);
////////////////////////////////////
// up down action
@@ -449,17 +445,15 @@ int main(int argc, char **argv) {
std::vector<FermionAction *> Denominators;
std::vector<FermionActionF *> NumeratorsF;
std::vector<FermionActionF *> DenominatorsF;
-std::vector<FermionActionD2 *> NumeratorsD2;
-std::vector<FermionActionD2 *> DenominatorsD2;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
std::vector<MxPCG *> ActionMPCG;
std::vector<MxPCG *> MPCG;
#define MIXED_PRECISION
#ifdef MIXED_PRECISION
-std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
+std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy> *> Bdys;
#else
-std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
+std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
#endif
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
@@ -532,31 +526,19 @@ int main(int argc, char **argv) {
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
} else {
#ifdef MIXED_PRECISION
-// Use the D2 data types and make them use same grid as single
-FermionActionD2::ImplParams ParamsDenD2(boundary);
-FermionActionD2::ImplParams ParamsNumD2(boundary);
-ParamsDenD2.dirichlet = ParamsDen.dirichlet;
-ParamsDenD2.partialDirichlet = ParamsDen.partialDirichlet;
-DenominatorsD2.push_back(new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenD2));
-ParamsNumD2.dirichlet = ParamsNum.dirichlet;
-ParamsNumD2.partialDirichlet = ParamsNum.partialDirichlet;
-NumeratorsD2.push_back (new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumD2));
-Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
-*Numerators[h],*Denominators[h],
-*NumeratorsF[h],*DenominatorsF[h],
-*NumeratorsD2[h],*DenominatorsD2[h],
-OFRp, 400) );
-Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
-*Numerators[h],*Denominators[h],
-*NumeratorsF[h],*DenominatorsF[h],
-*NumeratorsD2[h],*DenominatorsD2[h],
-OFRp, 400) );
+Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>(
+*Numerators[h],*Denominators[h],
+*NumeratorsF[h],*DenominatorsF[h],
+*Numerators[h],*Denominators[h],
+OFRp, SP_iters) );
+Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>(
+*Numerators[h],*Denominators[h],
+*NumeratorsF[h],*DenominatorsF[h],
+*Numerators[h],*Denominators[h],
+OFRp, SP_iters) );
#else
-Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
-Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
+Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
#endif
}
}
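Net effect of this hunk: the separate D2 fermion operators that lived on the single-precision grids are gone, and the General rational action is constructed with the double-precision operators passed in both the double and intermediate slots, so one set of objects now serves both the action and force paths, with SP_iters bounding the single-precision iteration count.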

View File

@@ -183,7 +183,7 @@ int main(int argc, char **argv) {
// 4/2 => 0.6 dH
// 3/3 => 0.8 dH .. depth 3, slower
//MD.MDsteps = 4;
-MD.MDsteps = 3;
+MD.MDsteps = 12;
MD.trajL = 0.5;
HMCparameters HMCparams;
@@ -200,8 +200,8 @@ int main(int argc, char **argv) {
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
-CPparams.config_prefix = "ckpoint_DDHMC_lat";
-CPparams.rng_prefix = "ckpoint_DDHMC_rng";
+CPparams.config_prefix = "ckpoint_HMC_lat";
+CPparams.rng_prefix = "ckpoint_HMC_rng";
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
@@ -228,7 +228,7 @@ int main(int argc, char **argv) {
Real pv_mass = 1.0;
// std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
-std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
+std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
auto GridPtr = TheHMC.Resources.GetCartesian();
@@ -299,8 +299,8 @@ int main(int argc, char **argv) {
////////////////////////////////////
// Collect actions
////////////////////////////////////
-ActionLevel<HMCWrapper::Field> Level1(1);
-ActionLevel<HMCWrapper::Field> Level2(3);
+// ActionLevel<HMCWrapper::Field> Level1(1);
+ActionLevel<HMCWrapper::Field> Level2(1);
ActionLevel<HMCWrapper::Field> Level3(15);
////////////////////////////////////
@@ -369,7 +369,7 @@ int main(int argc, char **argv) {
ActionCGL, ActionCGR,
DerivativeCGL, DerivativeCGR,
SFRp, true);
-// Level2.push_back(&EOFA);
+Level2.push_back(&EOFA);
////////////////////////////////////
// up down action
@@ -477,7 +477,7 @@ int main(int argc, char **argv) {
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
-TheHMC.TheAction.push_back(Level1);
+// TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;
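Taken together, this file moves from the three-level DDHMC setup (checkpoint prefixes ckpoint_DDHMC_*) to a plain two-level HMC: Level1 is commented out, the fermion actions sit on Level2 with multiplier 1 and the EOFA term enabled, the gauge action stays on Level3 with multiplier 15, and MDsteps rises from 3 to 12, presumably to keep dH under control with the flattened level structure.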

TODO
View File

@@ -1,3 +1,12 @@
+- Slice sum optimisation & A2A - atomic addition
+- Also faster non-atomic reduction
+- Remaining PRs
+- DDHMC
+- MixedPrec is the action eval, high precision
+- MixedPrecCleanup is the force eval, low precision
+=================
=================
Lattice_basis.h -- > HIP and SYCL GPU code
@@ -8,6 +17,7 @@ DDHMC
-- Multishift Mixed Precision - DONE
-- Pole dependent residual - DONE
=======
-- comms threads issue??
-- Part done: Staggered kernel performance on GPU

View File

@@ -0,0 +1,387 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
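// Flop-count check: 8*Nc*(7+16*Nc) flops per site for the Wilson dslash;
// for Nc=3 this is 8*3*(7+48) = 24*55 = 1320 flops per site, the standard figure.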
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
LatticeFermionF src1 (FGrid); random(RNG5,src1);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
LatticeFermionF result(FGrid); result=Zero();
LatticeFermionF ref(FGrid); ref=Zero();
LatticeFermionF tmp(FGrid);
LatticeFermionF err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrixF ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
// LatticeGaugeFieldF Umu5d(FGrid);
std::vector<LatticeColourMatrixF> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplexF size is "<<sizeof(vComplexF)<< " B"<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =100;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src1,result,0);
Dw.Dhop(src,result,0);
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
assert (norm2(err)< 1.0e-4 );
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
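// Units note: t1-t0 is in microseconds, so flops/(t1-t0) is already Mflop/s,
// and the 1000000. factor on the GiB/s lines converts microseconds back to seconds.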
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]);
}
}
}
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i];
}
}
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermionF src_e (FrbGrid);
LatticeFermionF src_o (FrbGrid);
LatticeFermionF r_e (FrbGrid);
LatticeFermionF r_o (FrbGrid);
LatticeFermionF r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplexF::Nsimd()<<std::endl;
if ( sizeof(RealF)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(RealF)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

View File

@@ -1,12 +1,13 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
---enable-unified=no \
+--enable-unified=yes \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--enable-simd=GPU \
+--disable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \

View File

@@ -4,7 +4,7 @@
#SBATCH -p QZ1J-ICX-PVC
##SBATCH -p QZ1J-SPR-PVC-2C
-source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=8

View File

@@ -4,7 +4,7 @@
#SBATCH -p QZ1J-ICX-PVC
-source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
+#source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=16
@@ -19,16 +19,15 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
-export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
+#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
-export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
-for i in 0
+for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
do
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
+mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 1.1.1.2.log$i
-mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
+mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 0 --device-mem 32768 > 2.1.1.1.log$i
done
-#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
-#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log
+mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0

View File

@@ -5,10 +5,5 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
echo Ranke $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
-#if [ $MPI_LOCALRANKID = "0" ]
-#then
-# ~psteinbr/build_pti/ze_tracer -c $@
-# onetrace --chrome-kernel-timeline $@
-#else
$@
-#fi

View File

@@ -14,4 +14,3 @@ INSTALL=/nfs/site/home/paboylx/prereqs/
LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"

View File

@@ -73,12 +73,12 @@ int main (int argc, char ** argv)
RealD M5 =1.8;
std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
-std::cout<<GridLogMessage <<"DomainWallFermion vectorised test"<<std::endl;
+std::cout<<GridLogMessage <<"DomainWallFermion test"<<std::endl;
std::cout<<GridLogMessage<<"**************************************************************"<<std::endl;
std::vector<Complex> boundary = {1,1,1,-1};
DomainWallFermionD::ImplParams Params(boundary);
-Coordinate Dirichlet({0,8,8,16,32});
-Params.dirichlet=Dirichlet;
+// Coordinate Dirichlet({0,8,8,16,32});
+// Params.dirichlet=Dirichlet;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params);
TestWhat<DomainWallFermionD>(Ddwf,FGrid,FrbGrid,UGrid,mass,M5,&RNG4,&RNG5);

tests/forces/Test_bdy.cc Normal file
View File

@@ -0,0 +1,305 @@
/*
2f Full det MdagM 10^6 force ~ 1.3e7
Grid : Message : 1767.283471 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 1767.283476 s : S1 : 1.52885e+09
Grid : Message : 1767.283480 s : S2 : 1.52886e+09
Grid : Message : 1767.283482 s : dS : 8877.34
Grid : Message : 1767.283483 s : dSpred : 8877.7
Grid : Message : 1767.283484 s : diff : -0.360484
Grid : Message : 1767.283485 s : *********************************************************
2f Full det MpcdagMpc 10^6 force ~ 1.8e6
Grid : Message : 2399.576962 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 2399.576968 s : S1 : 1.52885e+09
Grid : Message : 2399.576972 s : S2 : 1.52886e+09
Grid : Message : 2399.576974 s : dS : 9728.49
Grid : Message : 2399.576975 s : dSpred : 9726.58
Grid : Message : 2399.576976 s : diff : 1.90683
Grid : Message : 2399.576977 s : *********************************************************
2f bdy MdagM 1500 force Force ~ 2800
Grid : Message : 4622.385061 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 4622.385067 s : S1 : 1.52885e+09
Grid : Message : 4622.385071 s : S2 : 1.52885e+09
Grid : Message : 4622.385072 s : dS : 25.4944
Grid : Message : 4622.385073 s : dSpred : 25.4672
Grid : Message : 4622.385074 s : diff : 0.0271414
Grid : Message : 4622.385075 s : *********************************************************
2f bdy MpcdagMpc 10^6 force Force ~ 2200
Grid : Message : 4622.385061 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 4622.385067 s : S1 : 1.52885e+09
Grid : Message : 4622.385071 s : S2 : 1.52885e+09
Grid : Message : 4622.385072 s : dS : 25.4944
Grid : Message : 4622.385073 s : dSpred : 25.4672
Grid : Message : 4622.385074 s : diff : 0.0271414
Grid : Message : 4622.385075 s : *********************************************************
1f Bdy Det
Optimisation log: looser rational AND MD tolerances sloppy
MobiusForce.221179 -- same as HMC. dS is mispredicted Force ~2.8
Grid : Message : 6582.258991 s : dS : 0.024478
Grid : Message : 6582.258992 s : dSpred : 0.00791876
Grid : Message : 6582.258994 s : diff : 0.0165592
MobiusForce.221193 -- tight rational AND MD tolerances to 1e-8 ~ 2.8 same
Grid : Message : 1964.939209 s : S1 : 7.64404e+08
Grid : Message : 1964.939213 s : S2 : 7.64404e+08
Grid : Message : 1964.939215 s : dS : -0.00775838 <--- too loose even on action
Grid : Message : 1964.939216 s : dSpred : -0.00416793
Grid : Message : 1964.939217 s : diff : -0.00359045
MobiusForce.221394 -- looser rational, MD tol 1e-8 ~ 2.8 same
Grid : Message : 1198.346720 s : S1 : 764404649.48886
Grid : Message : 1198.346760 s : S2 : 764404649.5133
Grid : Message : 1198.346780 s : dS : 0.024440884590149
Grid : Message : 1198.346800 s : dSpred : 0.0079145154465184
Grid : Message : 1198.346810 s : diff : 0.016526369143631
MobiusForce.221394 -- tight rational, MD tol sloppy Force ~ 2.8
Grid : Message : 2376.921950 s : S1 : 764404436.44069
Grid : Message : 2376.921954 s : S2 : 764404436.43299
Grid : Message : 2376.921956 s : dS : -0.0076971054077148
Grid : Message : 2376.921958 s : dSpred : -0.0041610472282526
Grid : Message : 2376.921959 s : diff : -0.0035360581794623
*/
//
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_double_ratio.cc
Copyright (C) 2022
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
typedef MobiusFermionD FermionAction;
typedef WilsonImplD FimplD;
typedef WilsonImplD FermionImplPolicy;
template<class Gimpl>
void ForceTest(Action<LatticeGaugeField> &action,LatticeGaugeField & U,MomentumFilterBase<LatticeGaugeField> &Filter)
{
GridBase *UGrid = U.Grid();
std::vector<int> seeds({1,2,3,5});
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds);
LatticeColourMatrix Pmu(UGrid);
LatticeGaugeField P(UGrid);
LatticeGaugeField UdSdU(UGrid);
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
std::cout << GridLogMessage << " Force test for "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
RealD eps=0.005;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Refresh "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
Gimpl::generate_momenta(P,sRNG,RNG4);
Filter.applyFilter(P);
#if 0
FieldMetaData header;
std::string file("./ckpoint_lat.2000");
NerscIO::readConfiguration(U,header,file);
#else
U = 1.0;
#endif
action.refresh(U,sRNG,RNG4);
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
RealD S1 = action.S(U);
Gimpl::update_field(P,U,eps);
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Derivative "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
action.deriv(U,UdSdU);
UdSdU = Ta(UdSdU);
Filter.applyFilter(UdSdU);
DumpSliceNorm("Force",UdSdU,Nd-1);
Gimpl::update_field(P,U,eps);
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
RealD S2 = action.S(U);
// Use the derivative
LatticeComplex dS(UGrid); dS = Zero();
for(int mu=0;mu<Nd;mu++){
auto UdSdUmu = PeekIndex<LorentzIndex>(UdSdU,mu);
Pmu= PeekIndex<LorentzIndex>(P,mu);
dS = dS - trace(Pmu*UdSdUmu)*eps*2.0*2.0;
}
ComplexD dSpred = sum(dS);
RealD diff = S2-S1-dSpred.real();
std::cout<< GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout<< GridLogMessage << "S1 : "<< S1 <<std::endl;
std::cout<< GridLogMessage << "S2 : "<< S2 <<std::endl;
std::cout<< GridLogMessage << "dS : "<< S2-S1 <<std::endl;
std::cout<< GridLogMessage << "dSpred : "<< dSpred.real() <<std::endl;
std::cout<< GridLogMessage << "diff : "<< diff<<std::endl;
std::cout<< GridLogMessage << "*********************************************************"<<std::endl;
// assert(diff<1.0);
std::cout<< GridLogMessage << "Done" <<std::endl;
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
}
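// Structure of the test above: U is moved by two update_field(P,U,eps) steps
// with the force evaluated in between, so dSpred compares S2-S1 against the
// first-order term sum_mu -trace(Pmu*UdSdUmu)*eps*2.0*2.0. One factor of 2 is
// the total step 2*eps; the second is taken here to come from Grid's momentum
// and trace normalisation conventions (an assumption, not derived in this file).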
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::cout << std::setprecision(14);
Coordinate latt_size = GridDefaultLatt();
Coordinate mpi_layout = GridDefaultMpi();
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi_layout,shm);
const int Ls=12;
const int Nt = latt_size[3];
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
////////////////////////////////////////////////////////////////
// Domain decomposed operator
////////////////////////////////////////////////////////////////
Coordinate CommDim(Nd);
for(int d=0;d<Nd;d++) CommDim[d]= (mpi_layout[d]/shm[d])>1 ? 1 : 0;
Coordinate NonDirichlet(Nd+1,0);
Coordinate Dirichlet(Nd+1,0);
Dirichlet[1] = CommDim[0]*latt_size[0]/mpi_layout[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt_size[1]/mpi_layout[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt_size[2]/mpi_layout[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt_size[3]/mpi_layout[3] * shm[3];
Coordinate Block4(Nd);
Block4[0] = Dirichlet[1];
Block4[1] = Dirichlet[2];
Block4[2] = Dirichlet[3];
Block4[3] = Dirichlet[4];
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
FermionAction::ImplParams ParamsDir(boundary);
Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=1;
///////////////////// Gauge Field and Gauge Forces ////////////////////////////
LatticeGaugeField U(UGrid);
RealD beta=6.0;
WilsonGaugeActionR PlaqAction(beta);
IwasakiGaugeActionR RectAction(beta);
MomentumFilterNone<LatticeGaugeField> FilterNone;
ForceTest<GimplTypesR>(PlaqAction,U,FilterNone);
ForceTest<GimplTypesR>(RectAction,U,FilterNone);
////////////////////////////////////
// Action
////////////////////////////////////
RealD mass=0.00078;
RealD pvmass=1.0;
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;
// Double versions
FermionAction DdwfPeriodic(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,Params);
FermionAction PVPeriodic (U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pvmass,M5,b,c,Params);
FermionAction DdwfDirichlet(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,ParamsDir);
double StoppingCondition = 1.0e-8;
double MaxCGIterations = 50000;
ConjugateGradient<LatticeFermion> CG(StoppingCondition,MaxCGIterations);
//////////////////// Two Flavour Determinant Ratio ///////////////////////////////
TwoFlavourRatioPseudoFermionAction<FimplD> Nf2(PVPeriodic, DdwfPeriodic,CG,CG);
// ForceTest<GimplTypesR>(Nf2,U,FilterNone);
//////////////////// Two Flavour Determinant force test Even Odd ///////////////////////////////
TwoFlavourEvenOddRatioPseudoFermionAction<FimplD> Nf2eo(PVPeriodic, DdwfPeriodic,CG,CG);
// ForceTest<GimplTypesR>(Nf2eo,U,FilterNone);
//////////////////// Domain forces ////////////////////
int Width=4;
DDHMCFilter<WilsonImplD::Field> DDHMCFilter(Block4,Width);
//////////////////// Two flavour boundary det ////////////////////
TwoFlavourRatioPseudoFermionAction<FimplD> BdyNf2(DdwfDirichlet, DdwfPeriodic,CG,CG);
// ForceTest<GimplTypesR>(BdyNf2,U,DDHMCFilter);
//////////////////// Two flavour eo boundary det ////////////////////
TwoFlavourEvenOddRatioPseudoFermionAction<FimplD> BdyNf2eo(DdwfDirichlet, DdwfPeriodic,CG,CG);
// ForceTest<GimplTypesR>(BdyNf2eo,U,DDHMCFilter);
//////////////////// One flavour boundary det ////////////////////
OneFlavourRationalParams OFRp; // Up/down
OFRp.lo = 4.0e-5;
OFRp.hi = 90.0;
OFRp.MaxIter = 60000;
OFRp.tolerance= 1.0e-8;
OFRp.mdtolerance= 1.0e-6;
OFRp.degree = 18;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
std::vector<RealD> ActionTolByPole({
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8
});
std::vector<RealD> MDTolByPole({
1.0e-6,3.0e-7,1.0e-7,1.0e-7, // Orig sloppy
// 1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8
});
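// One stopping condition per pole of the degree-18 rational approximation.
// The smallest shifts (leading entries) are the slowest to converge in the
// multishift solver, so the MD list is deliberately sloppier there than the
// action list, which keeps 1.0e-8 on every pole.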
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> BdySqrt(DdwfDirichlet,DdwfPeriodic,OFRp);
ForceTest<GimplTypesR>(BdySqrt,U,DDHMCFilter);
Grid_finalize();
}


@@ -476,6 +476,20 @@ int main (int argc, char ** argv)
   //  ForceTest<GimplTypesR>(BdyNf2eo,U,DDHMCFilter);
   //////////////////// One flavour boundary det ////////////////////
+  RationalActionParams OFRp; // Up/down
+  OFRp.lo       = 6.0e-5;
+  OFRp.hi       = 90.0;
+  OFRp.inv_pow  = 2;
+  OFRp.MaxIter  = SP_iters; // get most shifts by 2000, stop sharing space
+  OFRp.action_tolerance= 1.0e-8;
+  OFRp.action_degree   = 18;
+  OFRp.md_tolerance= 1.0e-5;
+  OFRp.md_degree   = 14;
+  //  OFRp.degree   = 20; converges
+  //  OFRp.degree   = 16;
+  OFRp.precision= 80;
+  OFRp.BoundsCheckFreq=0;
+  /*
   OneFlavourRationalParams OFRp; // Up/down
   OFRp.lo       = 4.0e-5;
   OFRp.hi       = 90.0;
@@ -485,6 +499,23 @@ int main (int argc, char ** argv)
   OFRp.degree   = 18;
   OFRp.precision= 80;
   OFRp.BoundsCheckFreq=0;
+  */
+  std::vector<RealD> ActionTolByPole({
+    1.0e-7,1.0e-8,1.0e-8,1.0e-8,
+    1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+    1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+    1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+    1.0e-8,1.0e-8
+  });
+  std::vector<RealD> MDTolByPole({
+    1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more more
+    //    1.0e-6,3.0e-7,1.0e-7,1.0e-7,
+    //    3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence
+    1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+    1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+    1.0e-8,1.0e-8
+  });
+  /*
   std::vector<RealD> ActionTolByPole({
     1.0e-8,1.0e-8,1.0e-8,1.0e-8,
     1.0e-8,1.0e-8,1.0e-8,1.0e-8,
@@ -499,9 +530,9 @@ int main (int argc, char ** argv)
     //    1.0e-8,1.0e-8,1.0e-8,1.0e-8,
     1.0e-8,1.0e-8,1.0e-8,1.0e-8,
     1.0e-8,1.0e-8,1.0e-8,1.0e-8,
+    1.0e-8,1.0e-8,1.0e-8,1.0e-8,
     1.0e-8,1.0e-8
   });
+  */
   OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> BdySqrt(DdwfDirichlet,DdwfPeriodic,OFRp);
   BdySqrt.SetTolerances(ActionTolByPole,MDTolByPole);
   ForceTest<GimplTypesR>(BdySqrt,U,DDHMCFilter);


@@ -0,0 +1,73 @@
#Example script
DIR=/gpfs/alpine/phy157/proj-shared/phy157dwf/chulwoo/Grid/BL/build/tests/lanczos
BIN=${DIR}/Test_dwf_block_lanczos
VOL='--grid 16.16.16.32 '
GRID='--mpi 1.1.1.4 '
CONF='--gconf ckpoint_lat.IEEE64BIG.2000 '
OPT='--mass 0.01 --M5 1.8 --phase in.params --omega in.params --shm 4096'
#BL='--rbl 16.1024.128.1000.10 --split 1.1.4.4 --check_int 100 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51'
BL='--rbl 4.128.16.100.10 --split 1.1.1.4 --check_int 25 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51'
ARGS=${CONF}" "${OPT}" "${BL}" "${VOL}" "${GRID}
export APP="${BIN} ${ARGS}"
echo APP=${APP}
#export JS="jsrun --nrs 32 -a4 -g4 -c42 -dpacked -b packed:7 --smpiargs="-gpu" "
export JS="jsrun --nrs 1 -a4 -g4 -c42 -dpacked -b packed:10 --smpiargs="-gpu" "
$JS $APP
#sample in.param
boundary_phase 0 1 0
boundary_phase 1 1 0
boundary_phase 2 1 0
boundary_phase 3 -1 0
omega 0 0.5 0
omega 1 0.5 0
omega 2 0.5 0
omega 3 0.5 0
omega 4 0.5 0
omega 5 0.5 0
omega 6 0.5 0
omega 7 0.5 0
omega 8 0.5 0
omega 9 0.5 0
omega 10 0.5 0
omega 11 0.5 0
#output
Grid : Message : 1.717474 s : Gauge Configuration ckpoint_lat.IEEE64BIG.2000
Grid : Message : 1.717478 s : boundary_phase[0] = (1,0)
Grid : Message : 1.717497 s : boundary_phase[1] = (1,0)
Grid : Message : 1.717500 s : boundary_phase[2] = (1,0)
Grid : Message : 1.717503 s : boundary_phase[3] = (-1,0)
Grid : Message : 1.717506 s : Ls 12
Grid : Message : 1.717507 s : mass 0.01
Grid : Message : 1.717510 s : M5 1.8
Grid : Message : 1.717512 s : mob_b 1.5
Grid : Message : 1.717514 s : omega[0] = (0.5,0)
Grid : Message : 1.717517 s : omega[1] = (0.5,0)
Grid : Message : 1.717520 s : omega[2] = (0.5,0)
Grid : Message : 1.717523 s : omega[3] = (0.5,0)
Grid : Message : 1.717526 s : omega[4] = (0.5,0)
Grid : Message : 1.717529 s : omega[5] = (0.5,0)
Grid : Message : 1.717532 s : omega[6] = (0.5,0)
Grid : Message : 1.717535 s : omega[7] = (0.5,0)
Grid : Message : 1.717538 s : omega[8] = (0.5,0)
Grid : Message : 1.717541 s : omega[9] = (0.5,0)
Grid : Message : 1.717544 s : omega[10] = (0.5,0)
Grid : Message : 1.717547 s : omega[11] = (0.5,0)
Grid : Message : 1.717550 s : Nu 4
Grid : Message : 1.717551 s : Nk 128
Grid : Message : 1.717552 s : Np 16
Grid : Message : 1.717553 s : Nm 288
Grid : Message : 1.717554 s : Nstop 100
Grid : Message : 1.717555 s : Ntest 25
Grid : Message : 1.717557 s : MaxIter 10
Grid : Message : 1.717558 s : resid 1e-05
Grid : Message : 1.717560 s : Cheby Poly 0.007,7,51


@@ -0,0 +1,410 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_block_lanczos.cc
Copyright (C) 2022
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Yong-Chull Jang <ypj@quark.phy.bnl.gov>
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/util/Init.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
using namespace std;
using namespace Grid;
//using namespace Grid::QCD;
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
typedef typename ZMobiusFermionF::FermionField FermionField;
RealD AllZero(RealD x){ return 0.;}
class CmdJobParams
{
public:
std::string gaugefile;
int Ls;
double mass;
double M5;
double mob_b;
std::vector<ComplexD> omega;
std::vector<Complex> boundary_phase;
std::vector<int> mpi_split;
LanczosType Impl;
int Nu;
int Nk;
int Np;
int Nm;
int Nstop;
int Ntest;
int MaxIter;
double resid;
double low;
double high;
int order;
CmdJobParams()
: gaugefile("Hot"),
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
Impl(LanczosType::irbl),mpi_split(4,1),
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
low(0.2), high(5.5), order(11)
{Nm=Nk+Np;};
void Parse(char **argv, int argc);
};
void CmdJobParams::Parse(char **argv,int argc)
{
std::string arg;
std::vector<int> vi;
double re,im;
int expect, idx;
std::string vstr;
std::ifstream pfile;
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
}
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
pfile.open(arg);
assert(pfile);
expect = 0;
while( pfile >> vstr ) {
if ( vstr.compare("boundary_phase") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(expect==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
boundary_phase.push_back({re,im});
expect++;
}
}
pfile.close();
} else {
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
}
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
pfile.open(arg);
assert(pfile);
Ls = 0;
while( pfile >> vstr ) {
if ( vstr.compare("omega") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(Ls==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
omega.push_back({re,im});
Ls++;
}
}
pfile.close();
} else {
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
GridCmdOptionInt(arg,Ls);
}
}
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
GridCmdOptionFloat(arg,mass);
}
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
GridCmdOptionFloat(arg,M5);
}
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
GridCmdOptionFloat(arg,mob_b);
}
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2];
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::irbl;
Nm = Nk+Np;
}
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2]; // vector space is enlarged by adding Np vectors
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::rbl;
Nm = Nk+Np*MaxIter;
}
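// Worked example, matching the job script above: --rbl 4.128.16.100.10 sets
// Nu=4 (block width), Nk=128, Np=16, Nstop=100, MaxIter=10, and since each
// restart enlarges the space by Np vectors, Nm = Nk + Np*MaxIter
// = 128 + 16*10 = 288 -- exactly the "Nm 288" in the sample output.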
#if 1
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
GridCmdOptionIntVector(arg,vi);
for(int i=0;i<mpi_split.size();i++)
mpi_split[i] = vi[i];
}
#endif
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
GridCmdOptionInt(arg,Ntest);
}
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
GridCmdOptionFloat(arg,resid);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
GridCmdOptionFloat(arg,low);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
GridCmdOptionFloat(arg,high);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
GridCmdOptionInt(arg,order);
}
if ( CartesianCommunicator::RankWorld() == 0 ) {
std::streamsize ss = std::cout.precision();
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
std::cout.precision(15);
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
std::cout << GridLogMessage <<" mass "<< mass << '\n';
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
std::cout.precision(15);
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
std::cout << GridLogMessage <<" Np "<< Np << '\n';
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
std::cout << GridLogMessage <<" resid "<< resid << '\n';
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
}
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
CmdJobParams JP;
JP.Parse(argv,argc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGridF);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
GridParallelRNG RNG5rb(FrbGridF); RNG5rb.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
std::vector<LatticeColourMatrix> U(4,UGrid);
LatticeGaugeFieldF UmuF(UGridF);
std::vector<LatticeColourMatrix> UF(4,UGridF);
if ( JP.gaugefile.compare("Hot") == 0 ) {
SU3::HotConfiguration(RNG4, Umu);
} else {
FieldMetaData header;
NerscIO::readConfiguration(Umu,header,JP.gaugefile);
// ypj [fixme] additional checks for the loaded configuration?
}
precisionChange (UmuF,Umu);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
RealD mass = JP.mass;
RealD M5 = JP.M5;
// ypj [fixme] flexible support for various Fermion types
// RealD mob_b = JP.mob_b; // Gparity
// std::vector<ComplexD> omega; // ZMobius
// GparityMobiusFermionD ::ImplParams params;
// std::vector<int> twists({1,1,1,0});
// params.twists = twists;
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
// int mrhs = JP.Nu;
int Ndir=4;
auto mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split (Ndir,1);
#if 0
int tmp=mrhs, dir=0;
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
while ( tmp> 1) {
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
mpi_split[dir] *=2;
tmp = tmp/2;
}
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
dir = (dir+1)%Ndir;
}
#endif
int mrhs=1;
for(int i =0;i<Ndir;i++){
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
mrhs *= JP.mpi_split[i];
}
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
// assert(JP.Nu==tmp);
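// Worked example, matching the job script: with --mpi 1.1.1.4 and
// --split 1.1.1.4 each child communicator gets layout
// mpi_layout[i]/JP.mpi_split[i] = {1,1,1,1}, and mrhs = 1*1*1*4 = 4
// right-hand sides are handled concurrently, one per split grid.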
/////////////////////////////////////////////
// Split into 1^4 mpi communicators, keeping it explicitly single
/////////////////////////////////////////////
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplexF::Nsimd()),
mpi_split,
*UGrid);
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
LatticeGaugeFieldF s_Umu(SGrid);
Grid_split (UmuF,s_Umu);
//WilsonFermionR::ImplParams params;
ZMobiusFermionF::ImplParams params;
params.overlapCommsCompute = true;
params.boundary_phases = JP.boundary_phase;
ZMobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> HermOp(Ddwf);
SchurDiagOneOperator<ZMobiusFermionF,FermionField> HermOp(DdwfF);
ZMobiusFermionF Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
SchurDiagOneOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
//std::vector<double> Coeffs { 0.,-1.};
// ypj [note] this may not be supported by some compilers
std::vector<double> Coeffs({ 0.,-1.});
Polynomial<FermionField> PolyX(Coeffs);
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
// Cheb.csv(std::cout);
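// Chebyshev acceleration: eigenvalues of HermOp inside [cheby_l,cheby_u] are
// mapped into [-1,1], where |T_n| <= 1, while modes below cheby_l are strongly
// amplified, steering the block Lanczos towards the low end of the spectrum.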
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
FrbGridF,SFrbGrid,mrhs,
Cheb,
JP.Nstop, JP.Ntest,
JP.Nu, JP.Nk, JP.Nm,
JP.resid,
JP.MaxIter,
IRBLdiagonaliseWithEigen);
// IRBLdiagonaliseWithLAPACK);
IRBL.split_test=1;
std::vector<RealD> eval(JP.Nm);
std::vector<FermionField> src(JP.Nu,FrbGridF);
if (0)
{
// in case RNG is too slow
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
FermionField src_tmp(FGrid);
for ( int i=0; i<JP.Nu; ++i ){
// gaussian(RNG5,src_tmp);
ComplexD rnd;
RealD re;
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
src_tmp=re;
pickCheckerboard(Odd,src[i],src_tmp);
}
RNG5.Report();
} else {
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
for ( int i=0; i<JP.Nu; ++i )
gaussian(RNG5rb,src[i]);
RNG5rb.Report();
}
std::vector<FermionField> evec(JP.Nm,FrbGridF);
for(int i=0;i<1;++i){
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
};
int Nconv;
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
Grid_finalize();
}


@@ -0,0 +1,401 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_block_lanczos.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/util/Init.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
using namespace std;
using namespace Grid;
//using namespace Grid::QCD;
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
typedef typename ZMobiusFermionR::FermionField FermionField;
RealD AllZero(RealD x){ return 0.;}
class CmdJobParams
{
public:
std::string gaugefile;
int Ls;
double mass;
double M5;
double mob_b;
std::vector<ComplexD> omega;
std::vector<Complex> boundary_phase;
std::vector<int> mpi_split;
LanczosType Impl;
int Nu;
int Nk;
int Np;
int Nm;
int Nstop;
int Ntest;
int MaxIter;
double resid;
double low;
double high;
int order;
CmdJobParams()
: gaugefile("Hot"),
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
Impl(LanczosType::irbl),mpi_split(4,1),
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
low(0.2), high(5.5), order(11)
{Nm=Nk+Np;};
void Parse(char **argv, int argc);
};
void CmdJobParams::Parse(char **argv,int argc)
{
std::string arg;
std::vector<int> vi;
double re,im;
int expect, idx;
std::string vstr;
std::ifstream pfile;
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
}
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
pfile.open(arg);
assert(pfile);
expect = 0;
while( pfile >> vstr ) {
if ( vstr.compare("boundary_phase") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(expect==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
boundary_phase.push_back({re,im});
expect++;
}
}
pfile.close();
} else {
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
}
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
pfile.open(arg);
assert(pfile);
Ls = 0;
while( pfile >> vstr ) {
if ( vstr.compare("omega") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(Ls==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
omega.push_back({re,im});
Ls++;
}
}
pfile.close();
} else {
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
GridCmdOptionInt(arg,Ls);
}
}
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
GridCmdOptionFloat(arg,mass);
}
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
GridCmdOptionFloat(arg,M5);
}
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
GridCmdOptionFloat(arg,mob_b);
}
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2];
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::irbl;
Nm = Nk+Np;
}
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2]; // vector space is enlarged by adding Np vectors
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::rbl;
Nm = Nk+Np*MaxIter;
}
#if 1
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
GridCmdOptionIntVector(arg,vi);
for(int i=0;i<mpi_split.size();i++)
mpi_split[i] = vi[i];
}
#endif
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
GridCmdOptionInt(arg,Ntest);
}
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
GridCmdOptionFloat(arg,resid);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
GridCmdOptionFloat(arg,low);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
GridCmdOptionFloat(arg,high);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
GridCmdOptionInt(arg,order);
}
if ( CartesianCommunicator::RankWorld() == 0 ) {
std::streamsize ss = std::cout.precision();
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
std::cout.precision(15);
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
std::cout << GridLogMessage <<" mass "<< mass << '\n';
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
std::cout.precision(15);
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
std::cout << GridLogMessage <<" Np "<< Np << '\n';
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
std::cout << GridLogMessage <<" resid "<< resid << '\n';
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
}
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
CmdJobParams JP;
JP.Parse(argv,argc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
GridParallelRNG RNG5rb(FrbGrid); RNG5rb.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
std::vector<LatticeColourMatrix> U(4,UGrid);
if ( JP.gaugefile.compare("Hot") == 0 ) {
SU3::HotConfiguration(RNG4, Umu);
} else {
FieldMetaData header;
NerscIO::readConfiguration(Umu,header,JP.gaugefile);
// ypj [fixme] additional checks for the loaded configuration?
}
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
RealD mass = JP.mass;
RealD M5 = JP.M5;
// ypj [fixme] flexible support for various Fermion types
// RealD mob_b = JP.mob_b; // Gparity
// std::vector<ComplexD> omega; // ZMobius
// GparityMobiusFermionD ::ImplParams params;
// std::vector<int> twists({1,1,1,0});
// params.twists = twists;
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
// int mrhs = JP.Nu;
int Ndir=4;
auto mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split (Ndir,1);
#if 0
int tmp=mrhs, dir=0;
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
while ( tmp> 1) {
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
mpi_split[dir] *=2;
tmp = tmp/2;
}
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
dir = (dir+1)%Ndir;
}
#endif
int mrhs=1;
for(int i =0;i<Ndir;i++){
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
mrhs *= JP.mpi_split[i];
}
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
// assert(JP.Nu==tmp);
/////////////////////////////////////////////
// Split into 1^4 mpi communicators
/////////////////////////////////////////////
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
mpi_split,
*UGrid);
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
LatticeGaugeField s_Umu(SGrid);
Grid_split (Umu,s_Umu);
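// Grid_split scatters the globally distributed gauge field into independent
// copies, one per child communicator, so each split grid can apply its
// operator to a separate right-hand side without inter-grid communication.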
//WilsonFermionR::ImplParams params;
ZMobiusFermionR::ImplParams params;
params.overlapCommsCompute = true;
params.boundary_phases = JP.boundary_phase;
ZMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionR,FermionField> HermOp(Ddwf);
SchurDiagOneOperator<ZMobiusFermionR,FermionField> HermOp(Ddwf);
ZMobiusFermionR Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionR,FermionField> SHermOp(Dsplit);
SchurDiagOneOperator<ZMobiusFermionR,FermionField> SHermOp(Dsplit);
//std::vector<double> Coeffs { 0.,-1.};
// ypj [note] this may not be supported by some compilers
std::vector<double> Coeffs({ 0.,-1.});
Polynomial<FermionField> PolyX(Coeffs);
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
// Cheb.csv(std::cout);
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
FrbGrid,SFrbGrid,mrhs,
Cheb,
JP.Nstop, JP.Ntest,
JP.Nu, JP.Nk, JP.Nm,
JP.resid,
JP.MaxIter,
IRBLdiagonaliseWithEigen);
// IRBLdiagonaliseWithLAPACK);
IRBL.split_test=0;
std::vector<RealD> eval(JP.Nm);
std::vector<FermionField> src(JP.Nu,FrbGrid);
if (0)
{
// in case RNG is too slow
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
FermionField src_tmp(FGrid);
for ( int i=0; i<JP.Nu; ++i ){
// gaussian(RNG5,src_tmp);
ComplexD rnd;
RealD re;
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
src_tmp=re;
pickCheckerboard(Odd,src[i],src_tmp);
}
RNG5.Report();
} else {
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
for ( int i=0; i<JP.Nu; ++i )
gaussian(RNG5rb,src[i]);
RNG5rb.Report();
}
std::vector<FermionField> evec(JP.Nm,FrbGrid);
for(int i=0;i<1;++i){
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
};
int Nconv;
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
Grid_finalize();
}


@@ -0,0 +1,408 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_block_lanczos.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/util/Init.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
using namespace std;
using namespace Grid;
//using namespace Grid::QCD;
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
typedef typename ZMobiusFermionF::FermionField FermionField;
RealD AllZero(RealD x){ return 0.;}
class CmdJobParams
{
public:
std::string gaugefile;
int Ls;
double mass;
double M5;
double mob_b;
std::vector<ComplexD> omega;
std::vector<Complex> boundary_phase;
std::vector<int> mpi_split;
LanczosType Impl;
int Nu;
int Nk;
int Np;
int Nm;
int Nstop;
int Ntest;
int MaxIter;
double resid;
double low;
double high;
int order;
CmdJobParams()
: gaugefile("Hot"),
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
Impl(LanczosType::irbl),mpi_split(4,1),
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
low(0.2), high(5.5), order(11)
{Nm=Nk+Np;};
void Parse(char **argv, int argc);
};
void CmdJobParams::Parse(char **argv,int argc)
{
std::string arg;
std::vector<int> vi;
double re,im;
int expect, idx;
std::string vstr;
std::ifstream pfile;
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
}
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
pfile.open(arg);
assert(pfile);
expect = 0;
while( pfile >> vstr ) {
if ( vstr.compare("boundary_phase") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(expect==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
boundary_phase.push_back({re,im});
expect++;
}
}
pfile.close();
} else {
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
}
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
pfile.open(arg);
assert(pfile);
Ls = 0;
while( pfile >> vstr ) {
if ( vstr.compare("omega") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(Ls==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
omega.push_back({re,im});
Ls++;
}
}
pfile.close();
} else {
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
GridCmdOptionInt(arg,Ls);
}
}
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
GridCmdOptionFloat(arg,mass);
}
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
GridCmdOptionFloat(arg,M5);
}
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
GridCmdOptionFloat(arg,mob_b);
}
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2];
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::irbl;
Nm = Nk+Np;
}
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2]; // vector space is enlarged by adding Np vectors
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::rbl;
Nm = Nk+Np*MaxIter;
}
#if 1
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
GridCmdOptionIntVector(arg,vi);
for(int i=0;i<mpi_split.size();i++)
mpi_split[i] = vi[i];
}
#endif
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
GridCmdOptionInt(arg,Ntest);
}
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
GridCmdOptionFloat(arg,resid);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
GridCmdOptionFloat(arg,low);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
GridCmdOptionFloat(arg,high);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
GridCmdOptionInt(arg,order);
}
if ( CartesianCommunicator::RankWorld() == 0 ) {
std::streamsize ss = std::cout.precision();
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
std::cout.precision(15);
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
std::cout << GridLogMessage <<" mass "<< mass << '\n';
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
std::cout.precision(15);
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
std::cout << GridLogMessage <<" Np "<< Np << '\n';
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
std::cout << GridLogMessage <<" resid "<< resid << '\n';
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
}
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
CmdJobParams JP;
JP.Parse(argv,argc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGridF);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
GridParallelRNG RNG5rb(FrbGridF); RNG5rb.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
std::vector<LatticeColourMatrix> U(4,UGrid);
LatticeGaugeFieldF UmuF(UGridF);
std::vector<LatticeColourMatrix> UF(4,UGridF);
if ( JP.gaugefile.compare("Hot") == 0 ) {
SU3::HotConfiguration(RNG4, Umu);
} else {
FieldMetaData header;
NerscIO::readConfiguration(Umu,header,JP.gaugefile);
// ypj [fixme] additional checks for the loaded configuration?
}
precisionChange (UmuF,Umu);
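// Mixed precision: the configuration is read (or hot-started) in double and
// demoted to single here, so the Lanczos itself runs entirely in the float
// operator while gauge I/O keeps the double-precision NERSC format.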
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
RealD mass = JP.mass;
RealD M5 = JP.M5;
// ypj [fixme] flexible support for various Fermion types
// RealD mob_b = JP.mob_b; // Gparity
// std::vector<ComplexD> omega; // ZMobius
// GparityMobiusFermionD ::ImplParams params;
// std::vector<int> twists({1,1,1,0});
// params.twists = twists;
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
// int mrhs = JP.Nu;
int Ndir=4;
auto mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split (Ndir,1);
#if 0
int tmp=mrhs, dir=0;
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
while ( tmp> 1) {
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
mpi_split[dir] *=2;
tmp = tmp/2;
}
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
dir = (dir+1)%Ndir;
}
#endif
int mrhs=1;
for(int i =0;i<Ndir;i++){
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
mrhs *= JP.mpi_split[i];
}
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
// assert(JP.Nu==tmp);
/////////////////////////////////////////////
// Split into 1^4 mpi communicators, keeping it explicitly single
/////////////////////////////////////////////
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplexF::Nsimd()),
mpi_split,
*UGrid);
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
LatticeGaugeFieldF s_Umu(SGrid);
Grid_split (UmuF,s_Umu);
//WilsonFermionR::ImplParams params;
ZMobiusFermionF::ImplParams params;
params.overlapCommsCompute = true;
params.boundary_phases = JP.boundary_phase;
ZMobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> HermOp(Ddwf);
SchurDiagOneOperator<ZMobiusFermionF,FermionField> HermOp(DdwfF);
ZMobiusFermionF Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
SchurDiagOneOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
//std::vector<double> Coeffs { 0.,-1.};
// ypj [note] this may not be supported by some compilers
std::vector<double> Coeffs({ 0.,-1.});
Polynomial<FermionField> PolyX(Coeffs);
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
// Cheb.csv(std::cout);
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
FrbGridF,SFrbGrid,mrhs,
Cheb,
JP.Nstop, JP.Ntest,
JP.Nu, JP.Nk, JP.Nm,
JP.resid,
JP.MaxIter,
IRBLdiagonaliseWithEigen);
// IRBLdiagonaliseWithLAPACK);
IRBL.split_test=1;
std::vector<RealD> eval(JP.Nm);
std::vector<FermionField> src(JP.Nu,FrbGridF);
if (0)
{
// in case RNG is too slow
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
FermionField src_tmp(FGrid);
for ( int i=0; i<JP.Nu; ++i ){
// gaussian(RNG5,src_tmp);
ComplexD rnd;
RealD re;
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
src_tmp=re;
pickCheckerboard(Odd,src[i],src_tmp);
}
RNG5.Report();
} else {
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
for ( int i=0; i<JP.Nu; ++i )
gaussian(RNG5rb,src[i]);
RNG5rb.Report();
}
std::vector<FermionField> evec(JP.Nm,FrbGridF);
for(int i=0;i<1;++i){
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
};
int Nconv;
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
Grid_finalize();
}


@ -35,26 +35,45 @@ template<typename Action>
struct Setup{}; struct Setup{};
template<> template<>
struct Setup<GparityMobiusFermionD>{ struct Setup<GparityMobiusFermionF>{
static GparityMobiusFermionD* getAction(LatticeGaugeField &Umu, static GparityMobiusFermionF* getAction(LatticeGaugeFieldF &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
RealD mass=0.01; RealD mass=0.00054;
RealD M5=1.8; RealD M5=1.8;
RealD mob_b=1.5; RealD mob_b=1.5;
GparityMobiusFermionD ::ImplParams params; GparityMobiusFermionD ::ImplParams params;
std::vector<int> twists({1,1,1,0}); std::vector<int> twists({1,1,1,0});
params.twists = twists; params.twists = twists;
return new GparityMobiusFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params); return new GparityMobiusFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
} }
}; };
template<> template<>
struct Setup<DomainWallFermionF>{
static DomainWallFermionF* getAction(LatticeGaugeFieldF &Umu,
struct Setup<DomainWallFermionD>{ struct Setup<DomainWallFermionD>{
static DomainWallFermionD* getAction(LatticeGaugeField &Umu, static DomainWallFermionD* getAction(LatticeGaugeField &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){ GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
RealD mass=0.01; RealD mass=0.00054;
RealD M5=1.8; RealD M5=1.8;
return new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); return new DomainWallFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
}
};
template<>
struct Setup<MobiusFermionF>{
static MobiusFermionF* getAction(LatticeGaugeFieldF &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
RealD mass=0.00054;
RealD M5=1.8;
RealD mob_b=1.5;
std::vector<Complex> boundary = {1,1,1,-1};
MobiusFermionF::ImplParams Params(boundary);
std::cout << GridLogMessage << "mass "<<mass<<std::endl;
std::cout << GridLogMessage << "M5 "<<M5<<std::endl;
std::cout << GridLogMessage << "mob_b "<<mob_b<<std::endl;
return new MobiusFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,Params);
} }
}; };
@ -63,38 +82,60 @@ struct Setup<DomainWallFermionD>{
template<typename Action> template<typename Action>
void run(){ void run(){
typedef typename Action::FermionField FermionField; typedef typename Action::FermionField FermionField;
const int Ls=8; const int Ls=12;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid); // printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian* FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls, UGridF);
GridRedBlackCartesian* FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGridF);
std::vector<int> seeds4({1,2,3,4}); std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8}); std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4); GridParallelRNG RNG4(UGridF); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5); GridParallelRNG RNG5rb(FrbGridF); RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid); LatticeGaugeField Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4, Umu); // SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
Action *action = Setup<Action>::getAction(Umu,FGrid,FrbGrid,UGrid,UrbGrid); // int precision32 = 0;
// int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
LatticeGaugeFieldF UmuF(UGridF);
precisionChange(UmuF, Umu);
Action *action = Setup<Action>::getAction(UmuF,FGridF,FrbGridF,UGridF,UrbGridF);
//MdagMLinearOperator<Action,FermionField> HermOp(Ddwf); //MdagMLinearOperator<Action,FermionField> HermOp(Ddwf);
SchurDiagTwoOperator<Action,FermionField> HermOp(*action); // SchurDiagTwoOperator<Action,FermionField> HermOp(*action);
SchurDiagOneOperator<Action,FermionField> HermOp(*action);
const int Nstop = 30; const int Nstop = 150;
const int Nk = 40; const int Nk = 160;
const int Np = 40; const int Np = 40;
const int Nm = Nk+Np; const int Nm = Nk+Np;
const int MaxIt= 10000; const int MaxIt= 10000;
RealD resid = 1.0e-8; RealD resid = 1.0e-6;
std::cout << GridLogMessage << "Nstop "<<Nstop<<std::endl;
std::cout << GridLogMessage << "Nk "<<Nk<<std::endl;
std::cout << GridLogMessage << "Np "<<Np<<std::endl;
std::cout << GridLogMessage << "resid "<<resid<<std::endl;
std::vector<double> Coeffs { 0.,-1.}; std::vector<double> Coeffs { 0.,-1.};
Polynomial<FermionField> PolyX(Coeffs); Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(0.2,5.,11); Chebyshev<FermionField> Cheby(0.0000006,5.5,4001);
std::cout << GridLogMessage << "Cheby(0.0000006,5.5,4001) "<<std::endl;
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp); FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp); PlainHermOp<FermionField> Op (HermOp);
@ -102,9 +143,9 @@ void run(){
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt); ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt);
std::vector<RealD> eval(Nm); std::vector<RealD> eval(Nm);
FermionField src(FrbGrid); FermionField src(FrbGridF);
gaussian(RNG5rb,src); gaussian(RNG5rb,src);
std::vector<FermionField> evec(Nm,FrbGrid); std::vector<FermionField> evec(Nm,FrbGridF);
for(int i=0;i<1;i++){ for(int i=0;i<1;i++){
std::cout << GridLogMessage <<i<<" / "<< Nm<< " grid pointer "<<evec[i].Grid()<<std::endl; std::cout << GridLogMessage <<i<<" / "<< Nm<< " grid pointer "<<evec[i].Grid()<<std::endl;
}; };
@ -119,7 +160,7 @@ int main (int argc, char ** argv)
{ {
Grid_init(&argc,&argv); Grid_init(&argc,&argv);
std::string action = "GparityMobius"; std::string action = "Mobius";
for(int i=1;i<argc;i++){ for(int i=1;i<argc;i++){
if(std::string(argv[i]) == "-action"){ if(std::string(argv[i]) == "-action"){
action = argv[i+1]; action = argv[i+1];
@ -127,9 +168,11 @@ int main (int argc, char ** argv)
} }
if(action == "GparityMobius"){ if(action == "GparityMobius"){
run<GparityMobiusFermionD>(); run<GparityMobiusFermionF>();
}else if(action == "DWF"){ }else if(action == "DWF"){
run<DomainWallFermionD>(); run<DomainWallFermionF>();
}else if(action == "Mobius"){
run<MobiusFermionF>();
}else{ }else{
std::cout << "Unknown action" << std::endl; std::cout << "Unknown action" << std::endl;
exit(1); exit(1);