mirror of https://github.com/paboyle/Grid.git synced 2025-06-22 09:42:02 +01:00

Compare commits


68 Commits

Author SHA1 Message Date
d57ed25071 Merge branch 'feature/dirichlet' into feature/block_lanczos22 2023-03-24 12:08:09 -04:00
8a1b9073f9 Mshift update 2023-03-23 15:39:30 -04:00
1a7114d4b9 Temporary algorithm while sorting out mixed prec 2023-03-23 15:38:35 -04:00
3f385f717c Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet
Conflicts:
	systems/PVC/benchmarks/run-2tile-mpi.sh
	systems/PVC/config-command
2023-03-23 14:52:53 -04:00
c180a52518 Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-23 10:28:01 -04:00
90130e25e9 TODO list 2023-03-23 10:27:02 -04:00
23298acb81 Merge pull request #424 from giltirn/feature/dirichlet-precchange
Precision change implementation
2023-03-22 23:04:52 -04:00
52384e34cf Discard on construct 2023-03-22 19:40:32 -04:00
d0bb033ea2 Device-resident GPU block buffer instead of UVM, as we likely hit a UVM
bug. Code worked on CUDA 11.4 but fails on later drivers (certainly 530.30.02; the
Perlmutter driver version still needs checking).
2023-03-22 19:07:32 -04:00
c6621806ca Compiling on laptop and running 2023-03-21 17:27:09 -04:00
0b6f0f6d2f Merge branch 'feature/dirichlet' of https://www.github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:06:55 -04:00
b5b759df73 Merge branch 'develop' into feature/dirichlet 2023-03-21 16:05:46 -04:00
7db8dd7a95 Merge branch 'feature/dirichlet' of https://github.com/paboyle/Grid into feature/dirichlet 2023-03-21 16:04:27 -04:00
8b43be39c0 Config command 2023-03-21 16:00:52 -04:00
f17f879206 Test update 2023-03-21 15:59:29 -04:00
68428fceab Integrator update 2023-03-21 15:58:49 -04:00
4135f2dcd1 Compressor 2023-03-21 15:41:41 -04:00
c5bdf61215 Audit fix 2023-03-21 15:38:39 -04:00
88e218e8ee Stencil updates 2023-03-21 15:37:58 -04:00
0f2b786436 Vector -> vector 2023-03-21 15:36:11 -04:00
e1c326558a Comms improvements 2023-03-21 08:53:56 -07:00
39c0815d9e WriteDiscard 2023-03-21 08:57:29 -04:00
83d86943db Fixed compile bug in MemoryManagerShared caused by Audit function not being passed a string 2023-02-23 13:09:45 -05:00
e82cf1d311 Further prec-change improvements
Mixed prec CG algorithm has been modified to precompute precision change workspaces

As the original Test_dwf_mixedcg_prec has been co-opted into a performance, stability, and reproducibility test that runs the single-prec CG 200 times, I have created a new version of Test_dwf_mixedcg_prec in the solver subdirectory that just does the mixed-vs-double CG comparison
2023-02-23 09:45:29 -05:00
1db58a8acc Precision change improvements
Added a new, much faster implementation of precision change that (optionally) uses a precomputed, device-resident workspace of pointer offsets, so that all lattice copying occurs on the device and the only host<->device transfer is the pointer table itself. It also avoids the need to unpack and repack the fields using explicit lane copying. When this new precisionChange is called without a workspace, one is computed on the fly; even then it is considerably faster than the original implementation.

In the special case of using double2 and when the Grids are the same, calls to the new precisionChange will automatically use precisionChangeFast, such that there is a single API call for all precision changes.

Reliable update and mixed-prec multishift have been modified to precompute precision change workspaces

Renamed the original precisionChange as precisionChangeOrig

Fixed incorrect pointer offset bug in copyLane

Added a test and a benchmark for precisionChange

Added a test for reliable update CG
2023-02-21 10:52:42 -05:00
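
A minimal usage sketch of the workspace API described above (the field typedefs here are illustrative; the constructor takes the output Grid first, matching precisionChangeWorkspace(out_grid, in_grid) in the diffs below):

	LatticeFermionD field_d(DoublePrecGrid);
	LatticeFermionF field_f(SinglePrecGrid);
	// Build once per (out, in) Grid pair; the site/lane map stays device resident.
	precisionChangeWorkspace wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
	precisionChangeWorkspace wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
	precisionChange(field_f, field_d, wk_dp_to_sp); // double -> single, copy on device
	precisionChange(field_d, field_f, wk_sp_to_dp); // single -> double, copy on device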
796abfad80 Merge pull request #422 from fjosw/fix/NVCC_DIAG_PRAGMA_SUPPORT
Disable diagnostic pragma warnings for CUDA 12+
2023-01-17 09:34:49 -05:00
ad0270ac8c fix: diagnostic pragma warnings fixed for CUDA 12+ 2023-01-12 12:36:30 +00:00
4ca1bf7cca Added gauge invariance test 2022-12-21 07:23:16 -05:00
2ff868f7a5 CPU open doesn't need to free space 2022-12-20 05:10:23 -05:00
ede02b6883 Memory manager debug Felix case 2022-12-20 05:10:23 -05:00
1822ced302 Bug fix 2022-12-20 05:10:23 -05:00
37ba32776f More logging 2022-12-20 05:10:23 -05:00
99b3697b03 More logging 2022-12-20 05:10:23 -05:00
43a45ec97b SSC_START 2022-12-20 05:10:23 -05:00
b00a4142e5 A=A fix 2022-12-20 05:10:23 -05:00
3791bc527b Logging pulled in from dirichlet branch 2022-12-20 05:10:23 -05:00
d8c29f5fcf Updated FFT test for PETSc 2022-12-18 12:05:00 -05:00
281f8101fe Matt FFT test 2022-12-17 20:35:33 -05:00
dc747c54be Merge branch 'develop' into feature/dirichlet
Conflicts:
	Grid/qcd/action/fermion/WilsonCompressor.h
	Grid/stencil/Stencil.h
2022-12-13 08:24:58 -05:00
07acfe89f2 Merge pull request #417 from rrhodgson/feature/fermtoprop
Feature/fermtoprop
2022-12-06 12:45:03 -05:00
40234f531f FermToProp accelerator_for -> thread_for 2022-12-06 17:34:51 +00:00
d49694f38f PropToFerm fix 2022-12-06 15:48:54 +00:00
dc6a38f177 Minor cleanup 2022-11-30 17:13:12 -05:00
82c1ecf60f Block lanczos added 2022-11-30 16:08:40 -05:00
97a098636d FermToProp 2022-11-30 15:36:35 -05:00
e13930c8b2 Faster fermtoprop case 2022-11-30 15:11:29 -05:00
0655dab466 OpenMP on host enabled 2022-11-08 13:38:54 -08:00
7f097bcc28 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-11-08 13:23:40 -08:00
5c75aa5008 Device mem 2022-11-08 13:22:57 -08:00
1873101362 PVC 2022-11-08 13:22:45 -08:00
63fd1dfa62 Config on PVC 2022-11-08 13:22:09 -08:00
bd68861b28 SYCL sum 2022-11-08 12:49:26 -08:00
82e959f66c SYCL reduction 2022-11-08 12:45:25 -08:00
62e52de06d Merge pull request #414 from fjosw/feat/eCloverGPU
Compact Exponential Cloverterm on GPU
2022-11-01 09:15:44 -04:00
184adeedb8 feat: renamed open_boundaries to fixedBoundaries 2022-10-26 12:53:46 +01:00
5fa6a8b96d docs: CompactClover debug info generalized. 2022-10-26 12:41:14 +01:00
a2a879b668 docs: CompactClover Debug Info improved. 2022-10-25 17:20:42 +01:00
9317d893b2 docs: details about inversion of CompactClover term added. 2022-10-25 17:10:06 +01:00
86075fdd45 feat: MassTerm and ExponentiateClover merged into InstantiateClover 2022-10-25 17:05:34 +01:00
b36442e263 feat: CloverHelpers::InvertClover implemented which handles the
inversion of the Clover term depending on clover type and the boundary
conditions.
2022-10-25 16:57:01 +01:00
513d797ea6 fix: signature of CompactWilsonCloverHelpers::Exponentiate fixed. 2022-10-25 16:17:22 +01:00
9e4835a3e3 feat: changed CompactWilsonExpClover exponentiation to Taylor expansion
with Horner scheme.
2022-10-25 15:19:43 +01:00
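
As a formula (a sketch of the scheme named above): the clover exponential is approximated by a truncated Taylor series evaluated with Horner's rule,

	exp(A) ≈ c_0 + A(c_1 + A(c_2 + ... + A c_N)),   c_n = 1/n!,

which matches the cn[i] = cn[i-1]/i coefficient recursion and the ExpClover = ExpClover*Clover + cn[i] loop in the CompactWilsonExpClover diff further below.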
477ebf24f4 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2022-10-04 11:19:43 -07:00
0d5639f707 Run script update 2022-10-04 11:13:41 -07:00
413312f9a9 Benchmark the halo construction.
The byte counts are out and should be doubled for SIMD directions
2022-10-04 11:12:59 -07:00
03508448f8 Remove verbose 2022-10-04 11:12:15 -07:00
e1e5c75023 Stencil gather improvements - SVM was running slow and was used for a pointer array that did not need to be in SVM 2022-10-04 11:11:10 -07:00
9296299b61 Better commenting 2022-10-04 11:10:34 -07:00
59 changed files with 5183 additions and 461 deletions

View File

@ -45,7 +45,7 @@ directory
//disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register"
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
//disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type

View File

@ -14,7 +14,7 @@
/* NVCC save and restore compile environment*/
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress code_is_unreachable
#else
#pragma diag_suppress code_is_unreachable

View File

@ -108,7 +108,10 @@ NAMESPACE_BEGIN(Grid);
GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
@ -123,7 +126,7 @@ NAMESPACE_BEGIN(Grid);
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
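// i.e. double inner_tol until norm*inner_tol^2 >= stop, approximating sqrt(stop/norm) from above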
PrecChangeTimer.Start();
precisionChange(src_f, src_d);
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
PrecChangeTimer.Stop();
sol_f = Zero();
@ -142,7 +145,7 @@ NAMESPACE_BEGIN(Grid);
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f);
precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);

View File

@ -0,0 +1,373 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety, a final regular CG is applied to clean up if necessary
//PB Pure single, then double fixup
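// Sketch of the relations assumed below (standard multi-shift CG): for each shift
// m_s = shifts.poles[s] the solver produces psi_s satisfying
//   (MdagM + m_s) psi_s = src,
// reusing the Krylov space generated for the primary (lightest) shift; the z[s][iz]
// recurrence rescales the primary residual onto each shifted system.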
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
psi_f[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChangeFast(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChangeFast(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChangeFast(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChangeFast(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_f[s],a,ps_f[s],r_f);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
PrecChangeTimer.Start();
precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update single precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
}
}
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChangeFast(psi_d[s],psi_f[s]);
}
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);

View File

@ -81,6 +81,7 @@ public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@ -95,9 +96,9 @@ public:
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq
) :
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
@ -130,6 +131,9 @@ public:
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
@ -200,10 +204,10 @@ public:
r_d = p_d;
//MdagM+m[0]
precisionChangeFast(p_f,p_d);
precisionChange(p_f, p_d, pc_wk_d_to_s);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChangeFast(tmp_d,mmp_f);
precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
@ -244,7 +248,7 @@ public:
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
@ -263,7 +267,7 @@ public:
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(p_f, p_d); //get back single prec search direction for linop
precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
@ -272,7 +276,7 @@ public:
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChangeFast(mmp_d, mmp_f); // From Float to Double
precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
PrecChangeTimer.Stop();
AXPYTimer.Start();
@ -350,12 +354,17 @@ public:
}
}
if ( all_converged ){
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
@ -396,12 +405,10 @@ public:
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
assert(0);
}
};

View File

@ -48,7 +48,7 @@ public:
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
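// i.e. the reliable-update branch below fires when cp < Delta * MaxResidSinceLastRelUp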
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
@ -65,7 +65,9 @@ public:
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
{
assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
@ -116,9 +118,12 @@ public:
}
//Single prec initialization
precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
FieldF r_f(SinglePrecGrid);
r_f.Checkerboard() = r.Checkerboard();
precisionChange(r_f, r);
precisionChange(r_f, r, pc_wk_dp_to_sp);
FieldF psi_f(r_f);
psi_f = Zero();
@ -134,7 +139,8 @@ public:
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
GridStopWatch PrecChangeTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
@ -173,7 +179,9 @@ public:
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
psi = psi + mmp;
@ -194,7 +202,10 @@ public:
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
@ -214,14 +225,21 @@ public:
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
PrecChangeTimer.Start();
precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
PrecChangeTimer.Stop();
psi = psi + mmp;
MatrixTimer.Start();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
MatrixTimer.Stop();
r = src - mmp;
psi_f = Zero();
precisionChange(r_f, r);
PrecChangeTimer.Start();
precisionChange(r_f, r, pc_wk_dp_to_sp);
PrecChangeTimer.Stop();
cp = norm2(r);
MaxResidSinceLastRelUp = cp;

File diff suppressed because it is too large.

View File

@ -97,8 +97,8 @@ private:
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
static void PrintBytes(void);
public:
static void PrintBytes(void);
static void Audit(std::string s);
static void Init(void);
static void InitMessage(void);
@ -119,6 +119,8 @@ private:
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
static uint64_t DeviceEvictions;
static uint64_t DeviceDestroy;
private:
#ifndef GRID_UVM
@ -176,6 +178,7 @@ private:
public:
static void Print(void);
static void PrintAll(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);

View File

@ -28,6 +28,8 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
////////////////////////////////////
// Priority ordering for unlocked entries
@ -115,8 +117,10 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceDestroy++;
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
@ -126,8 +130,14 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
// Make CPU consistent, remove from Accelerator, remove from LRU, LEAVE a CPU-only entry
// Cannot be acc-locked. If allocated, must be in the LRU pool.
//
// Nov 2022... Felix issue: with two CpuPtrs allocated, an entry can sit in the LRU queue
// while CPU-locked, and eviction of its AccPtr copy may be required. Eviction was a mistake
// in CpuViewOpen, but there is a weakness where CpuLock entries are attempted for erase.
// Take these OUT of the LRU queue when CPU-locked?
// Cannot take them out of the table, as the cpuLock data is important.
///////////////////////////////////////////////////////////////////////////
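// (Summary of the distinction assumed here: Evict() flushes an AccDirty copy, frees the
// device allocation and downgrades the entry to CpuDirty while keeping it in the table;
// AccDiscard() frees the device copy and erases the table entry entirely.)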
assert(AccCache.state!=Empty);
@ -139,15 +149,17 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
// EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
@ -221,13 +233,16 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back();
uint64_t victim = LRU.back(); // From the LRU
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
} else {
return;
}
}
}
@ -322,7 +337,8 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(0);
}
// If view is opened on device remove from LRU
assert(AccCache.accLock>0);
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n");
@ -388,9 +404,10 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
// CPU doesn't need to free space
// if (!AccCache.AccPtr) {
// EvictVictims(bytes);
// }
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
@ -444,20 +461,28 @@ void MemoryManager::NotifyDeletion(void *_ptr)
void MemoryManager::Print(void)
{
PrintBytes();
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Memory Manager " << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogMessage << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogMessage << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogMessage << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogMessage << HostToDeviceBytes<< " bytes transfered to device " << std::endl;
std::cout << GridLogMessage << DeviceToHostBytes<< " bytes transfered from device " << std::endl;
std::cout << GridLogMessage << DeviceEvictions << " Evictions from device " << std::endl;
std::cout << GridLogMessage << DeviceDestroy << " Destroyed vectors on device " << std::endl;
std::cout << GridLogMessage << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
}
void MemoryManager::PrintAll(void)
{
Print();
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
@ -467,13 +492,13 @@ void MemoryManager::Print(void)
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogMessage << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
@ -489,6 +514,25 @@ int MemoryManager::isOpen (void* _CpuPtr)
}
void MemoryManager::Audit(std::string s)
{
uint64_t CpuBytes=0;
uint64_t AccBytes=0;
uint64_t LruBytes1=0;
uint64_t LruBytes2=0;
uint64_t LruCnt=0;
uint64_t LockedBytes=0;
std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
for(auto it=LRU.begin();it!=LRU.end();it++){
uint64_t cpuPtr = *it;
assert(EntryPresent(cpuPtr));
auto AccCacheIterator = EntryLookup(cpuPtr);
auto & AccCache = AccCacheIterator->second;
LruBytes2+=AccCache.bytes;
assert(AccCache.LRU_valid==1);
assert(AccCache.LRU_entry==it);
}
std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
@ -498,7 +542,14 @@ void MemoryManager::Audit(std::string s)
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
if ( AccCache.cpuLock || AccCache.accLock ) {
CpuBytes+=AccCache.bytes;
if( AccCache.AccPtr ) AccBytes+=AccCache.bytes;
if( AccCache.LRU_valid ) LruBytes1+=AccCache.bytes;
if( AccCache.LRU_valid ) LruCnt++;
if ( AccCache.cpuLock || AccCache.accLock ) {
assert(AccCache.LRU_valid==0);
std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t cpuLock " << AccCache.cpuLock
@ -509,6 +560,15 @@ void MemoryManager::Audit(std::string s)
assert( AccCache.cpuLock== 0 ) ;
assert( AccCache.accLock== 0 ) ;
}
std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
assert(LruBytes1==LruBytes2);
assert(LruBytes1==DeviceLRUBytes);
std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
assert(AccBytes==DeviceBytes);
std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
assert(LruCnt == LRU.size());
std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
}
void MemoryManager::PrintState(void* _CpuPtr)
@ -526,8 +586,8 @@ void MemoryManager::PrintState(void* _CpuPtr)
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
std::cout << GridLogMessage << "\tx"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\tx"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;

View File

@ -12,6 +12,8 @@ uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
uint64_t MemoryManager::DeviceEvictions;
uint64_t MemoryManager::DeviceDestroy;
void MemoryManager::Audit(std::string s){};
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
@ -22,6 +24,7 @@ void MemoryManager::PrintState(void* CpuPtr)
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){};
void MemoryManager::PrintAll(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);

View File

@ -291,8 +291,8 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
@ -306,8 +306,8 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
auto me = View(AcceleratorWriteDiscard);
auto him= r.View(AcceleratorRead);
auto me = View(AcceleratorWriteDiscard);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});

View File

@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
#if defined(GRID_CUDA)||defined(GRID_HIP)
#include <Grid/lattice/Lattice_reduction_gpu.h>
#endif
#if defined(GRID_SYCL)
#include <Grid/lattice/Lattice_reduction_sycl.h>
#endif
NAMESPACE_BEGIN(Grid);
@ -124,7 +127,7 @@ inline Double max(const Double *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sum_gpu(arg,osites);
#else
return sum_cpu(arg,osites);
@ -133,7 +136,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
@ -142,7 +145,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
return sumD_gpu_large(arg,osites);
#else
return sumD_cpu(arg,osites);
@ -152,13 +155,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu(&arg_v[0],osites);
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
typename vobj::scalar_object ssum;
autoView( arg_v, arg, AcceleratorRead);
ssum= sum_gpu(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum);
@ -168,7 +171,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu_large(&arg_v[0],osites);
@ -232,11 +235,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);

View File

@ -211,13 +211,28 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
assert(ok);
Integer smemSize = numThreads * sizeof(sobj);
Vector<sobj> buffer(numBlocks);
// UVM seems to be buggy under later CUDA drivers
// This fails on A100 and driver 5.30.02 / CUDA 12.1
// Fails with multiple NVCC versions back to 11.4,
// which worked with earlier drivers.
// Not sure which driver version first failed; this bears checking.
// It is awkward, as checking requires installing multiple driver versions.
#undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
accelerator_barrier();
auto result = buffer_v[0];
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
accelerator_barrier();
result = *buffer_v;
#endif
return result;
}

View File

@ -0,0 +1,125 @@
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
sobj identity; zeroit(identity);
sobj ret ;
Integer nsimd= vobj::Nsimd();
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{osites},
Reduction,
[=] (cl::sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
theGridAccelerator->wait();
ret = mysum[0];
free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret);
return dret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
return sumD_gpu_tensor(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
return sumD_gpu_large(lat,osites);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
// Return as same precision as input performing reduction in double precision though
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu(lat,osites);
return result;
}
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
sobj result;
result = sumD_gpu_large(lat,osites);
return result;
}
NAMESPACE_END(Grid);
/*
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
{
Double sumResult; zeroit(sumResult);
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
Double identity; zeroit(identity);
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum +=vec[index];
});
});
theGridAccelerator->wait();
Double ret = d_sum[0];
free(d_sum,*theGridAccelerator);
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
return ret;
}
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is too late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/

View File

@ -440,7 +440,17 @@ public:
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
#if 1
assert(rank == _grid->ThisRank() );
#else
//
if (rank != _grid->ThisRank() ){
std::cout <<"rank "<<rank<<" _grid->ThisRank() "<<_grid->ThisRank()<< std::endl;
// exit(-42);
// assert(0);
}
#endif
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;

View File

@ -1080,6 +1080,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
});
}
//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field)
template<class VobjOut, class VobjIn>
void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
@ -1097,9 +1098,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
precisionChange(vout,vin,N);
});
}
//Convert a Lattice from one precision to another
//Convert a Lattice from one precision to another (original, slow implementation)
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
assert(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){
@ -1145,6 +1146,128 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
});
}
//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
class precisionChangeWorkspace{
std::pair<Integer,Integer>* fmap_device; //device pointer
//maintain grids for checking
GridBase* _out_grid;
GridBase* _in_grid;
public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
}
int Nsimd_out = out_grid->Nsimd();
std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
for(int lane=0; lane < out_grid->Nsimd(); lane++)
out_grid->iCoorFromIindex(out_icorrs[lane], lane);
std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocorr;
out_grid->oCoorFromOindex(out_ocorr, out_oidx);
Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
//int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
//Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
//Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
int in_oidx = 0, in_lane = 0;
for(int d=0;d<in_grid->_ndimension;d++){
in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
}
fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
}
});
//Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
}
//Prevent moving or copying
precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
void checkGrids(GridBase* out, GridBase* in) const{
conformable(out, _out_grid);
conformable(in, _in_grid);
}
~precisionChangeWorkspace(){
acceleratorFreeDevice(fmap_device);
}
};
//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
template<class VobjOut, class VobjIn>
auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
if(out.Grid() == in.Grid()){
precisionChangeFast(out,in);
return 1;
}else{
return 0;
}
}
template<class VobjOut, class VobjIn>
int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
return 0;
}
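// (Design note on the trick above: the call site passes a literal 0, an int, so the
// decltype-constrained overload is preferred whenever precisionChange(vector_type*,...)
// is well-formed; otherwise SFINAE drops it and this long overload is chosen, returning 0.)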
//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
//which contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
if(_precisionChangeFastWrap(out,in,0)) return;
static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
constexpr int Nsimd_out = VobjOut::Nsimd();
workspace.checkGrids(out.Grid(),in.Grid());
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
//Do the copy/precision change
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
int in_oidx = fmap_osite[out_lane].first;
int in_lane = fmap_osite[out_lane].second;
copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
}
});
}
//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
if(_precisionChangeFastWrap(out,in,0)) return;
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
precisionChange(out, in, workspace);
}
////////////////////////////////////////////////////////////////////////////////
// Communicate between grids
////////////////////////////////////////////////////////////////////////////////

View File

@ -30,6 +30,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_PERFCOUNT_H
#define GRID_PERFCOUNT_H
#ifndef __SSC_START
#define __SSC_START
#define __SSC_STOP
#endif
#include <sys/time.h>
#include <ctime>
#include <chrono>

View File

@ -16,7 +16,7 @@
#ifdef __NVCC__
#pragma push
#if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"

View File

@ -507,9 +507,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
// Fermion <-> propagator assignments
//////////////////////////////////////////////
//template <class Prop, class Ferm>
#define FAST_FERM_TO_PROP
template <class Fimpl>
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
{
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,CpuWrite);
autoView(f_v,f,CpuRead);
thread_for(idx,p_v.oSites(),{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
}}
});
#else
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
@ -521,12 +532,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
}
pokeSpin(p, pjs, j, s);
}
#endif
}
//template <class Prop, class Ferm>
template <class Fimpl>
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
{
#ifdef FAST_FERM_TO_PROP
autoView(p_v,p,CpuRead);
autoView(f_v,f,CpuWrite);
thread_for(idx,p_v.oSites(),{
for(int ss = 0; ss < Ns; ++ss) {
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
}}
});
#else
for(int j = 0; j < Ns; ++j)
{
auto pjs = peekSpin(p, j, s);
@ -538,6 +560,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
}
pokeSpin(f, fj, j);
}
#endif
}
//////////////////////////////////////////////

View File

@ -205,15 +205,18 @@ public:
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
Clover += diag_mass;
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
CloverTriangleField& Triangle,
RealD csw_t, RealD diag_mass) {
static void InvertClover(CloverField& InvClover,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
// Do nothing
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
// TODO: implement Cmunu for better performance with compact layout, but don't do it
@ -238,9 +241,17 @@ public:
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
static void MassTerm(CloverField& Clover, RealD diag_mass) {
// do nothing!
// the mass term is multiplied into exp(Clover) below
// Can this be avoided?
static void IdentityTimesC(const CloverField& in, RealD c) {
int DimRep = Impl::Dimension;
autoView(in_v, in, AcceleratorWrite);
accelerator_for(ss, in.Grid()->oSites(), 1, {
for (int sa=0; sa<Ns; sa++)
for (int ca=0; ca<DimRep; ca++)
in_v[ss]()(sa,sa)(ca,ca) = c;
});
}
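// (Annotation: despite the const reference, the view is opened AcceleratorWrite and the field
//  is overwritten with c on the spin-colour diagonal, i.e. in = c * identity.)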
static int getNMAX(RealD prec, RealD R) {
@ -255,175 +266,62 @@ public:
return NMAX;
}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
typedef iMatrix<ComplexD,6> mat;
GridBase* grid = Clover.Grid();
CloverField ExpClover(grid);
RealD qn[6];
RealD qnold[6];
RealD p[5];
RealD trA2, trA3, trA4;
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
mat A2, A3, A4, A5;
A2 = alpha * alpha * arg * arg;
A3 = alpha * arg * A2;
A4 = A2 * A2;
A5 = A2 * A3;
Clover *= (1.0/diag_mass);
trA2 = toReal( trace(A2) );
trA3 = toReal( trace(A3) );
trA4 = toReal( trace(A4));
p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
p[3] = trA3 / 3.0;
p[4] = 0.5 * trA2;
qnold[0] = cN[Niter];
qnold[1] = 0.0;
qnold[2] = 0.0;
qnold[3] = 0.0;
qnold[4] = 0.0;
qnold[5] = 0.0;
for(int i = Niter-1; i >= 0; i--)
{
qn[0] = p[0] * qnold[5] + cN[i];
qn[1] = p[1] * qnold[5] + qnold[0];
qn[2] = p[2] * qnold[5] + qnold[1];
qn[3] = p[3] * qnold[5] + qnold[2];
qn[4] = p[4] * qnold[5] + qnold[3];
qn[5] = qnold[4];
qnold[0] = qn[0];
qnold[1] = qn[1];
qnold[2] = qn[2];
qnold[3] = qn[3];
qnold[4] = qn[4];
qnold[5] = qn[5];
}
mat unit(1.0);
dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
}
static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
GridBase* grid = Diagonal.Grid();
int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
//
// Implementation completely in Daniel's layout
//
// Taylor expansion with Cayley-Hamilton recursion
// underlying Horner scheme as above
// Taylor expansion, slow but generic
// Horner scheme: a0 + a1 x + a2 x^2 + ... = a0 + x (a1 + x (a2 + ...))
// q_N = c_N
// q_n = c_n + q_{n+1} X
std::vector<RealD> cn(NMAX+1);
cn[0] = 1.0;
for (int i=1; i<=NMAX; i++){
for (int i=1; i<=NMAX; i++)
cn[i] = cn[i-1] / RealD(i);
}
// Taken over from Daniel's implementation
conformable(Diagonal, Triangle);
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * Clover + cn[i];
long lsites = grid->lSites();
{
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
typedef iMatrix<ComplexD,6> mat;
// prepare inverse
CloverInv = (-1.0)*Clover;
autoView(diagonal_v, Diagonal, CpuRead);
autoView(triangle_v, Triangle, CpuRead);
autoView(diagonalExp_v, Diagonal, CpuWrite);
autoView(triangleExp_v, Triangle, CpuWrite);
Clover = ExpClover * diag_mass;
thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
ExpClover = Zero();
IdentityTimesC(ExpClover, cn[NMAX]);
for (int i=NMAX-1; i>=0; i--)
ExpClover = ExpClover * CloverInv + cn[i];
mat srcCloverOpUL(0.0); // upper left block
mat srcCloverOpLR(0.0); // lower right block
mat ExpCloverOp;
CloverInv = ExpClover * (1.0/diag_mass);
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_exp_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_exp_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
int block;
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
}
else{
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
}
}
}
// exp(Clover)
ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 0;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
block = 1;
for(int i = 0; i < 6; i++){
for(int j = 0; j < 6; j++){
if (i == j){
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
}
else if(i < j){
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
}
}
}
pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
});
}
Diagonal *= diag_mass;
Triangle *= diag_mass;
}
static void InvertClover(CloverField& InvClover,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv,
bool fixedBoundaries) {
if (fixedBoundaries)
{
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
}
else
{
CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
}
}
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
assert(0);
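// Annotation (not commit code): both InstantiateClover above and ExponentiateHermitean6by6
// evaluate exp(A) ~= sum_{n<=NMAX} A^n/n! by the same Horner recursion, schematically
//   R = unit*cn[NMAX];
//   for(int i=NMAX-1;i>=0;i--) R = R*A + unit*cn[i];   // cn[i] = cn[i-1]/i, cn[0] = 1
// with NMAX fixed by getNMAX from the target precision and the norm bound R = 3*csw_t/diag_mass;
// the 6x6 path additionally uses the Cayley-Hamilton relation to fold powers above A^5 back
// into the span of {1, A, ..., A^5}.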

View File

@ -225,7 +225,7 @@ public:
RealD csw_t;
RealD cF;
bool open_boundaries;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;

View File

@ -36,7 +36,7 @@ NAMESPACE_BEGIN(Grid);
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=1;
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
@ -110,7 +110,7 @@ public:
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
@ -209,7 +209,7 @@ public:
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
@ -320,7 +320,7 @@ public:
typedef decltype(coalescedRead(in0)) sobj;
typedef decltype(coalescedRead(out0)) hsobj;
unsigned int Nsimd = vobj::Nsimd();
constexpr unsigned int Nsimd = vobj::Nsimd();
unsigned int mask = Nsimd >> (type + 1);
int lane = acceleratorSIMTlane(Nsimd);
int j0 = lane &(~mask); // inner coor zero
@ -484,24 +484,26 @@ public:
int dag = compress.dag;
int face_idx=0;
#define vet_same_node(a,b) \
{ auto tmp = b; }
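// (Annotation: vet_same_node keeps the HaloGatherDir call for its side effects while
//  discarding the same-node comparison that the asserts it replaces used to enforce.)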
if ( dag ) {
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx));
vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx));
vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx));
vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx));
vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx));
vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx));
} else {
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx));
vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx));
vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx));
vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx));
vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx));
vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx));
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);

View File

@ -48,7 +48,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&Fgrid), Triangle(&Fgrid)
, DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
, DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
@ -67,7 +67,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (open_boundaries) {
if (fixedBoundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
@ -77,31 +77,31 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->open_boundaries) ApplyBoundaryMask(out);
if(this->fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->open_boundaries) {
if(this->fixedBoundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
@ -112,7 +112,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
@ -121,19 +121,19 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
@ -147,7 +147,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
@ -166,7 +166,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(open_boundaries) ApplyBoundaryMask(out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!open_boundaries); // TODO check for changes required for open bc
assert(!fixedBoundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
@ -305,6 +305,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
@ -324,24 +325,27 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Handle mass term based on clover policy
CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
// Convert the data layout of the clover term
// Instantiate the clover term
// - In case of the standard clover the mass term is added
// - In case of the exponential clover the clover term is exponentiated
double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Exponentiate the clover (nothing happens in case of the standard clover)
double t5 = usecond();
CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
// Possibly modify the boundary values
// Modify the clover term at the temporal boundaries in case of open boundary conditions
double t6 = usecond();
if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
// Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
// Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions, exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
// TODO: For now this inversion is explicitly done on the CPU
double t7 = usecond();
CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields
double t8 = usecond();
@ -362,10 +366,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "exponentiation = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}

View File

@ -439,6 +439,17 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
int sF = ss*Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
});
#define ASM_CALL_SLICE(A) \
auto grid = in.Grid() ; \
int nt = grid->LocalDimensions()[4]; \
int nxyz = Nsite/nt ; \
for(int t=0;t<nt;t++){ \
thread_for( sss, nxyz, { \
int ss = t*nxyz+sss; \
int sU = ss; \
int sF = ss*Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
});}
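// Annotation (reading of the index arithmetic above, not additional commit code):
//   nxyz = Nsite/nt;       // 4d sites per local time slice
//   ss   = t*nxyz + sss;   // site index rebuilt from (slice t, intra-slice index sss)
//   sF   = ss*Ls;          // first 5th-dimension index for that 4d site
// i.e. the slice variant issues the kernel one local time slice at a time.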
template <class Impl>
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,

View File

@ -127,6 +127,8 @@ NAMESPACE_BEGIN(Grid);
ApproxNegPowerAction.tolerances[i] = action_tolerance[i];
ApproxHalfPowerAction.tolerances[i] = action_tolerance[i];
ApproxNegHalfPowerAction.tolerances[i]= action_tolerance[i];
}
for(int i=0;i<ApproxPowerMD.tolerances.size();i++){
ApproxPowerMD.tolerances[i] = md_tolerance[i];
ApproxNegPowerMD.tolerances[i] = md_tolerance[i];
ApproxHalfPowerMD.tolerances[i] = md_tolerance[i];

View File

@ -29,6 +29,8 @@
#ifndef QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
#define QCD_PSEUDOFERMION_GENERAL_EVEN_ODD_RATIONAL_RATIO_MIXED_PREC_H
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftCleanup.h>
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -58,7 +60,7 @@ NAMESPACE_BEGIN(Grid);
//Allow derived classes to override the multishift CG
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, FermionFieldD &out){
#if 0
SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOp : DenOp);
SchurDifferentiableOperator<ImplD> schurOp(numerator ? NumOpD : DenOpD);
ConjugateGradientMultiShift<FermionFieldD> msCG(MaxIter, approx);
msCG(schurOp,in, out);
#else
@ -66,7 +68,8 @@ NAMESPACE_BEGIN(Grid);
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
// Action better with higher precision?
ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
precisionChange(inD2,in);
std::cout << "msCG single solve "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
@ -76,12 +79,12 @@ NAMESPACE_BEGIN(Grid);
}
virtual void multiShiftInverse(bool numerator, const MultiShiftFunction &approx, const Integer MaxIter, const FermionFieldD &in, std::vector<FermionFieldD> &out_elems, FermionFieldD &out){
SchurDifferentiableOperator<ImplD2> schurOpD2(numerator ? NumOpD2 : DenOpD2);
SchurDifferentiableOperator<ImplF> schurOpF(numerator ? NumOpF : DenOpF);
SchurDifferentiableOperator<ImplF> schurOpF (numerator ? NumOpF : DenOpF);
FermionFieldD2 inD2(NumOpD2.FermionRedBlackGrid());
FermionFieldD2 outD2(NumOpD2.FermionRedBlackGrid());
std::vector<FermionFieldD2> out_elemsD2(out_elems.size(),NumOpD2.FermionRedBlackGrid());
ConjugateGradientMultiShiftMixedPrec<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
ConjugateGradientMultiShiftMixedPrecCleanup<FermionFieldD2, FermionFieldF> msCG(MaxIter, approx, NumOpF.FermionRedBlackGrid(), schurOpF, ReliableUpdateFreq);
precisionChange(inD2,in);
std::cout << "msCG in "<<norm2(inD2)<<" " <<norm2(in)<<std::endl;
msCG(schurOpD2, inD2, out_elemsD2, outD2);
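// Annotation (not commit code): the convert-solve-convert pattern here is, schematically,
//   precisionChange(inD2, in);                  // down-convert the source to the D2 layout
//   msCG(schurOpD2, inD2, out_elemsD2, outD2);  // multishift solve, mixed precision inside
//   precisionChange(out, outD2);                // up-convert outD2 (and each out_elemsD2[i])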

View File

@ -284,6 +284,15 @@ public:
<< as[level].actions.at(actionID)->deriv_us*1.0e-6<<" s"<< std::endl;
}
}
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
std::cout << GridLogMessage << " Dslash counts "<<std::endl;
std::cout << GridLogMessage << "------------------------- "<<std::endl;
uint64_t full, partial, dirichlet;
DslashGetCounts(dirichlet,partial,full);
std::cout << GridLogMessage << " Full BCs : "<<full<<std::endl;
std::cout << GridLogMessage << " Partial dirichlet BCs : "<<partial<<std::endl;
std::cout << GridLogMessage << " Dirichlet BCs : "<<dirichlet<<std::endl;
std::cout << GridLogMessage << "--------------------------- "<<std::endl;
std::cout << GridLogMessage << " Force average size "<<std::endl;
std::cout << GridLogMessage << "------------------------- "<<std::endl;

View File

@ -36,7 +36,7 @@ public:
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
assert( (table.size()&0x1)==0);

View File

@ -29,6 +29,27 @@
NAMESPACE_BEGIN(Grid);
uint64_t DslashFullCount;
uint64_t DslashPartialCount;
uint64_t DslashDirichletCount;
void DslashResetCounts(void)
{
DslashFullCount=0;
DslashPartialCount=0;
DslashDirichletCount=0;
}
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
{
dirichlet = DslashDirichletCount;
partial = DslashPartialCount;
full = DslashFullCount;
}
void DslashLogFull(void) { DslashFullCount++;}
void DslashLogPartial(void) { DslashPartialCount++;}
void DslashLogDirichlet(void){ DslashDirichletCount++;}
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table)
{

View File

@ -91,11 +91,14 @@ void Gather_plane_simple_table (commVector<std::pair<int,int> >& table,const Lat
///////////////////////////////////////////////////////////////////
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(const Lattice<vobj> &rhs,
commVector<cobj *> pointers,int dimension,int plane,int cbmask,compressor &compress,int type) __attribute__((noinline));
commVector<cobj *> pointers,
int dimension,int plane,
int cbmask,compressor &compress,int type) __attribute__((noinline));
template<class cobj,class vobj,class compressor>
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
std::vector<cobj *> &pointers,int dimension,int plane,int cbmask,
compressor &compress,int type)
{
assert( (table.size()&0x1)==0);
@ -103,19 +106,26 @@ void Gather_plane_exchange_table(commVector<std::pair<int,int> >& table,const La
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto rhs_p = &rhs_v[0];
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
accelerator_forNB(j, num, vobj::Nsimd(), {
compress.CompressExchange(p0,p1, &rhs_v[0], j,
so+tp[2*j ].second,
so+tp[2*j+1].second,
type);
compress.CompressExchange(p0,p1, rhs_p, j,
so+tp[2*j ].second,
so+tp[2*j+1].second,
type);
});
rhs_v.ViewClose();
}
*/
void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void);
void DslashLogPartial(void);
void DslashLogDirichlet(void);
struct StencilEntry {
#ifdef GRID_CUDA
uint64_t _byte_offset; // 8 bytes
@ -257,8 +267,8 @@ public:
struct Merge {
static constexpr int Nsimd = vobj::Nsimd();
cobj * mpointer;
Vector<scalar_object *> rpointers;
Vector<cobj *> vpointers;
// std::vector<scalar_object *> rpointers;
std::vector<cobj *> vpointers;
Integer buffer_size;
Integer type;
Integer partial; // partial dirichlet BCs
@ -308,6 +318,7 @@ public:
int face_table_computed;
int partialDirichlet;
int fullDirichlet;
std::vector<commVector<std::pair<int,int> > > face_table ;
Vector<int> surface_list;
@ -358,7 +369,7 @@ public:
return 0;
}
#else
//
// fancy calculation for shm code
inline int SameNode(int point) {
int dimension = this->_directions[point];
@ -378,7 +389,7 @@ public:
int nbr_proc;
if (displacement>0) nbr_proc = 1;
else nbr_proc = pd-1;
else nbr_proc = pd-1;
// FIXME this logic needs to be sorted for three link term
// assert( (displacement==1) || (displacement==-1));
@ -432,11 +443,15 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
_grid->StencilBarrier();// Synch shared memory on a single nodes
}
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
_grid->StencilSendToRecvFromComplete(MpiReqs,0);
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull();
}
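// Annotation (not commit code): each CommunicateComplete increments exactly one of the three
// Dslash counters, so per-trajectory totals are obtained as
//   DslashResetCounts();  /* ... run trajectory ... */  DslashGetCounts(dirichlet,partial,full);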
////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel.
@ -625,7 +640,7 @@ public:
d.buffer_size = buffer_size;
dv.push_back(d);
}
void AddMerge(cobj *merge_p,Vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m;
m.partial = this->partialDirichlet;
m.dims = _grid->_fdimensions;
@ -765,6 +780,10 @@ public:
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
partialDirichlet = p.partialDirichlet;
DirichletBlock(p.dirichlet); // comms send/recv set up
fullDirichlet=0;
for(int d=0;d<p.dirichlet.size();d++){
if (p.dirichlet[d]) fullDirichlet=1;
}
_unified_buffer_size=0;
surface_list.resize(0);
@ -1268,8 +1287,8 @@ public:
assert(bytes*simd_layout == reduced_buffer_size*datum_bytes);
Vector<cobj *> rpointers(maxl);
Vector<cobj *> spointers(maxl);
std::vector<cobj *> rpointers(maxl);
std::vector<cobj *> spointers(maxl);
///////////////////////////////////////////
// Work out what to send where

View File

@ -226,7 +226,7 @@ template<class vobjOut, class vobjIn>
accelerator_inline
void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __restrict__ vecIn, int lane_in)
{
static_assert( std::is_same<typename vobjOut::DoublePrecision, typename vobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if the tensor types are the same, the DoublePrecision types must also match
static_assert( std::is_same<typename vobjOut::scalar_typeD, typename vobjIn::scalar_typeD>::value == 1, "copyLane: tensor types must be the same" ); //if the tensor types are the same, the double-precision scalar types must also match
typedef typename vobjOut::vector_type ovector_type;
typedef typename vobjIn::vector_type ivector_type;
@ -251,9 +251,9 @@ void copyLane(vobjOut & __restrict__ vecOut, int lane_out, const vobjIn & __rest
ovector_type * __restrict__ op = (ovector_type *)&vecOut;
ivector_type * __restrict__ ip = (ivector_type *)&vecIn;
for(int w=0;w<owords;w++){
itmp = ip[iNsimd*w].getlane(lane_in);
itmp = ip[w].getlane(lane_in);
otmp = itmp; //potential precision change
op[oNsimd*w].putlane(otmp,lane_out);
op[w].putlane(otmp,lane_out);
}
}
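// Annotation (not commit code): vecOut holds owords vector words, so lane lane_out of word w
// lives in op[w]; the old op[oNsimd*w] (and ip[iNsimd*w]) stride indexed past the object for
// w>0, presumably reading and writing out of bounds.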

View File

@ -305,14 +305,14 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
}); \
});
#define accelerator_barrier(dummy) { printf(" theGridAccelerator::wait()\n"); theGridAccelerator->wait(); }
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { printf(" theCopyAccelerator::wait()\n"); theCopyAccelerator->wait(); }
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}

View File

@ -167,14 +167,13 @@ void GridCmdOptionInt(std::string &str,int & val)
return;
}
void GridCmdOptionFloat(std::string &str,float & val)
void GridCmdOptionFloat(std::string &str,double & val)
{
std::stringstream ss(str);
ss>>val;
return;
}
void GridParseLayout(char **argv,int argc,
Coordinate &latt_c,
Coordinate &mpi_c)

View File

@ -57,7 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
template<class VectorInt>
void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
void GridCmdOptionInt(std::string &str,int & val);
void GridCmdOptionFloat(std::string &str,float & val);
void GridCmdOptionFloat(std::string &str,double & val);
void GridParseLayout(char **argv,int argc,

View File

@ -164,11 +164,6 @@ int main(int argc, char **argv) {
typedef MobiusEOFAFermionF FermionEOFAActionF;
typedef typename FermionActionF::FermionField FermionFieldF;
typedef WilsonImplD2 FermionImplPolicyD2;
typedef MobiusFermionD2 FermionActionD2;
typedef MobiusEOFAFermionD2 FermionEOFAActionD2;
typedef typename FermionActionD2::FermionField FermionFieldD2;
typedef Grid::XmlReader Serialiser;
//::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
@ -232,31 +227,34 @@ int main(int argc, char **argv) {
// std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
OneFlavourRationalParams OFRp; // Up/down
OFRp.lo = 4.0e-5;
int SP_iters=10000;
RationalActionParams OFRp; // Up/down
OFRp.lo = 6.0e-5;
OFRp.hi = 90.0;
OFRp.MaxIter = 60000;
OFRp.tolerance= 1.0e-5;
OFRp.mdtolerance= 1.0e-3;
OFRp.inv_pow = 2;
OFRp.MaxIter = SP_iters; // get most shifts by 2000, stop sharing space
OFRp.action_tolerance= 1.0e-8;
OFRp.action_degree = 18;
OFRp.md_tolerance= 1.0e-5;
OFRp.md_degree = 14;
// OFRp.degree = 20; converges
// OFRp.degree = 16;
OFRp.degree = 18;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
std::vector<RealD> ActionTolByPole({
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-7,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8
});
std::vector<RealD> MDTolByPole({
1.0e-5,5.0e-6,1.0e-6,1.0e-7, // soften convergence more
1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence more
// 1.0e-6,3.0e-7,1.0e-7,1.0e-7,
// 3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8
});
@ -265,10 +263,8 @@ int main(int argc, char **argv) {
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
typedef SchurDiagMooeeOperator<FermionAction ,FermionField > LinearOperatorD;
typedef SchurDiagMooeeOperator<FermionActionD2,FermionFieldD2 > LinearOperatorD2;
typedef SchurDiagMooeeOperator<FermionEOFAActionF,FermionFieldF> LinearOperatorEOFAF;
typedef SchurDiagMooeeOperator<FermionEOFAAction ,FermionField > LinearOperatorEOFAD;
typedef SchurDiagMooeeOperator<FermionEOFAActionD2,FermionFieldD2 > LinearOperatorEOFAD2;
typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusFermionD,MobiusFermionF,LinearOperatorD,LinearOperatorF> MxPCG;
typedef MixedPrecisionConjugateGradientOperatorFunction<MobiusEOFAFermionD,MobiusEOFAFermionF,LinearOperatorEOFAD,LinearOperatorEOFAF> MxPCG_EOFA;
@ -321,7 +317,6 @@ int main(int argc, char **argv) {
// temporarily need a gauge field
LatticeGaugeFieldD U(GridPtr); U=Zero();
LatticeGaugeFieldF UF(GridPtrF); UF=Zero();
LatticeGaugeFieldD2 UD2(GridPtrF); UD2=Zero();
std::cout << GridLogMessage << " Running the HMC "<< std::endl;
TheHMC.ReadCommandLine(argc,argv); // params on CML or from param file
@ -340,6 +335,7 @@ int main(int argc, char **argv) {
ParamsDirF.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=1;
ParamsDirF.partialDirichlet=1;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@ -424,7 +420,7 @@ int main(int argc, char **argv) {
ActionCGL, ActionCGR,
DerivativeCGL, DerivativeCGR,
SFRp, true);
// Level2.push_back(&EOFA);
Level2.push_back(&EOFA);
////////////////////////////////////
// up down action
@ -449,17 +445,15 @@ int main(int argc, char **argv) {
std::vector<FermionAction *> Denominators;
std::vector<FermionActionF *> NumeratorsF;
std::vector<FermionActionF *> DenominatorsF;
std::vector<FermionActionD2 *> NumeratorsD2;
std::vector<FermionActionD2 *> DenominatorsD2;
std::vector<TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy> *> Quotients;
std::vector<MxPCG *> ActionMPCG;
std::vector<MxPCG *> MPCG;
#define MIXED_PRECISION
#ifdef MIXED_PRECISION
std::vector<OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2> *> Bdys;
std::vector<GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy> *> Bdys;
#else
std::vector<OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
std::vector<GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> *> Bdys;
#endif
typedef SchurDiagMooeeOperator<FermionActionF,FermionFieldF> LinearOperatorF;
@ -532,31 +526,19 @@ int main(int argc, char **argv) {
Quotients.push_back (new TwoFlavourEvenOddRatioPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],*MPCG[h],*ActionMPCG[h],CG));
} else {
#ifdef MIXED_PRECISION
// Use the D2 data types and make them use same grid as single
FermionActionD2::ImplParams ParamsDenD2(boundary);
FermionActionD2::ImplParams ParamsNumD2(boundary);
ParamsDenD2.dirichlet = ParamsDen.dirichlet;
ParamsDenD2.partialDirichlet = ParamsDen.partialDirichlet;
DenominatorsD2.push_back(new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenD2));
ParamsNumD2.dirichlet = ParamsNum.dirichlet;
ParamsNumD2.partialDirichlet = ParamsNum.partialDirichlet;
NumeratorsD2.push_back (new FermionActionD2(UD2,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumD2));
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>(
*Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h],
*NumeratorsD2[h],*DenominatorsD2[h],
OFRp, 400) );
Bdys.push_back( new OneFlavourEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicyD2>(
*Numerators[h],*Denominators[h],
OFRp, SP_iters) );
Bdys.push_back( new GeneralEvenOddRatioRationalMixedPrecPseudoFermionAction<FermionImplPolicy,FermionImplPolicyF,FermionImplPolicy>(
*Numerators[h],*Denominators[h],
*NumeratorsF[h],*DenominatorsF[h],
*NumeratorsD2[h],*DenominatorsD2[h],
OFRp, 400) );
*Numerators[h],*Denominators[h],
OFRp, SP_iters) );
#else
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
Bdys.push_back( new GeneralEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy>(*Numerators[h],*Denominators[h],OFRp));
#endif
}
}

View File

@ -183,7 +183,7 @@ int main(int argc, char **argv) {
// 4/2 => 0.6 dH
// 3/3 => 0.8 dH .. depth 3, slower
//MD.MDsteps = 4;
MD.MDsteps = 3;
MD.MDsteps = 12;
MD.trajL = 0.5;
HMCparameters HMCparams;
@ -200,8 +200,8 @@ int main(int argc, char **argv) {
TheHMC.Resources.AddFourDimGrid("gauge"); // use default simd lanes decomposition
CheckpointerParameters CPparams;
CPparams.config_prefix = "ckpoint_DDHMC_lat";
CPparams.rng_prefix = "ckpoint_DDHMC_rng";
CPparams.config_prefix = "ckpoint_HMC_lat";
CPparams.rng_prefix = "ckpoint_HMC_rng";
CPparams.saveInterval = 1;
CPparams.format = "IEEE64BIG";
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
@ -228,7 +228,7 @@ int main(int argc, char **argv) {
Real pv_mass = 1.0;
// std::vector<Real> hasenbusch({ 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
// std::vector<Real> hasenbusch({ light_mass, 0.01, 0.045, 0.108, 0.25, 0.51 , pv_mass });
std::vector<Real> hasenbusch({ light_mass, 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
std::vector<Real> hasenbusch({ 0.005, 0.0145, 0.045, 0.108, 0.25, 0.51 , pv_mass }); // Updated
// std::vector<Real> hasenbusch({ light_mass, 0.0145, 0.045, 0.108, 0.25, 0.51 , 0.75 , pv_mass });
auto GridPtr = TheHMC.Resources.GetCartesian();
@ -299,8 +299,8 @@ int main(int argc, char **argv) {
////////////////////////////////////
// Collect actions
////////////////////////////////////
ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(3);
// ActionLevel<HMCWrapper::Field> Level1(1);
ActionLevel<HMCWrapper::Field> Level2(1);
ActionLevel<HMCWrapper::Field> Level3(15);
////////////////////////////////////
@ -369,7 +369,7 @@ int main(int argc, char **argv) {
ActionCGL, ActionCGR,
DerivativeCGL, DerivativeCGR,
SFRp, true);
// Level2.push_back(&EOFA);
Level2.push_back(&EOFA);
////////////////////////////////////
// up down action
@ -477,7 +477,7 @@ int main(int argc, char **argv) {
// Gauge action
/////////////////////////////////////////////////////////////
Level3.push_back(&GaugeAction);
TheHMC.TheAction.push_back(Level1);
// TheHMC.TheAction.push_back(Level1);
TheHMC.TheAction.push_back(Level2);
TheHMC.TheAction.push_back(Level3);
std::cout << GridLogMessage << " Action complete "<< std::endl;

TODO
View File

@ -1,3 +1,12 @@
- - Slice sum optimisation & A2A - atomic addition
- - Also faster non-atomic reduction
- - Remaining PRs
- - DDHMC
- - MixedPrec is the action eval, high precision
- - MixedPrecCleanup is the force eval, low precision
=================
=================
Lattice_basis.h -- > HIP and SYCL GPU code
@ -8,6 +17,7 @@ DDHMC
-- Multishift Mixed Precision - DONE
-- Pole dependent residual - DONE
=======
-- comms threads issue??
-- Part done: Staggered kernel performance on GPU

View File

@ -0,0 +1,131 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
Coordinate latt4= GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate simd = GridDefaultSimd(Nd,vComplexF::Nsimd());
GridLogLayout();
int Ls=16;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4,simd ,mpi);
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionF src (FGrid); random(RNG5,src);
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeFieldF Umu(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
DomainWallFermionF Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
const int ncall = 500;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionF::HaloGatherOpt "<<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
typename DomainWallFermionF::Compressor compressor(0);
FGrid->Barrier();
Dw.Stencil.HaloExchangeOptGather(src,compressor);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Stencil.HaloExchangeOptGather(src,compressor);
}
double t1=usecond();
FGrid->Barrier();
double bytes=0.0;
if(mpi[0]) bytes+=latt4[1]*latt4[2]*latt4[3];
if(mpi[1]) bytes+=latt4[0]*latt4[2]*latt4[3];
if(mpi[2]) bytes+=latt4[0]*latt4[1]*latt4[3];
if(mpi[3]) bytes+=latt4[0]*latt4[1]*latt4[2];
bytes = bytes * Ls * 8.* (24.+12.)* 2.0;
std::cout<<GridLogMessage << "Gather us /call = "<< (t1-t0)/ncall<<std::endl;
std::cout<<GridLogMessage << "Gather MBs /call = "<< bytes*ncall/(t1-t0)<<std::endl;
}
Grid_finalize();
exit(0);
}

View File

@ -0,0 +1,189 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_prec_change.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int Ls = 12;
Coordinate latt4 = GridDefaultLatt();
GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGridD); RNG4.SeedFixedIntegers(seeds4);
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionD field_d(FGridD), tmp_d(FGridD);
random(RNG5,field_d); tmp_d = field_d;
LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
precisionChange(field_d2, field_d); tmp_d2 = field_d2;
LatticeFermionF field_f(FGridF), tmp_f(FGridF);
precisionChange(field_f, field_d); tmp_f = field_f;
int N = 500;
double time_ds = 0, time_sd = 0;
std::cout<<GridLogMessage << "Benchmarking single<->double original implementation (fields initially device-resident)" << std::endl;
for(int i=0;i<N;i++){
//We want to benchmark the typical scenario of both fields being device resident
//To do this, invoke an operation that will open a device view and touch all sites
//with a write operation that invalidates the CPU copy
field_d = tmp_d;
field_f = tmp_f;
double start=usecond();
precisionChangeOrig(field_d,field_f);
double stop=usecond();
time_sd += stop - start;
field_d = tmp_d;
field_f = tmp_f;
start=usecond();
precisionChangeOrig(field_f,field_d);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
std::cout<<GridLogMessage << "Benchmarking single<->double with pregenerated workspace(fields initially device-resident)" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d = tmp_d;
field_f = tmp_f;
double start=usecond();
precisionChange(field_d,field_f, wk_sp_to_dp);
double stop=usecond();
time_sd += stop - start;
field_d = tmp_d;
field_f = tmp_f;
start=usecond();
precisionChange(field_f,field_d, wk_dp_to_sp);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
std::cout<<GridLogMessage << "Benchmarking single<->double with workspace generated on-the-fly (fields initially device-resident)" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d = tmp_d;
field_f = tmp_f;
double start=usecond();
precisionChange(field_d,field_f);
double stop=usecond();
time_sd += stop - start;
field_d = tmp_d;
field_f = tmp_f;
start=usecond();
precisionChange(field_f,field_d);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
std::cout<<GridLogMessage << "Benchmarking single<->double2 (fields initially device-resident)" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d2 = tmp_d2;
field_f = tmp_f;
double start=usecond();
precisionChangeFast(field_d2,field_f);
double stop=usecond();
time_sd += stop - start;
field_d2 = tmp_d2;
field_f = tmp_f;
start=usecond();
precisionChangeFast(field_f,field_d2);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
std::cout<<GridLogMessage << "Benchmarking single<->double2 through standard precisionChange call(fields initially device-resident) [NB: perf should be the same as the previous test!]" << std::endl;
time_sd = time_ds = 0;
for(int i=0;i<N;i++){
field_d2 = tmp_d2;
field_f = tmp_f;
double start=usecond();
precisionChange(field_d2,field_f);
double stop=usecond();
time_sd += stop - start;
field_d2 = tmp_d2;
field_f = tmp_f;
start=usecond();
precisionChange(field_f,field_d2);
stop=usecond();
time_ds += stop - start;
}
std::cout << "d->s " << time_ds/N << "us" << " s->d " << time_sd/N << "us" << std::endl;
Grid_finalize();
}

View File

@ -1,12 +1,13 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-unified=yes \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--enable-simd=GPU \
--disable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \

systems/PVC/benchmarks/run-1tile.sh Normal file → Executable file
View File

@ -6,7 +6,7 @@
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=16
export NT=8
export I_MPI_OFFLOAD=1
export I_MPI_OFFLOAD_TOPOLIB=level_zero
@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
export ZE_AFFINITY_MASK=0.0
mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8
mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768
export ZE_AFFINITY_MASK=0
export I_MPI_OFFLOAD_CELL=device

View File

@ -1,13 +1,13 @@
#!/bin/bash
##SBATCH -p PVC-SPR-QZEH
##SBATCH -p PVC-ICX-QZNW
#SBATCH -p QZ1J-ICX-PVC
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
export NT=16
# export IGC_EnableLSCFenceUGMBeforeEOT=0
# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
#export IGC_ShaderDumpEnable=1
@ -19,8 +19,11 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
export I_MPI_OFFLOAD_CELL=tile
export EnableImplicitScaling=0
export EnableWalkerPartition=0
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 1tile.log
#mpiexec -launcher ssh -n 1 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0 > 1tile.log
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 > 2tile.log
mpiexec -launcher ssh -n 2 -host localhost ./wrap.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 0

View File

@ -5,10 +5,10 @@ export ZE_AFFINITY_MASK=0.$MPI_LOCALRANKID
echo Rank $MPI_LOCALRANKID ZE_AFFINITY_MASK is $ZE_AFFINITY_MASK
if [ $MPI_LOCALRANKID = "0" ]
then
#if [ $MPI_LOCALRANKID = "0" ]
#then
# ~psteinbr/build_pti/ze_tracer -c $@
onetrace --chrome-kernel-timeline $@
else
# onetrace --chrome-kernel-timeline $@
#else
$@
fi
#fi

View File

@ -1,15 +1,16 @@
INSTALL=/nfs/site/home/azusayax/install
INSTALL=/nfs/site/home/paboylx/prereqs/
../../configure \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-comms=mpi \
--enable-comms=mpi-auto \
--disable-accelerator-cshift \
--disable-gparity \
--disable-fermion-reps \
--enable-shm=nvlink \
--enable-accelerator=sycl \
--enable-unified=yes \
CXX=mpicxx \
--enable-unified=no \
MPICXX=mpicxx \
CXX=dpcpp \
LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
CXXFLAGS="-cxx=icpx -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-constant-compare"
CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"

View File

@ -1,5 +1,6 @@
export https_proxy=http://proxy-chain.intel.com:911
export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=/nfs/site/home/azusayax/install/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$HOME/prereqs/lib/:$LD_LIBRARY_PATH
module load intel-release
source /opt/intel/oneapi/PVC_setup.sh

View File

@ -1,3 +1,2 @@
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU-RRII --enable-comms=mpi
#CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GPU --enable-debug --enable-comms=mpi
CXX=mpicxx-openmpi-mp CXXFLAGS=-I/opt/local/include/ LDFLAGS=-L/opt/local/lib/ ../../configure --enable-simd=GEN --enable-debug --enable-comms=mpi --enable-unified=yes

View File

@ -101,7 +101,7 @@ int main (int argc, char ** argv)
std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
int iters;
for(int i=0;i<200;i++){
for(int i=0;i<10;i++){
result_o = Zero();
t1=usecond();
mCG(src_o,result_o);

View File

@@ -1,35 +1,12 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_cshift.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace Grid;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT,
Gamma::Algebra::Gamma5
};
int main (int argc, char ** argv)
{
@@ -49,22 +26,8 @@ int main (int argc, char ** argv)
GridCartesian GRID(latt_size,simd_layout,mpi_layout);
GridRedBlackCartesian RBGRID(&GRID);
LatticeComplexD one(&GRID);
LatticeComplexD zz(&GRID);
LatticeComplexD C(&GRID);
LatticeComplexD Ctilde(&GRID);
LatticeComplexD Cref (&GRID);
LatticeComplexD Csav (&GRID);
LatticeComplexD coor(&GRID);
LatticeSpinMatrixD S(&GRID);
LatticeSpinMatrixD Stilde(&GRID);
Coordinate p({1,3,2,3});
one = ComplexD(1.0,0.0);
zz = ComplexD(0.0,0.0);
ComplexD ci(0.0,1.0);
std::vector<int> seeds({1,2,3,4});
@@ -73,7 +36,6 @@ int main (int argc, char ** argv)
pRNG.SeedFixedIntegers(seeds);
LatticeGaugeFieldD Umu(&GRID);
SU<Nc>::ColdConfiguration(pRNG,Umu); // Unit gauge
////////////////////////////////////////////////////
@@ -81,17 +43,79 @@ int main (int argc, char ** argv)
////////////////////////////////////////////////////
{
LatticeFermionD src(&GRID); gaussian(pRNG,src);
LatticeFermionD src_p(&GRID);
LatticeFermionD tmp(&GRID);
LatticeFermionD ref(&GRID);
LatticeFermionD result(&GRID);
RealD mass=0.01;
RealD mass=0.1;
WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
Dw.M(src,tmp);
Dw.M(src,ref);
std::cout << "Norm src "<<norm2(src)<<std::endl;
std::cout << "Norm Dw x src "<<norm2(ref)<<std::endl;
{
FFT theFFT(&GRID);
////////////////
// operator in Fourier space
////////////////
tmp =ref;
theFFT.FFT_all_dim(result,tmp,FFT::forward);
std::cout<<"FFT[ Dw x src ] "<< norm2(result)<<std::endl;
tmp = src;
theFFT.FFT_all_dim(src_p,tmp,FFT::forward);
std::cout<<"FFT[ src ] "<< norm2(src_p)<<std::endl;
/////////////////////////////////////////////////////////////////
// work out the predicted FT from Fourier
/////////////////////////////////////////////////////////////////
auto FGrid = &GRID;
LatticeFermionD Kinetic(FGrid); Kinetic = Zero();
LatticeComplexD kmu(FGrid);
LatticeInteger scoor(FGrid);
LatticeComplexD sk (FGrid); sk = Zero();
LatticeComplexD sk2(FGrid); sk2= Zero();
LatticeComplexD W(FGrid); W= Zero();
LatticeComplexD one(FGrid); one =ComplexD(1.0,0.0);
ComplexD ci(0.0,1.0);
for(int mu=0;mu<Nd;mu++) {
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(kmu,mu);
kmu = TwoPiL * kmu;
sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
sk = sk + sin(kmu) *sin(kmu);
// -1/2 Dw -> 1/2 gmu (eip - emip) = i sinp gmu
Kinetic = Kinetic + sin(kmu)*ci*(Gamma(Gmu[mu])*src_p);
}
W = mass + sk2;
Kinetic = Kinetic + W * src_p;
std::cout<<"Momentum space src "<< norm2(src_p)<<std::endl;
std::cout<<"Momentum space Dw x src "<< norm2(Kinetic)<<std::endl;
std::cout<<"FT[Coordinate space Dw] "<< norm2(result)<<std::endl;
result = result - Kinetic;
std::cout<<"diff "<< norm2(result)<<std::endl;
}
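// A sketch of the identity checked above (conventions as read off the loop):
// in momentum space the free Wilson operator is diagonal,
//   Dw(k) = i sum_mu gamma_mu sin(k_mu) + m + 2 sum_mu sin^2(k_mu/2),
// so FFT[ Dw x src ] should equal Dw(k) * FFT[ src ] site by site, and the
// "diff" printed above is the norm of the residual, vanishing to rounding.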
std::cout << " =======================================" <<std::endl;
std::cout << " Checking FourierFreePropagator x Dw = 1" <<std::endl;
std::cout << " =======================================" <<std::endl;
std::cout << "Dw src = " <<norm2(src)<<std::endl;
std::cout << "Dw tmp = " <<norm2(tmp)<<std::endl;
Dw.M(src,tmp);
Dw.FreePropagator(tmp,ref,mass);
std::cout << "Dw ref = " <<norm2(ref)<<std::endl;
@@ -122,7 +146,8 @@ int main (int argc, char ** argv)
ferm()(0)(0) = ComplexD(1.0);
pokeSite(ferm,src,point);
RealD mass=0.01;
RealD mass=0.1;
WilsonFermionD Dw(Umu,GRID,RBGRID,mass);
// Momentum space prop
@@ -155,6 +180,65 @@ int main (int argc, char ** argv)
DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
}
////////////////////////////////////////////////////
//Gauge invariance test
////////////////////////////////////////////////////
{
std::cout<<"****************************************"<<std::endl;
std::cout << "Gauge invariance test \n";
std::cout<<"****************************************"<<std::endl;
LatticeGaugeField U_GT(&GRID); // Gauge transformed field
LatticeColourMatrix g(&GRID); // local Gauge xform matrix
U_GT = Umu;
// Make a random gauge transform of the gauge field
SU<Nc>::RandomGaugeTransform(pRNG,U_GT,g); // gauge transform of the unit gauge
LatticeFermionD src(&GRID);
LatticeFermionD tmp(&GRID);
LatticeFermionD ref(&GRID);
LatticeFermionD diff(&GRID);
// could loop over colors
src=Zero();
Coordinate point(4,0); // 0,0,0,0
SpinColourVectorD ferm;
ferm=Zero();
ferm()(0)(0) = ComplexD(1.0);
pokeSite(ferm,src,point);
RealD mass=0.1;
WilsonFermionD Dw(U_GT,GRID,RBGRID,mass);
// Momentum space prop
std::cout << " Solving by FFT and Feynman rules" <<std::endl;
Dw.FreePropagator(src,ref,mass) ;
Gamma G5(Gamma::Algebra::Gamma5);
LatticeFermionD result(&GRID);
const int sdir=0;
////////////////////////////////////////////////////////////////////////
// Conjugate gradient on normal equations system
////////////////////////////////////////////////////////////////////////
std::cout << " Solving by Conjugate Gradient (CGNE)" <<std::endl;
Dw.Mdag(src,tmp);
src=tmp;
MdagMLinearOperator<WilsonFermionD,LatticeFermionD> HermOp(Dw);
ConjugateGradient<LatticeFermionD> CG(1.0e-10,10000);
CG(HermOp,src,result);
////////////////////////////////////////////////////////////////////////
std::cout << " Taking difference" <<std::endl;
std::cout << "Dw result "<<norm2(result)<<std::endl;
std::cout << "Dw ref "<<norm2(ref)<<std::endl;
diff = ref - result;
std::cout << "result - ref "<<norm2(diff)<<std::endl;
DumpSliceNorm("Slice Norm Solution ",result,Nd-1);
}
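// Sketch of the covariance exercised here (standard conventions assumed):
// under a local gauge transformation U_mu(x) -> g(x) U_mu(x) g^dag(x+mu)
// the Wilson operator transforms as Dw[U^g] = g Dw[U] g^dag, so the point
// source solution only picks up unitary colour rotations and the slice
// norms reported above should agree with the unit-gauge case.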
Grid_finalize();
}

View File

@@ -0,0 +1,110 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_memory_manager.cc
Copyright (C) 2022
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
void MemoryTest(GridCartesian * FGrid,int N);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
int N=100;
for(int i=0;i<N;i++){
std::cout << "============================"<<std::endl;
std::cout << "Epoch "<<i<<"/"<<N<<std::endl;
std::cout << "============================"<<std::endl;
MemoryTest(UGrid,256);
MemoryManager::Print();
AUDIT();
}
Grid_finalize();
}
void MemoryTest(GridCartesian * FGrid, int N)
{
LatticeComplexD zero(FGrid); zero=Zero();
std::vector<LatticeComplexD> A(N,zero);//FGrid);
std::vector<ComplexD> B(N,ComplexD(0.0)); // Update sequentially on host
for(int v=0;v<N;v++) A[v] = Zero();
uint64_t counter = 0;
for(int epoch = 0;epoch<10000;epoch++){
int v = random() %N; // Which vec
int w = random() %2; // Write or read
int e = random() %3; // expression or for loop
int dev= random() %2; // On device?
// int e=1;
ComplexD zc = counter++;
if ( w ) {
B[v] = B[v] + zc;
if ( e == 0 ) {
A[v] = A[v] + zc - A[v] + A[v];
} else {
if ( dev ) {
autoView(A_v,A[v],AcceleratorWrite);
accelerator_for(ss,FGrid->oSites(),1,{
A_v[ss] = A_v[ss] + zc;
});
} else {
autoView(A_v,A[v],CpuWrite);
thread_for(ss,FGrid->oSites(),{
A_v[ss] = A_v[ss] + zc;
});
}
}
} else {
if ( e == 0 ) {
A[v] = A[v] + A[v] - A[v];
} else {
if ( dev ) {
autoView(A_v,A[v],AcceleratorRead);
accelerator_for(ss,FGrid->oSites(),1,{
assert(B[v]==A_v[ss]()()().getlane(0));
});
// std::cout << "["<<v<<"] checked on GPU"<<B[v]<<std::endl;
} else {
autoView(A_v,A[v],CpuRead);
thread_for(ss,FGrid->oSites(),{
assert(B[v]==A_v[ss]()()().getlane(0));
});
// std::cout << "["<<v<<"] checked on CPU"<<B[v]<<std::endl;
}
}
}
}
}
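// Summary of the stress loop above: each lattice A[v] shadows the host-only
// scalar B[v]. Updates and checks are applied at random through expression
// templates, CPU views or accelerator views, so a failing assert indicates
// a stale copy left behind by the MemoryManager host/device cache.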

View File

@@ -0,0 +1,124 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/core/Test_prec_change.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int Ls = 12;
Coordinate latt4 = GridDefaultLatt();
GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridD = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridD);
GridCartesian * FGridD = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridD);
GridRedBlackCartesian * FrbGridD = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridD);
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGridF);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGridD); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG5F(FGridF); RNG5F.SeedFixedIntegers(seeds5);
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermionD field_d(FGridD), tmp_d(FGridD);
random(RNG5,field_d);
RealD norm2_field_d = norm2(field_d);
LatticeFermionD2 field_d2(FGridF), tmp_d2(FGridF);
random(RNG5F,field_d2);
RealD norm2_field_d2 = norm2(field_d2);
LatticeFermionF field_f(FGridF);
//Test original implementation
{
std::cout << GridLogMessage << "Testing original implementation" << std::endl;
field_f = Zero();
precisionChangeOrig(field_f,field_d);
RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
tmp_d = Zero();
precisionChangeOrig(tmp_d, field_f);
Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
}
//Test new implementation with pregenerated workspace
{
std::cout << GridLogMessage << "Testing new implementation with pregenerated workspace" << std::endl;
precisionChangeWorkspace wk_sp_to_dp(field_d.Grid(),field_f.Grid());
precisionChangeWorkspace wk_dp_to_sp(field_f.Grid(),field_d.Grid());
field_f = Zero();
precisionChange(field_f,field_d,wk_dp_to_sp);
RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
tmp_d = Zero();
precisionChange(tmp_d, field_f,wk_sp_to_dp);
Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
}
//Test new implementation without pregenerated workspace
{
std::cout << GridLogMessage << "Testing new implementation without pregenerated workspace" << std::endl;
field_f = Zero();
precisionChange(field_f,field_d);
RealD Ndiff = (norm2_field_d - norm2(field_f))/norm2_field_d;
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
tmp_d = Zero();
precisionChange(tmp_d, field_f);
Ndiff = norm2( LatticeFermionD(tmp_d-field_d) ) / norm2_field_d;
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
}
//Test fast implementation
{
std::cout << GridLogMessage << "Testing fast (double2) implementation" << std::endl;
field_f = Zero();
precisionChangeFast(field_f,field_d2);
RealD Ndiff = (norm2_field_d2 - norm2(field_f))/norm2_field_d2;
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of single and double prec fields differs by " << Ndiff << std::endl;
tmp_d2 = Zero();
precisionChangeFast(tmp_d2, field_f);
Ndiff = norm2( LatticeFermionD2(tmp_d2-field_d2) ) / norm2_field_d2;
std::cout << GridLogMessage << (fabs(Ndiff) > 1e-05 ? "!!FAIL" : "Pass") << ": relative norm2 of back-converted and original double prec fields differs by " << Ndiff << std::endl;
}
std::cout << "Done" << std::endl;
Grid_finalize();
}
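A minimal usage sketch distilled from the four cases above (names exactly as in the test; a workspace is constructed from the destination grid and then the source grid, and can be reused across conversions):
  precisionChangeWorkspace wk(field_f.Grid(), field_d.Grid()); // (to-grid, from-grid)
  precisionChange(field_f, field_d, wk);   // reuse the precomputed workspace
  precisionChange(field_f, field_d);       // workspace built on the fly
  precisionChangeFast(field_f, field_d2);  // double2 path, both fields on the same Grid as above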

tests/forces/Test_bdy.cc Normal file
View File

@@ -0,0 +1,305 @@
/*
2f Full det MdagM 10^6 force ~ 1.3e7
Grid : Message : 1767.283471 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 1767.283476 s : S1 : 1.52885e+09
Grid : Message : 1767.283480 s : S2 : 1.52886e+09
Grid : Message : 1767.283482 s : dS : 8877.34
Grid : Message : 1767.283483 s : dSpred : 8877.7
Grid : Message : 1767.283484 s : diff : -0.360484
Grid : Message : 1767.283485 s : *********************************************************
2f Full det MpcdagMpc 10^6 force ~ 1.8e6
Grid : Message : 2399.576962 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 2399.576968 s : S1 : 1.52885e+09
Grid : Message : 2399.576972 s : S2 : 1.52886e+09
Grid : Message : 2399.576974 s : dS : 9728.49
Grid : Message : 2399.576975 s : dSpred : 9726.58
Grid : Message : 2399.576976 s : diff : 1.90683
Grid : Message : 2399.576977 s : *********************************************************
2f bdy MdagM 1500 force ~ 2800
Grid : Message : 4622.385061 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 4622.385067 s : S1 : 1.52885e+09
Grid : Message : 4622.385071 s : S2 : 1.52885e+09
Grid : Message : 4622.385072 s : dS : 25.4944
Grid : Message : 4622.385073 s : dSpred : 25.4672
Grid : Message : 4622.385074 s : diff : 0.0271414
Grid : Message : 4622.385075 s : *********************************************************
2f bdy MpcdagMpc 10^6 force ~ 2200
Grid : Message : 4622.385061 s : +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 4622.385067 s : S1 : 1.52885e+09
Grid : Message : 4622.385071 s : S2 : 1.52885e+09
Grid : Message : 4622.385072 s : dS : 25.4944
Grid : Message : 4622.385073 s : dSpred : 25.4672
Grid : Message : 4622.385074 s : diff : 0.0271414
Grid : Message : 4622.385075 s : *********************************************************
1f Bdy Det
Optimisation log: looser rational AND MD tolerances sloppy
MobiusForce.221179 -- same as HMC. dS is mispredicted, Force ~2.8
Grid : Message : 6582.258991 s : dS : 0.024478
Grid : Message : 6582.258992 s : dSpred : 0.00791876
Grid : Message : 6582.258994 s : diff : 0.0165592
MobiusForce.221193 -- tight rational AND MD tolerances to 1e-8 ~ 2.8 same
Grid : Message : 1964.939209 s : S1 : 7.64404e+08
Grid : Message : 1964.939213 s : S2 : 7.64404e+08
Grid : Message : 1964.939215 s : dS : -0.00775838 <--- too loose even on action
Grid : Message : 1964.939216 s : dSpred : -0.00416793
Grid : Message : 1964.939217 s : diff : -0.00359045
MobiusForce.221394 -- looser rational, MD tol 1e-8 ~ 2.8 same
Grid : Message : 1198.346720 s : S1 : 764404649.48886
Grid : Message : 1198.346760 s : S2 : 764404649.5133
Grid : Message : 1198.346780 s : dS : 0.024440884590149
Grid : Message : 1198.346800 s : dSpred : 0.0079145154465184
Grid : Message : 1198.346810 s : diff : 0.016526369143631
MobiusForce.221394 -- tight rational, MD tol sloppy Force ~ 2.8
Grid : Message : 2376.921950 s : S1 : 764404436.44069
Grid : Message : 2376.921954 s : S2 : 764404436.43299
Grid : Message : 2376.921956 s : dS : -0.0076971054077148
Grid : Message : 2376.921958 s : dSpred : -0.0041610472282526
Grid : Message : 2376.921959 s : diff : -0.0035360581794623
*/
//
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_double_ratio.cc
Copyright (C) 2022
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
typedef MobiusFermionD FermionAction;
typedef WilsonImplD FimplD;
typedef WilsonImplD FermionImplPolicy;
template<class Gimpl>
void ForceTest(Action<LatticeGaugeField> &action,LatticeGaugeField & U,MomentumFilterBase<LatticeGaugeField> &Filter)
{
GridBase *UGrid = U.Grid();
std::vector<int> seeds({1,2,3,5});
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds);
LatticeColourMatrix Pmu(UGrid);
LatticeGaugeField P(UGrid);
LatticeGaugeField UdSdU(UGrid);
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
std::cout << GridLogMessage << " Force test for "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
RealD eps=0.005;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Refresh "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
Gimpl::generate_momenta(P,sRNG,RNG4);
Filter.applyFilter(P);
#if 0
FieldMetaData header;
std::string file("./ckpoint_lat.2000");
NerscIO::readConfiguration(U,header,file);
#else
U = 1.0;
#endif
action.refresh(U,sRNG,RNG4);
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
RealD S1 = action.S(U);
Gimpl::update_field(P,U,eps);
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Derivative "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
action.deriv(U,UdSdU);
UdSdU = Ta(UdSdU);
Filter.applyFilter(UdSdU);
DumpSliceNorm("Force",UdSdU,Nd-1);
Gimpl::update_field(P,U,eps);
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout << GridLogMessage << " Action "<<action.action_name()<<std::endl;
std::cout << GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
RealD S2 = action.S(U);
// Use the derivative
LatticeComplex dS(UGrid); dS = Zero();
for(int mu=0;mu<Nd;mu++){
auto UdSdUmu = PeekIndex<LorentzIndex>(UdSdU,mu);
Pmu= PeekIndex<LorentzIndex>(P,mu);
dS = dS - trace(Pmu*UdSdUmu)*eps*2.0*2.0;
}
ComplexD dSpred = sum(dS);
RealD diff = S2-S1-dSpred.real();
std::cout<< GridLogMessage << "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++"<<std::endl;
std::cout<< GridLogMessage << "S1 : "<< S1 <<std::endl;
std::cout<< GridLogMessage << "S2 : "<< S2 <<std::endl;
std::cout<< GridLogMessage << "dS : "<< S2-S1 <<std::endl;
std::cout<< GridLogMessage << "dSpred : "<< dSpred.real() <<std::endl;
std::cout<< GridLogMessage << "diff : "<< diff<<std::endl;
std::cout<< GridLogMessage << "*********************************************************"<<std::endl;
// assert(diff<1.0);
std::cout<< GridLogMessage << "Done" <<std::endl;
std::cout << GridLogMessage << "*********************************************************"<<std::endl;
}
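// The above is a first-order Taylor test of the force (factor bookkeeping
// as read off the code): update_field(P,U,eps) is applied twice, a total
// step of 2*eps, and the prediction
//   dS_pred = -2 * (2*eps) * sum_{x,mu} Re tr[ P_mu(x) (U dS/dU)_mu(x) ]
// is compared with the measured S2 - S1; their difference should shrink
// as O(eps^2) when deriv() returns the correct force.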
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::cout << std::setprecision(14);
Coordinate latt_size = GridDefaultLatt();
Coordinate mpi_layout = GridDefaultMpi();
Coordinate simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi_layout,shm);
const int Ls=12;
const int Nt = latt_size[3];
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
////////////////////////////////////////////////////////////////
// Domain decomposed operator
////////////////////////////////////////////////////////////////
Coordinate CommDim(Nd);
for(int d=0;d<Nd;d++) CommDim[d]= (mpi_layout[d]/shm[d])>1 ? 1 : 0;
Coordinate NonDirichlet(Nd+1,0);
Coordinate Dirichlet(Nd+1,0);
Dirichlet[1] = CommDim[0]*latt_size[0]/mpi_layout[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt_size[1]/mpi_layout[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt_size[2]/mpi_layout[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt_size[3]/mpi_layout[3] * shm[3];
Coordinate Block4(Nd);
Block4[0] = Dirichlet[1];
Block4[1] = Dirichlet[2];
Block4[2] = Dirichlet[3];
Block4[3] = Dirichlet[4];
std::vector<Complex> boundary = {1,1,1,-1};
FermionAction::ImplParams Params(boundary);
FermionAction::ImplParams ParamsDir(boundary);
Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=1;
///////////////////// Gauge Field and Gauge Forces ////////////////////////////
LatticeGaugeField U(UGrid);
RealD beta=6.0;
WilsonGaugeActionR PlaqAction(beta);
IwasakiGaugeActionR RectAction(beta);
MomentumFilterNone<LatticeGaugeField> FilterNone;
ForceTest<GimplTypesR>(PlaqAction,U,FilterNone);
ForceTest<GimplTypesR>(RectAction,U,FilterNone);
////////////////////////////////////
// Action
////////////////////////////////////
RealD mass=0.00078;
RealD pvmass=1.0;
RealD M5=1.8;
RealD b=1.5;
RealD c=0.5;
// Double versions
FermionAction DdwfPeriodic(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,Params);
FermionAction PVPeriodic (U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,pvmass,M5,b,c,Params);
FermionAction DdwfDirichlet(U,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c,ParamsDir);
double StoppingCondition = 1.0e-8;
double MaxCGIterations = 50000;
ConjugateGradient<LatticeFermion> CG(StoppingCondition,MaxCGIterations);
//////////////////// Two Flavour Determinant Ratio ///////////////////////////////
TwoFlavourRatioPseudoFermionAction<FimplD> Nf2(PVPeriodic, DdwfPeriodic,CG,CG);
// ForceTest<GimplTypesR>(Nf2,U,FilterNone);
//////////////////// Two Flavour Determinant force test Even Odd ///////////////////////////////
TwoFlavourEvenOddRatioPseudoFermionAction<FimplD> Nf2eo(PVPeriodic, DdwfPeriodic,CG,CG);
// ForceTest<GimplTypesR>(Nf2eo,U,FilterNone);
//////////////////// Domain forces ////////////////////
int Width=4;
DDHMCFilter<WilsonImplD::Field> DDHMCFilter(Block4,Width);
//////////////////// Two flavour boundary det ////////////////////
TwoFlavourRatioPseudoFermionAction<FimplD> BdyNf2(DdwfDirichlet, DdwfPeriodic,CG,CG);
// ForceTest<GimplTypesR>(BdyNf2,U,DDHMCFilter);
//////////////////// Two flavour eo boundary det ////////////////////
TwoFlavourEvenOddRatioPseudoFermionAction<FimplD> BdyNf2eo(DdwfDirichlet, DdwfPeriodic,CG,CG);
// ForceTest<GimplTypesR>(BdyNf2eo,U,DDHMCFilter);
//////////////////// One flavour boundary det ////////////////////
OneFlavourRationalParams OFRp; // Up/down
OFRp.lo = 4.0e-5;
OFRp.hi = 90.0;
OFRp.MaxIter = 60000;
OFRp.tolerance= 1.0e-8;
OFRp.mdtolerance= 1.0e-6;
OFRp.degree = 18;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
std::vector<RealD> ActionTolByPole({
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8
});
std::vector<RealD> MDTolByPole({
1.0e-6,3.0e-7,1.0e-7,1.0e-7, // Orig sloppy
// 1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8
});
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> BdySqrt(DdwfDirichlet,DdwfPeriodic,OFRp);
ForceTest<GimplTypesR>(BdySqrt,U,DDHMCFilter);
Grid_finalize();
}
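A note on the Dirichlet coordinates set up above, reading the arithmetic directly: each nonzero entry is the local extent of a node's shared-memory domain, latt_size[d]/mpi_layout[d] * shm[d], and is switched on only in directions where communication actually crosses a node boundary (CommDim), so the boundary determinants act on inter-node surfaces only.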

View File

@@ -476,6 +476,20 @@ int main (int argc, char ** argv)
// ForceTest<GimplTypesR>(BdyNf2eo,U,DDHMCFilter);
//////////////////// One flavour boundary det ////////////////////
RationalActionParams OFRp; // Up/down
OFRp.lo = 6.0e-5;
OFRp.hi = 90.0;
OFRp.inv_pow = 2;
OFRp.MaxIter = SP_iters; // get most shifts by 2000, stop sharing space
OFRp.action_tolerance= 1.0e-8;
OFRp.action_degree = 18;
OFRp.md_tolerance= 1.0e-5;
OFRp.md_degree = 14;
// OFRp.degree = 20; converges
// OFRp.degree = 16;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
/*
OneFlavourRationalParams OFRp; // Up/down
OFRp.lo = 4.0e-5;
OFRp.hi = 90.0;
@@ -485,6 +499,23 @@ int main (int argc, char ** argv)
OFRp.degree = 18;
OFRp.precision= 80;
OFRp.BoundsCheckFreq=0;
*/
std::vector<RealD> ActionTolByPole({
1.0e-7,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8
});
std::vector<RealD> MDTolByPole({
1.6e-5,5.0e-6,1.0e-6,3.0e-7, // soften convergence even more
// 1.0e-6,3.0e-7,1.0e-7,1.0e-7,
// 3.0e-6,1.0e-6,1.0e-7,1.0e-7, // soften convergence
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8
});
/*
std::vector<RealD> ActionTolByPole({
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
@@ -499,9 +530,9 @@ int main (int argc, char ** argv)
// 1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8,1.0e-8,1.0e-8,
1.0e-8,1.0e-8
});
*/
OneFlavourEvenOddRatioRationalPseudoFermionAction<FermionImplPolicy> BdySqrt(DdwfDirichlet,DdwfPeriodic,OFRp);
BdySqrt.SetTolerances(ActionTolByPole,MDTolByPole);
ForceTest<GimplTypesR>(BdySqrt,U,DDHMCFilter);

View File

@@ -0,0 +1,73 @@
#Example script
DIR=/gpfs/alpine/phy157/proj-shared/phy157dwf/chulwoo/Grid/BL/build/tests/lanczos
BIN=${DIR}/Test_dwf_block_lanczos
VOL='--grid 16.16.16.32 '
GRID='--mpi 1.1.1.4 '
CONF='--gconf ckpoint_lat.IEEE64BIG.2000 '
OPT='--mass 0.01 --M5 1.8 --phase in.params --omega in.params --shm 4096'
#BL='--rbl 16.1024.128.1000.10 --split 1.1.4.4 --check_int 100 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51'
BL='--rbl 4.128.16.100.10 --split 1.1.1.4 --check_int 25 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51'
ARGS=${CONF}" "${OPT}" "${BL}" "${VOL}" "${GRID}
export APP="${BIN} ${ARGS}"
echo APP=${APP}
#export JS="jsrun --nrs 32 -a4 -g4 -c42 -dpacked -b packed:7 --smpiargs="-gpu" "
export JS="jsrun --nrs 1 -a4 -g4 -c42 -dpacked -b packed:10 --smpiargs="-gpu" "
$JS $APP
#sample in.param
boundary_phase 0 1 0
boundary_phase 1 1 0
boundary_phase 2 1 0
boundary_phase 3 -1 0
omega 0 0.5 0
omega 1 0.5 0
omega 2 0.5 0
omega 3 0.5 0
omega 4 0.5 0
omega 5 0.5 0
omega 6 0.5 0
omega 7 0.5 0
omega 8 0.5 0
omega 9 0.5 0
omega 10 0.5 0
omega 11 0.5 0
#output
Grid : Message : 1.717474 s : Gauge Configuration ckpoint_lat.IEEE64BIG.2000
Grid : Message : 1.717478 s : boundary_phase[0] = (1,0)
Grid : Message : 1.717497 s : boundary_phase[1] = (1,0)
Grid : Message : 1.717500 s : boundary_phase[2] = (1,0)
Grid : Message : 1.717503 s : boundary_phase[3] = (-1,0)
Grid : Message : 1.717506 s : Ls 12
Grid : Message : 1.717507 s : mass 0.01
Grid : Message : 1.717510 s : M5 1.8
Grid : Message : 1.717512 s : mob_b 1.5
Grid : Message : 1.717514 s : omega[0] = (0.5,0)
Grid : Message : 1.717517 s : omega[1] = (0.5,0)
Grid : Message : 1.717520 s : omega[2] = (0.5,0)
Grid : Message : 1.717523 s : omega[3] = (0.5,0)
Grid : Message : 1.717526 s : omega[4] = (0.5,0)
Grid : Message : 1.717529 s : omega[5] = (0.5,0)
Grid : Message : 1.717532 s : omega[6] = (0.5,0)
Grid : Message : 1.717535 s : omega[7] = (0.5,0)
Grid : Message : 1.717538 s : omega[8] = (0.5,0)
Grid : Message : 1.717541 s : omega[9] = (0.5,0)
Grid : Message : 1.717544 s : omega[10] = (0.5,0)
Grid : Message : 1.717547 s : omega[11] = (0.5,0)
Grid : Message : 1.717550 s : Nu 4
Grid : Message : 1.717551 s : Nk 128
Grid : Message : 1.717552 s : Np 16
Grid : Message : 1.717553 s : Nm 288
Grid : Message : 1.717554 s : Nstop 100
Grid : Message : 1.717555 s : Ntest 25
Grid : Message : 1.717557 s : MaxIter 10
Grid : Message : 1.717558 s : resid 1e-05
Grid : Message : 1.717560 s : Cheby Poly 0.007,7,51
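A note on the --rbl payload above (format read off the parser in the test source below): the five dot-separated fields are Nu.Nk.Np.Nstop.MaxIter, and in rbl mode the Krylov space grows each restart, Nm = Nk + Np*MaxIter. For --rbl 4.128.16.100.10 this gives Nm = 128 + 16*10 = 288, matching the logged values Nu 4, Nk 128, Np 16, Nstop 100, MaxIter 10, Nm 288.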

View File

@@ -0,0 +1,410 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_block_lanczos.cc
Copyright (C) 2022
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Yong-Chull Jang <ypj@quark.phy.bnl.gov>
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/util/Init.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
using namespace std;
using namespace Grid;
//using namespace Grid::QCD;
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
typedef typename ZMobiusFermionF::FermionField FermionField;
RealD AllZero(RealD x){ return 0.;}
class CmdJobParams
{
public:
std::string gaugefile;
int Ls;
double mass;
double M5;
double mob_b;
std::vector<ComplexD> omega;
std::vector<Complex> boundary_phase;
std::vector<int> mpi_split;
LanczosType Impl;
int Nu;
int Nk;
int Np;
int Nm;
int Nstop;
int Ntest;
int MaxIter;
double resid;
double low;
double high;
int order;
CmdJobParams()
: gaugefile("Hot"),
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
Impl(LanczosType::irbl),mpi_split(4,1),
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
low(0.2), high(5.5), order(11)
{Nm=Nk+Np;};
void Parse(char **argv, int argc);
};
void CmdJobParams::Parse(char **argv,int argc)
{
std::string arg;
std::vector<int> vi;
double re,im;
int expect, idx;
std::string vstr;
std::ifstream pfile;
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
}
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
pfile.open(arg);
assert(pfile);
expect = 0;
while( pfile >> vstr ) {
if ( vstr.compare("boundary_phase") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(expect==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
boundary_phase.push_back({re,im});
expect++;
}
}
pfile.close();
} else {
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
}
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
pfile.open(arg);
assert(pfile);
Ls = 0;
while( pfile >> vstr ) {
if ( vstr.compare("omega") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(Ls==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
omega.push_back({re,im});
Ls++;
}
}
pfile.close();
} else {
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
GridCmdOptionInt(arg,Ls);
}
}
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
GridCmdOptionFloat(arg,mass);
}
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
GridCmdOptionFloat(arg,M5);
}
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
GridCmdOptionFloat(arg,mob_b);
}
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2];
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::irbl;
Nm = Nk+Np;
}
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2]; // vector space is enlarged by adding Np vectors
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::rbl;
Nm = Nk+Np*MaxIter;
}
#if 1
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
GridCmdOptionIntVector(arg,vi);
for(int i=0;i<mpi_split.size();i++)
mpi_split[i] = vi[i];
}
#endif
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
GridCmdOptionInt(arg,Ntest);
}
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
GridCmdOptionFloat(arg,resid);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
GridCmdOptionFloat(arg,low);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
GridCmdOptionFloat(arg,high);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
GridCmdOptionInt(arg,order);
}
if ( CartesianCommunicator::RankWorld() == 0 ) {
std::streamsize ss = std::cout.precision();
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
std::cout.precision(15);
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
std::cout << GridLogMessage <<" mass "<< mass << '\n';
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
std::cout.precision(15);
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
std::cout << GridLogMessage <<" Np "<< Np << '\n';
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
std::cout << GridLogMessage <<" resid "<< resid << '\n';
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
}
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
CmdJobParams JP;
JP.Parse(argv,argc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGridF);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
GridParallelRNG RNG5rb(FrbGridF); RNG5rb.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
std::vector<LatticeColourMatrix> U(4,UGrid);
LatticeGaugeFieldF UmuF(UGridF);
std::vector<LatticeColourMatrix> UF(4,UGridF);
if ( JP.gaugefile.compare("Hot") == 0 ) {
SU3::HotConfiguration(RNG4, Umu);
} else {
FieldMetaData header;
NerscIO::readConfiguration(Umu,header,JP.gaugefile);
// ypj [fixme] additional checks for the loaded configuration?
}
precisionChange (UmuF,Umu);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
RealD mass = JP.mass;
RealD M5 = JP.M5;
// ypj [fixme] flexible support for a various Fermions
// RealD mob_b = JP.mob_b; // Gparity
// std::vector<ComplexD> omega; // ZMobius
// GparityMobiusFermionD ::ImplParams params;
// std::vector<int> twists({1,1,1,0});
// params.twists = twists;
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
// int mrhs = JP.Nu;
int Ndir=4;
auto mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split (Ndir,1);
#if 0
int tmp=mrhs, dir=0;
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
while ( tmp> 1) {
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
mpi_split[dir] *=2;
tmp = tmp/2;
}
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
dir = (dir+1)%Ndir;
}
#endif
int mrhs=1;
for(int i =0;i<Ndir;i++){
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
mrhs *= JP.mpi_split[i];
}
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
// assert(JP.Nu==tmp);
/////////////////////////////////////////////
// Split into 1^4 mpi communicators, keeping it explicitly single
/////////////////////////////////////////////
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplexF::Nsimd()),
mpi_split,
*UGrid);
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
LatticeGaugeFieldF s_Umu(SGrid);
Grid_split (UmuF,s_Umu);
//WilsonFermionR::ImplParams params;
ZMobiusFermionF::ImplParams params;
params.overlapCommsCompute = true;
params.boundary_phases = JP.boundary_phase;
ZMobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> HermOp(Ddwf);
SchurDiagOneOperator<ZMobiusFermionF,FermionField> HermOp(DdwfF);
ZMobiusFermionF Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
SchurDiagOneOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
//std::vector<double> Coeffs { 0.,-1.};
// ypj [note] this may not be supported by some compilers
std::vector<double> Coeffs({ 0.,-1.});
Polynomial<FermionField> PolyX(Coeffs);
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
// Cheb.csv(std::cout);
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
FrbGridF,SFrbGrid,mrhs,
Cheb,
JP.Nstop, JP.Ntest,
JP.Nu, JP.Nk, JP.Nm,
JP.resid,
JP.MaxIter,
IRBLdiagonaliseWithEigen);
// IRBLdiagonaliseWithLAPACK);
IRBL.split_test=1;
std::vector<RealD> eval(JP.Nm);
std::vector<FermionField> src(JP.Nu,FrbGridF);
if (0)
{
// in case RNG is too slow
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
FermionField src_tmp(FGrid);
for ( int i=0; i<JP.Nu; ++i ){
// gaussian(RNG5,src_tmp);
ComplexD rnd;
RealD re;
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
src_tmp=re;
pickCheckerboard(Odd,src[i],src_tmp);
}
RNG5.Report();
} else {
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
for ( int i=0; i<JP.Nu; ++i )
gaussian(RNG5rb,src[i]);
RNG5rb.Report();
}
std::vector<FermionField> evec(JP.Nm,FrbGridF);
for(int i=0;i<1;++i){
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
};
int Nconv;
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
Grid_finalize();
}
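A worked example of the split-grid arithmetic above: with the example script's --mpi 1.1.1.4 and --split 1.1.1.4, each direction gives mpi_split[i] = mpi_layout[i]/JP.mpi_split[i] = 1, i.e. the lattice is split into 1^4 sub-communicators, and mrhs = 1*1*1*4 = 4 right-hand sides are handled concurrently, one per split communicator.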

View File

@@ -0,0 +1,401 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_block_lanczos.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/util/Init.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
using namespace std;
using namespace Grid;
//using namespace Grid::QCD;
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
typedef typename ZMobiusFermionR::FermionField FermionField;
RealD AllZero(RealD x){ return 0.;}
class CmdJobParams
{
public:
std::string gaugefile;
int Ls;
double mass;
double M5;
double mob_b;
std::vector<ComplexD> omega;
std::vector<Complex> boundary_phase;
std::vector<int> mpi_split;
LanczosType Impl;
int Nu;
int Nk;
int Np;
int Nm;
int Nstop;
int Ntest;
int MaxIter;
double resid;
double low;
double high;
int order;
CmdJobParams()
: gaugefile("Hot"),
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
Impl(LanczosType::irbl),mpi_split(4,1),
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
low(0.2), high(5.5), order(11)
{Nm=Nk+Np;};
void Parse(char **argv, int argc);
};
void CmdJobParams::Parse(char **argv,int argc)
{
std::string arg;
std::vector<int> vi;
double re,im;
int expect, idx;
std::string vstr;
std::ifstream pfile;
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
}
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
pfile.open(arg);
assert(pfile);
expect = 0;
while( pfile >> vstr ) {
if ( vstr.compare("boundary_phase") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(expect==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
boundary_phase.push_back({re,im});
expect++;
}
}
pfile.close();
} else {
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
}
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
pfile.open(arg);
assert(pfile);
Ls = 0;
while( pfile >> vstr ) {
if ( vstr.compare("omega") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(Ls==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
omega.push_back({re,im});
Ls++;
}
}
pfile.close();
} else {
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
GridCmdOptionInt(arg,Ls);
}
}
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
GridCmdOptionFloat(arg,mass);
}
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
GridCmdOptionFloat(arg,M5);
}
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
GridCmdOptionFloat(arg,mob_b);
}
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2];
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::irbl;
Nm = Nk+Np;
}
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2]; // vector space is enlarged by adding Np vectors
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] mode overriding message is needed.
Impl = LanczosType::rbl;
Nm = Nk+Np*MaxIter;
}
#if 1
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
GridCmdOptionIntVector(arg,vi);
for(int i=0;i<mpi_split.size();i++)
mpi_split[i] = vi[i];
}
#endif
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
GridCmdOptionInt(arg,Ntest);
}
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
GridCmdOptionFloat(arg,resid);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
GridCmdOptionFloat(arg,low);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
GridCmdOptionFloat(arg,high);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
GridCmdOptionInt(arg,order);
}
if ( CartesianCommunicator::RankWorld() == 0 ) {
std::streamsize ss = std::cout.precision();
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
std::cout.precision(15);
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
std::cout << GridLogMessage <<" mass "<< mass << '\n';
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
std::cout.precision(15);
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
std::cout << GridLogMessage <<" Np "<< Np << '\n';
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
std::cout << GridLogMessage <<" resid "<< resid << '\n';
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
}
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
CmdJobParams JP;
JP.Parse(argv,argc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
GridParallelRNG RNG5rb(FrbGrid); RNG5rb.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
std::vector<LatticeColourMatrix> U(4,UGrid);
if ( JP.gaugefile.compare("Hot") == 0 ) {
SU3::HotConfiguration(RNG4, Umu);
} else {
FieldMetaData header;
NerscIO::readConfiguration(Umu,header,JP.gaugefile);
// ypj [fixme] additional checks for the loaded configuration?
}
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
RealD mass = JP.mass;
RealD M5 = JP.M5;
// ypj [fixme] flexible support for a various Fermions
// RealD mob_b = JP.mob_b; // Gparity
// std::vector<ComplexD> omega; // ZMobius
// GparityMobiusFermionD ::ImplParams params;
// std::vector<int> twists({1,1,1,0});
// params.twists = twists;
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
// int mrhs = JP.Nu;
int Ndir=4;
auto mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split (Ndir,1);
#if 0
int tmp=mrhs, dir=0;
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
while ( tmp> 1) {
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
mpi_split[dir] *=2;
tmp = tmp/2;
}
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
dir = (dir+1)%Ndir;
}
#endif
int mrhs=1;
for(int i =0;i<Ndir;i++){
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
mrhs *= JP.mpi_split[i];
}
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
// assert(JP.Nu==tmp);
/////////////////////////////////////////////
// Split into 1^4 mpi communicators
/////////////////////////////////////////////
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
mpi_split,
*UGrid);
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
LatticeGaugeField s_Umu(SGrid);
Grid_split (Umu,s_Umu);
//WilsonFermionR::ImplParams params;
ZMobiusFermionR::ImplParams params;
params.overlapCommsCompute = true;
params.boundary_phases = JP.boundary_phase;
ZMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionR,FermionField> HermOp(Ddwf);
SchurDiagOneOperator<ZMobiusFermionR,FermionField> HermOp(Ddwf);
ZMobiusFermionR Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionR,FermionField> SHermOp(Dsplit);
SchurDiagOneOperator<ZMobiusFermionR,FermionField> SHermOp(Dsplit);
//std::vector<double> Coeffs { 0.,-1.};
// ypj [note] this may not be supported by some compilers
std::vector<double> Coeffs({ 0.,-1.});
Polynomial<FermionField> PolyX(Coeffs);
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
// Cheb.csv(std::cout);
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
FrbGrid,SFrbGrid,mrhs,
Cheb,
JP.Nstop, JP.Ntest,
JP.Nu, JP.Nk, JP.Nm,
JP.resid,
JP.MaxIter,
IRBLdiagonaliseWithEigen);
// IRBLdiagonaliseWithLAPACK);
IRBL.split_test=0;
std::vector<RealD> eval(JP.Nm);
std::vector<FermionField> src(JP.Nu,FrbGrid);
if (0)
{
// in case RNG is too slow
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
FermionField src_tmp(FGrid);
for ( int i=0; i<JP.Nu; ++i ){
// gaussian(RNG5,src_tmp);
ComplexD rnd;
RealD re;
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
src_tmp=re;
pickCheckerboard(Odd,src[i],src_tmp);
}
RNG5.Report();
} else {
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
for ( int i=0; i<JP.Nu; ++i )
gaussian(RNG5rb,src[i]);
RNG5rb.Report();
}
std::vector<FermionField> evec(JP.Nm,FrbGrid);
for(int i=0;i<1;++i){
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
};
int Nconv;
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
Grid_finalize();
}

View File

@@ -0,0 +1,408 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_block_lanczos.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/util/Init.h>
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
using namespace std;
using namespace Grid;
//using namespace Grid::QCD;
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
typedef typename ZMobiusFermionF::FermionField FermionField;
RealD AllZero(RealD x){ return 0.;}
class CmdJobParams
{
public:
std::string gaugefile;
int Ls;
double mass;
double M5;
double mob_b;
std::vector<ComplexD> omega;
std::vector<Complex> boundary_phase;
std::vector<int> mpi_split;
LanczosType Impl;
int Nu;
int Nk;
int Np;
int Nm;
int Nstop;
int Ntest;
int MaxIter;
double resid;
double low;
double high;
int order;
CmdJobParams()
: gaugefile("Hot"),
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
Impl(LanczosType::irbl),mpi_split(4,1),
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
low(0.2), high(5.5), order(11)
{Nm=Nk+Np;};
void Parse(char **argv, int argc);
};
void CmdJobParams::Parse(char **argv,int argc)
{
std::string arg;
std::vector<int> vi;
double re,im;
int expect, idx;
std::string vstr;
std::ifstream pfile;
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
}
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
pfile.open(arg);
assert(pfile);
expect = 0;
while( pfile >> vstr ) {
if ( vstr.compare("boundary_phase") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(expect==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
boundary_phase.push_back({re,im});
expect++;
}
}
pfile.close();
} else {
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
}
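// The --phase file is parsed as consecutive records, one per direction:
//   boundary_phase <idx> <re> <im>
// e.g. (hypothetical input) for antiperiodic boundary conditions in time:
//   boundary_phase 0 1.0 0.0
//   boundary_phase 1 1.0 0.0
//   boundary_phase 2 1.0 0.0
//   boundary_phase 3 -1.0 0.0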
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
pfile.open(arg);
assert(pfile);
Ls = 0;
while( pfile >> vstr ) {
if ( vstr.compare("omega") == 0 ) {
pfile >> vstr;
GridCmdOptionInt(vstr,idx);
assert(Ls==idx);
pfile >> vstr;
GridCmdOptionFloat(vstr,re);
pfile >> vstr;
GridCmdOptionFloat(vstr,im);
omega.push_back({re,im});
Ls++;
}
}
pfile.close();
} else {
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
GridCmdOptionInt(arg,Ls);
}
}
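// The --omega file is parsed the same way, one record per s-slice:
//   omega <idx> <re> <im>
// Ls is inferred from the number of records, so an explicit --Ls is only
// consulted when no --omega file is given.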
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
GridCmdOptionFloat(arg,mass);
}
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
GridCmdOptionFloat(arg,M5);
}
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
GridCmdOptionFloat(arg,mob_b);
}
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2];
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] should print a message when this overrides a previously chosen mode.
Impl = LanczosType::irbl;
Nm = Nk+Np;
}
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
GridCmdOptionIntVector(arg,vi);
Nu = vi[0];
Nk = vi[1];
Np = vi[2]; // vector space is enlarged by adding Np vectors
Nstop = vi[3];
MaxIter = vi[4];
// ypj[fixme] should print a message when this overrides a previously chosen mode.
Impl = LanczosType::rbl;
Nm = Nk+Np*MaxIter;
}
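// Worked example (hypothetical arguments): --rbl 4 100 40 120 5 gives Nu=4,
// Nk=100, Np=40, Nstop=120, MaxIter=5 and Nm = 100 + 40*5 = 300, since rbl
// enlarges the basis by Np vectors per restart, whereas irbl keeps Nm = Nk+Np fixed.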
#if 1
// block Lanczos with explicit extension of its dimensions
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
GridCmdOptionIntVector(arg,vi);
for(int i=0;i<mpi_split.size();i++)
mpi_split[i] = vi[i];
}
#endif
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
GridCmdOptionInt(arg,Ntest);
}
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
GridCmdOptionFloat(arg,resid);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
GridCmdOptionFloat(arg,low);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
GridCmdOptionFloat(arg,high);
}
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
GridCmdOptionInt(arg,order);
}
if ( CartesianCommunicator::RankWorld() == 0 ) {
std::streamsize ss = std::cout.precision();
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
std::cout.precision(15);
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
std::cout << GridLogMessage <<" mass "<< mass << '\n';
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
std::cout.precision(15);
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
std::cout.precision(ss);
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
std::cout << GridLogMessage <<" Np "<< Np << '\n';
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
std::cout << GridLogMessage <<" resid "<< resid << '\n';
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
}
}
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
CmdJobParams JP;
JP.Parse(argv,argc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
GridCartesian * UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian * FGridF = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGridF);
GridRedBlackCartesian * FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGridF);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
GridParallelRNG RNG5rb(FrbGridF); RNG5rb.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
std::vector<LatticeColourMatrix> U(4,UGrid);
LatticeGaugeFieldF UmuF(UGridF);
std::vector<LatticeColourMatrixF> UF(4,UGridF); // single-precision type to match UGridF
if ( JP.gaugefile.compare("Hot") == 0 ) {
SU3::HotConfiguration(RNG4, Umu);
} else {
FieldMetaData header;
NerscIO::readConfiguration(Umu,header,JP.gaugefile);
// ypj [fixme] additional checks for the loaded configuration?
}
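// Demote the double-precision gauge field for the single-precision operators.
// A sketch of the workspace variant (assuming the precisionChangeWorkspace API
// available on this branch; argument order taken as (out_grid,in_grid)):
//   precisionChangeWorkspace wk(UGridF,UGrid);
//   precisionChange(UmuF,Umu,wk); // keeps the copy device-resident across calls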
precisionChange (UmuF,Umu);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
RealD mass = JP.mass;
RealD M5 = JP.M5;
// ypj [fixme] flexible support for various fermion actions
// RealD mob_b = JP.mob_b; // Gparity
// std::vector<ComplexD> omega; // ZMobius
// GparityMobiusFermionD ::ImplParams params;
// std::vector<int> twists({1,1,1,0});
// params.twists = twists;
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
// int mrhs = JP.Nu;
int Ndir=4;
auto mpi_layout = GridDefaultMpi();
std::vector<int> mpi_split (Ndir,1);
#if 0
int tmp=mrhs, dir=0;
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
while ( tmp> 1) {
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
mpi_split[dir] *=2;
tmp = tmp/2;
}
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
dir = (dir+1)%Ndir;
}
#endif
int mrhs=1;
for(int i =0;i<Ndir;i++){
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
mrhs *= JP.mpi_split[i];
}
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
// assert(JP.Nu==tmp);
/////////////////////////////////////////////
// Split into 1^4 mpi communicators, keeping the split grids explicitly single precision
/////////////////////////////////////////////
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplexF::Nsimd()),
mpi_split,
*UGrid);
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
LatticeGaugeFieldF s_Umu(SGrid);
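// Grid_split scatters the gauge field so that each of the mrhs split
// communicators holds a full copy of the lattice on its own rank subset.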
Grid_split (UmuF,s_Umu);
//WilsonFermionR::ImplParams params;
ZMobiusFermionF::ImplParams params;
params.overlapCommsCompute = true;
params.boundary_phases = JP.boundary_phase;
ZMobiusFermionF DdwfF(UmuF,*FGridF,*FrbGridF,*UGridF,*UrbGridF,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> HermOp(Ddwf);
SchurDiagOneOperator<ZMobiusFermionF,FermionField> HermOp(DdwfF);
ZMobiusFermionF Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
// SchurDiagTwoOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
SchurDiagOneOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
//std::vector<double> Coeffs { 0.,-1.};
// ypj [note] this may not be supported by some compilers
std::vector<double> Coeffs({ 0.,-1.});
Polynomial<FermionField> PolyX(Coeffs);
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
// Cheb.csv(std::cout);
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
FrbGridF,SFrbGrid,mrhs,
Cheb,
JP.Nstop, JP.Ntest,
JP.Nu, JP.Nk, JP.Nm,
JP.resid,
JP.MaxIter,
IRBLdiagonaliseWithEigen);
// IRBLdiagonaliseWithLAPACK);
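// split_test=1 presumably exercises the split-grid consistency check against
// SHermOp; the double-precision variant of this test above runs with split_test=0.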
IRBL.split_test=1;
std::vector<RealD> eval(JP.Nm);
std::vector<FermionField> src(JP.Nu,FrbGridF);
if (0)
{
// in case RNG is too slow
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
FermionField src_tmp(FGridF); // single-precision grid, matching src on FrbGridF
for ( int i=0; i<JP.Nu; ++i ){
// gaussian(RNG5,src_tmp);
ComplexD rnd;
RealD re;
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
src_tmp=re;
pickCheckerboard(Odd,src[i],src_tmp);
}
RNG5.Report();
} else {
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
for ( int i=0; i<JP.Nu; ++i )
gaussian(RNG5rb,src[i]);
RNG5rb.Report();
}
std::vector<FermionField> evec(JP.Nm,FrbGridF);
for(int i=0;i<1;++i){
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
};
int Nconv;
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
Grid_finalize();
}


@@ -35,26 +35,45 @@ template<typename Action>
struct Setup{};
template<>
-struct Setup<GparityMobiusFermionD>{
-static GparityMobiusFermionD* getAction(LatticeGaugeField &Umu,
+struct Setup<GparityMobiusFermionF>{
+static GparityMobiusFermionF* getAction(LatticeGaugeFieldF &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
-RealD mass=0.01;
+RealD mass=0.00054;
RealD M5=1.8;
RealD mob_b=1.5;
GparityMobiusFermionD ::ImplParams params;
std::vector<int> twists({1,1,1,0});
params.twists = twists;
-return new GparityMobiusFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
+return new GparityMobiusFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
}
};
template<>
-struct Setup<DomainWallFermionD>{
-static DomainWallFermionD* getAction(LatticeGaugeField &Umu,
+struct Setup<DomainWallFermionF>{
+static DomainWallFermionF* getAction(LatticeGaugeFieldF &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
-RealD mass=0.01;
+RealD mass=0.00054;
RealD M5=1.8;
-return new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
+return new DomainWallFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
}
};
+template<>
+struct Setup<MobiusFermionF>{
+static MobiusFermionF* getAction(LatticeGaugeFieldF &Umu,
+GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
+RealD mass=0.00054;
+RealD M5=1.8;
+RealD mob_b=1.5;
+std::vector<Complex> boundary = {1,1,1,-1};
+MobiusFermionF::ImplParams Params(boundary);
+std::cout << GridLogMessage << "mass "<<mass<<std::endl;
+std::cout << GridLogMessage << "M5 "<<M5<<std::endl;
+std::cout << GridLogMessage << "mob_b "<<mob_b<<std::endl;
+return new MobiusFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,Params);
+}
+};
@@ -63,48 +82,70 @@ struct Setup<DomainWallFermionD>{
template<typename Action>
void run(){
typedef typename Action::FermionField FermionField;
-const int Ls=8;
+const int Ls=12;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n",UGrid,UrbGrid,FGrid,FrbGrid);
GridCartesian* UGridF = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd, vComplexF::Nsimd()), GridDefaultMpi());
GridRedBlackCartesian* UrbGridF = SpaceTimeGrid::makeFourDimRedBlackGrid(UGridF);
GridCartesian* FGridF = SpaceTimeGrid::makeFiveDimGrid(Ls, UGridF);
GridRedBlackCartesian* FrbGridF = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGridF);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
-GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
-GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
-GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5);
+GridParallelRNG RNG5(FGridF); RNG5.SeedFixedIntegers(seeds5);
+GridParallelRNG RNG4(UGridF); RNG4.SeedFixedIntegers(seeds4);
+GridParallelRNG RNG5rb(FrbGridF); RNG5rb.SeedFixedIntegers(seeds5); // seed RNG5rb itself, not RNG5 again
LatticeGaugeField Umu(UGrid);
-SU<Nc>::HotConfiguration(RNG4, Umu);
+// SU<Nc>::HotConfiguration(RNG4, Umu);
+FieldMetaData header;
+std::string file("./config");
-Action *action = Setup<Action>::getAction(Umu,FGrid,FrbGrid,UGrid,UrbGrid);
+// int precision32 = 0;
+// int tworow = 0;
+// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
+NerscIO::readConfiguration(Umu,header,file);
+LatticeGaugeFieldF UmuF(UGridF);
+precisionChange(UmuF, Umu);
+Action *action = Setup<Action>::getAction(UmuF,FGridF,FrbGridF,UGridF,UrbGridF);
//MdagMLinearOperator<Action,FermionField> HermOp(Ddwf);
-SchurDiagTwoOperator<Action,FermionField> HermOp(*action);
+// SchurDiagTwoOperator<Action,FermionField> HermOp(*action);
+SchurDiagOneOperator<Action,FermionField> HermOp(*action);
-const int Nstop = 30;
-const int Nk = 40;
+const int Nstop = 150;
+const int Nk = 160;
const int Np = 40;
const int Nm = Nk+Np;
const int MaxIt= 10000;
-RealD resid = 1.0e-8;
+RealD resid = 1.0e-6;
+std::cout << GridLogMessage << "Nstop "<<Nstop<<std::endl;
+std::cout << GridLogMessage << "Nk "<<Nk<<std::endl;
+std::cout << GridLogMessage << "Np "<<Np<<std::endl;
+std::cout << GridLogMessage << "resid "<<resid<<std::endl;
std::vector<double> Coeffs { 0.,-1.};
Polynomial<FermionField> PolyX(Coeffs);
-Chebyshev<FermionField> Cheby(0.2,5.,11);
+Chebyshev<FermionField> Cheby(0.0000006,5.5,4001);
+std::cout << GridLogMessage << "Cheby(0.0000006,5.5,4001) "<<std::endl;
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby,Op,Nstop,Nk,Nm,resid,MaxIt);
std::vector<RealD> eval(Nm);
-FermionField src(FrbGrid);
+FermionField src(FrbGridF);
gaussian(RNG5rb,src);
-std::vector<FermionField> evec(Nm,FrbGrid);
+std::vector<FermionField> evec(Nm,FrbGridF);
for(int i=0;i<1;i++){
std::cout << GridLogMessage <<i<<" / "<< Nm<< " grid pointer "<<evec[i].Grid()<<std::endl;
};
@@ -119,7 +160,7 @@ int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
std::string action = "GparityMobius";
std::string action = "Mobius";
for(int i=1;i<argc;i++){
if(std::string(argv[i]) == "-action"){
action = argv[i+1];
@@ -127,9 +168,11 @@ int main (int argc, char ** argv)
}
if(action == "GparityMobius"){
-run<GparityMobiusFermionD>();
+run<GparityMobiusFermionF>();
}else if(action == "DWF"){
-run<DomainWallFermionD>();
+run<DomainWallFermionF>();
+}else if(action == "Mobius"){
+run<MobiusFermionF>();
}else{
std::cout << "Unknown action" << std::endl;
exit(1);


@@ -0,0 +1,122 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_cg_prec.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
//using namespace std;
using namespace Grid;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
const int Ls=12;
std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridCartesian * UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeFermionD src(FGrid); random(RNG5,src);
LatticeFermionD result(FGrid); result=Zero();
LatticeGaugeFieldD Umu(UGrid);
LatticeGaugeFieldF Umu_f(UGrid_f);
SU<Nc>::HotConfiguration(RNG4,Umu);
precisionChange(Umu_f,Umu);
RealD mass=0.1;
RealD M5=1.8;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5);
LatticeFermionD src_o(FrbGrid);
LatticeFermionD result_o(FrbGrid);
LatticeFermionD result_o_2(FrbGrid);
pickCheckerboard(Odd,src_o,src);
result_o.Checkerboard() = Odd;
result_o = Zero();
result_o_2.Checkerboard() = Odd;
result_o_2 = Zero();
SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF> mCG(1.0e-8, 10000, 50, FrbGrid_f, HermOpEO_f, HermOpEO);
double t1,t2,flops;
double MdagMsiteflops = 1452; // Mobius (real coeffs)
// CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of)
double CGsiteflops = (8+4+8+4+4)*Nc*Ns ;
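// e.g. with Nc=3, Ns=4: CGsiteflops = 28*12 = 336 flops/site/iteration of CG
// linear algebra, on top of 4*1452 = 5808 flops/site for the MdagM application.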
std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
result_o = Zero();
t1=usecond();
mCG(src_o,result_o);
t2=usecond();
int iters = mCG.TotalInnerIterations; //Number of inner CG iterations
flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " SinglePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
result_o_2 = Zero();
t1=usecond();
CG(HermOpEO,src_o,result_o_2);
t2=usecond();
iters = CG.IterationsToComplete;
flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
LatticeFermionD diff_o(FrbGrid);
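// axpy_norm fuses diff_o = result_o_2 - result_o with the norm2 reduction;
// diff is the squared distance between the two solutions and should be small,
// at a scale set by the 1.0e-8 stopping tolerance (up to the operator's conditioning).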
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
MemoryManager::Print();
Grid_finalize();
}


@@ -0,0 +1,143 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/solver/Test_dwf_relupcg_prec.cc
Copyright (C) 2015
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
double relup_delta = 0.2;
for(int i=1;i<argc-1;i++){
std::string sarg = argv[i];
if(sarg == "--relup_delta"){
std::stringstream ss; ss << argv[i+1]; ss >> relup_delta;
std::cout << GridLogMessage << "Set reliable update Delta to " << relup_delta << std::endl;
}
}
const int Ls=12;
{
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
GridCartesian * UGrid_f = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid_f = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid_f);
GridCartesian * FGrid_f = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid_f);
GridRedBlackCartesian * FrbGrid_f = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid_f);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeFermionD src(FGrid); random(RNG5,src);
LatticeFermionD result(FGrid); result=Zero();
LatticeGaugeFieldD Umu(UGrid);
LatticeGaugeFieldF Umu_f(UGrid_f);
SU<Nc>::HotConfiguration(RNG4,Umu);
precisionChange(Umu_f,Umu);
RealD mass=0.1;
RealD M5=1.8;
DomainWallFermionD Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
DomainWallFermionF Ddwf_f(Umu_f,*FGrid_f,*FrbGrid_f,*UGrid_f,*UrbGrid_f,mass,M5);
LatticeFermionD src_o(FrbGrid);
LatticeFermionD result_o(FrbGrid);
LatticeFermionD result_o_2(FrbGrid);
pickCheckerboard(Odd,src_o,src);
result_o.Checkerboard() = Odd;
result_o = Zero();
result_o_2.Checkerboard() = Odd;
result_o_2 = Zero();
SchurDiagMooeeOperator<DomainWallFermionD,LatticeFermionD> HermOpEO(Ddwf);
SchurDiagMooeeOperator<DomainWallFermionF,LatticeFermionF> HermOpEO_f(Ddwf_f);
std::cout << GridLogMessage << "::::::::::::: Starting mixed CG" << std::endl;
ConjugateGradientReliableUpdate<LatticeFermionD,LatticeFermionF> mCG(1e-8, 10000, relup_delta, FrbGrid_f, HermOpEO_f, HermOpEO);
double t1,t2,flops;
double MdagMsiteflops = 1452; // Mobius (real coeffs)
// CG overhead: 8 inner product, 4+8 axpy_norm, 4+4 linear comb (2 of)
double CGsiteflops = (8+4+8+4+4)*Nc*Ns ;
std:: cout << " MdagM site flops = "<< 4*MdagMsiteflops<<std::endl;
std:: cout << " CG site flops = "<< CGsiteflops <<std::endl;
int iters, iters_cleanup, relups, tot_iters;
for(int i=0;i<10;i++){
result_o = Zero();
t1=usecond();
mCG(src_o,result_o);
t2=usecond();
iters = mCG.IterationsToComplete; //Number of single prec CG iterations
iters_cleanup = mCG.IterationsToCleanup;
relups = mCG.ReliableUpdatesPerformed;
tot_iters = iters + iters_cleanup + relups; //relup cost MdagM application in double
flops = MdagMsiteflops*4*FrbGrid->gSites()*tot_iters;
flops+= CGsiteflops*FrbGrid->gSites()*tot_iters;
std::cout << " SinglePrecision single prec iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " SinglePrecision double prec cleanup iterations/sec "<< iters_cleanup/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " SinglePrecision reliable updates/sec "<< relups/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " SinglePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
}
std::cout << GridLogMessage << "::::::::::::: Starting regular CG" << std::endl;
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
for(int i=0;i<1;i++){
result_o_2 = Zero();
t1=usecond();
CG(HermOpEO,src_o,result_o_2);
t2=usecond();
iters = CG.IterationsToComplete;
flops = MdagMsiteflops*4*FrbGrid->gSites()*iters;
flops+= CGsiteflops*FrbGrid->gSites()*iters;
std::cout << " DoublePrecision iterations/sec "<< iters/(t2-t1)*1000.*1000.<<std::endl;
std::cout << " DoublePrecision GF/s "<< flops/(t2-t1)/1000.<<std::endl;
}
// MemoryManager::Print();
LatticeFermionD diff_o(FrbGrid);
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
std::cout << GridLogMessage << "::::::::::::: Diff between mixed and regular CG: " << diff << std::endl;
}
MemoryManager::Print();
Grid_finalize();
}