mirror of https://github.com/paboyle/Grid.git synced 2025-06-12 20:27:06 +01:00

Compare commits


109 Commits

Author SHA1 Message Date
9203126aa5 Scripts 2025-06-11 15:30:16 +02:00
f90ba4712a Update for Jupiter 2025-06-11 15:24:34 +02:00
3737a24096 Updated python output 2025-06-03 14:09:29 -04:00
d418f78352 Making running on Aurora more debuggable 2025-05-23 20:58:16 +00:00
25163998a0 Makes SYCL compiler happy 2025-05-23 20:57:11 +00:00
dc546aaa4b Updated config options for BNL cluster 2025-05-13 18:44:47 -04:00
5364d580c9 Output chirality, eigenvector density files and python source lego plot 2025-05-13 18:44:47 -04:00
2a9a6347e3 Do not require Grid format RNGs, and also add the 5Li reporting 2025-05-13 18:44:47 -04:00
cfdb56f314 Run measurements at t=0 too 2025-05-13 18:44:46 -04:00
b517e88db3 Update README 2025-05-13 16:49:21 -04:00
bb317aba8d Lattice = for sycl 2025-05-13 12:50:58 +00:00
644cc6647e JSON update 2025-05-13 12:50:58 +00:00
72397ce23b SYCL interface change 2025-05-13 12:50:58 +00:00
d60a80c098 Fixes and visualisation 2025-04-29 18:04:23 -04:00
bb8b6d9d73 Fix 2025-04-29 18:04:04 -04:00
677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGaugeAction deriv
2025-04-24 14:39:10 -04:00
c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
b75809ed61 Update README 2025-04-24 14:27:22 -04:00
ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
233150d93f Bug fix for non-accelerator-aware MPI; thanks to Shuhei for finding it. 2025-04-24 11:40:46 -04:00
7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
938c47480f Updated compile on Frontier.
Unsatisfactory hacks
2025-04-04 18:35:06 -04:00
3811d19298 Fence 2025-04-04 18:35:06 -04:00
83a3ab6b6f Barrier -- not 100% sure this was needed 2025-04-04 18:35:05 -04:00
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
9eae8fca5d Size output 2025-04-04 18:35:05 -04:00
882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
e9177e4af3 Blas compatibility 2025-03-13 08:48:23 +00:00
d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
19f9378b98 Should work on Aurora now 2025-03-11 13:50:43 +00:00
785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
3d014864e2 Making LLVM happy 2025-03-06 14:19:25 -05:00
1d22841811 Working on aurora, GPT issue turned up is fixed 2025-03-06 03:20:18 +00:00
a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
b1ba209696 Latest upstream with no-su3 patch and modified Sp_WilsonFunfFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging no-su3 patch
2025-02-24 11:38:42 +00:00
cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regression test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after reverting too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
d2dd8f54e2 Fixing after reverting too much! 2025-02-17 17:32:27 +00:00
7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce through host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not put has kinder barrier sequence for NVLINK type access as when
GET is done, I can use it without barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issueing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This is somewhat better timing for Barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for non-GPU-aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not put for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform the memory manager of
deletion, so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
e99d42404e Removing the regression test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
110 changed files with 4611 additions and 592 deletions

View File

@ -191,7 +191,7 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
std::cout << "CPU view" << std::endl;
//std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -215,7 +215,7 @@ public:
else if ( sign == forward ) div = 1.0;
else assert(0);
std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +229,7 @@ public:
}
// Barrel shift and collect global pencil
std::cout << GridLogPerformance<<"Making pencil" << std::endl;
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@ -251,7 +251,7 @@ public:
}
}
std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@ -274,7 +274,7 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +291,7 @@ public:
}
result = result*div;
std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif
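
The hunks above only silence the per-stage GridLogPerformance prints; the FFT itself still follows the lifecycle visible in the surrounding code: build an FFTW plan over the gathered pencil, execute it once per orthogonal site, scale the result by div, then destroy the plan. A standalone sketch of that plan/execute/destroy lifecycle using plain FFTW3, illustrative only and independent of Grid's FFTW<scalar> wrapper:

#include <fftw3.h>
#include <complex>
#include <vector>

int main(void) {
  const int N = 32;                               // one pencil of length N
  std::vector<std::complex<double>> in(N), out(N);
  fftw_plan p = fftw_plan_dft_1d(N,
                  reinterpret_cast<fftw_complex*>(in.data()),
                  reinterpret_cast<fftw_complex*>(out.data()),
                  FFTW_FORWARD, FFTW_ESTIMATE);   // plan once ...
  for (int site = 0; site < 4; site++)            // ... execute per orthogonal site ...
    fftw_execute(p);
  fftw_destroy_plan(p);                           // ... destroy when done
  return 0;
}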

View File

@ -277,6 +277,38 @@ public:
assert(0);
}
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several
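
The new ShiftedNonHermitianLinearOperator applies the underlying matrix and then adds shift*in, i.e. it represents M + s·1 for both Op and AdjOp. A minimal standalone sketch of the same pattern, with a plain std::vector standing in for Grid's Field type and a trivial diagonal matrix standing in for Matrix (both hypothetical, for illustration only):

#include <vector>
#include <cassert>

struct DiagMatrix {                       // hypothetical stand-in for a Grid Matrix type
  std::vector<double> d;
  void M(const std::vector<double> &in, std::vector<double> &out) const {
    out.resize(in.size());
    for (size_t i = 0; i < in.size(); i++) out[i] = d[i] * in[i];
  }
};

template<class Matrix, class Field>
struct ShiftedOp {                        // mirrors Op() above: out = M in + shift*in
  Matrix &mat; double shift;
  void Op(const Field &in, Field &out) {
    mat.M(in, out);
    for (size_t i = 0; i < in.size(); i++) out[i] += shift * in[i];
  }
};

int main() {
  DiagMatrix m{{1.0, 2.0, 3.0}};
  ShiftedOp<DiagMatrix, std::vector<double>> sop{m, 0.5};
  std::vector<double> in{1.0, 1.0, 1.0}, out;
  sop.Op(in, out);
  assert(out[0] == 1.5 && out[1] == 2.5 && out[2] == 3.5);  // (M + 0.5) applied
  return 0;
}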

View File

@ -269,7 +269,9 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;

View File

@ -208,8 +208,8 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@ -367,28 +367,67 @@ public:
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
@ -414,8 +453,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@ -514,28 +553,70 @@ public:
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
@ -661,29 +742,41 @@ public:
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
} );
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
}
@ -809,28 +902,40 @@ public:
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
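
Every precision and transpose combination receives the same guard: when beta is exactly zero the output matrix must not be read at all, because the BLAS convention allows C to be uninitialised in that case, and evaluating beta*C would propagate NaN (0*NaN is NaN) even though beta is zero. A minimal Eigen illustration of the guarded update, a sketch of the convention rather than Grid's GridBLAS interface:

#include <Eigen/Dense>
#include <cmath>

// C <- beta*C + alpha*A*B, honouring the BLAS rule that beta==0 means
// "do not read C": 0 * (uninitialised/NaN) would otherwise poison the result.
void gemm_update(double alpha, const Eigen::MatrixXd &A, const Eigen::MatrixXd &B,
                 double beta, Eigen::MatrixXd &C) {
  if (std::abs(beta) != 0.0)
    C = beta * C + alpha * A * B;
  else
    C = alpha * A * B;          // overwrite; never read the old C
}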

View File

@ -245,9 +245,10 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@ -255,6 +256,7 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
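
The revised estimator for evalMaxApprox replaces the Rayleigh quotient <x, A x>/<x, x> with a norm ratio built from the squared operator, taking the square root so the value stays on the scale of A rather than A². Under the power iteration in the surrounding loop (src_n is replaced by tmp each pass), this converges to the dominant eigenvalue magnitude of the Hermitian operator:

\[
  \lambda_{\max} \;\approx\; \sqrt{\frac{\langle A x_n,\, A x_n\rangle}{\langle x_n,\, x_n\rangle}}
  \;=\; \frac{\lVert A x_n \rVert}{\lVert x_n \rVert},
  \qquad x_{n+1} = A x_n .
\]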

View File

@ -97,7 +97,7 @@ public:
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
ConjugateGradient<FineField> CG(1.0e-3,400,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
@ -110,7 +110,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]);
@ -146,7 +146,7 @@ public:
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<3;i++){
for(int i=0;i<2;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;

View File

@ -441,8 +441,20 @@ public:
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
@ -458,11 +470,9 @@ public:
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
// for(int s=0;s<Subspace.subspace.size();s++){
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
// }
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
@ -542,7 +552,7 @@ public:
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*Subspace.subspace[i];
phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
@ -555,7 +565,7 @@ public:
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
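
The two overloads differ only in which subspaces sit on the left and right of the projection: the phased vectors are built from V while the block projection is taken against U, so the two-argument form is a Petrov-Galerkin coarsening, and the single-argument form recovers the Galerkin case by passing the same subspace for both. Schematically:

\[
  A_{\mathrm{coarse}} = U^{\dagger} A\, V \quad \text{(Petrov--Galerkin)},
  \qquad
  A_{\mathrm{coarse}} = V^{\dagger} A\, V \quad \text{(Galerkin, the single-argument overload with } U = V\text{)} .
\]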

View File

@ -69,7 +69,7 @@ public:
}
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
@ -175,10 +175,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
/*
template<class T> class vecView
{
protected:
@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
#define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
NAMESPACE_END(Grid);
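
These typedefs bind one std::vector interface to different allocators, so call sites choose where storage lives (host, unified/UVM, or device-resident) without changing container code; the vecView machinery is commented out because, as the commit log notes, a bare std::vector cannot tell the memory manager when it is destroyed. A stripped-down illustration of the typedef pattern, with a hypothetical hostAllocator standing in for Grid's alignedAllocator:

#include <cstdlib>
#include <vector>

template<class T>
struct hostAllocator {                    // hypothetical stand-in for alignedAllocator<T>
  using value_type = T;
  T*   allocate(std::size_t n)       { return static_cast<T*>(std::malloc(n * sizeof(T))); }
  void deallocate(T* p, std::size_t) { std::free(p); }
};
template<class T, class U> bool operator==(const hostAllocator<T>&, const hostAllocator<U>&) { return true;  }
template<class T, class U> bool operator!=(const hostAllocator<T>&, const hostAllocator<U>&) { return false; }

template<class T> using hostVector = std::vector<T, hostAllocator<T>>;  // same shape as the Grid typedefs

int main() {
  hostVector<double> v(16, 1.0);          // identical interface; the allocator decides placement
  return (v.size() == 16) ? 0 : 1;
}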

View File

@ -9,6 +9,7 @@ static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...)
//#define mprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
@ -109,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -119,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@ -139,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return;
@ -153,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
@ -167,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@ -182,7 +183,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else {
assert(0);
}
@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n");
dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache);
}
@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache);
} else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)

View File

@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
@ -147,7 +149,8 @@ public:
sizeof(obj),d*100+p);
}
CommsComplete(list);
if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
@ -192,6 +195,11 @@ public:
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,

View File

@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
@ -259,32 +260,39 @@ CartesianCommunicator::~CartesianCommunicator()
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
@ -362,8 +370,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes)
{
std::vector<MpiCommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor;
int ierr;
@ -379,9 +385,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@ -399,6 +402,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
@ -440,8 +445,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
}
// This is a NVLINK PUT
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
@ -450,9 +462,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
return off_node_bytes;
@ -461,7 +475,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
if (nreq==0) return;
@ -561,53 +575,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
#else
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
const int chunks=1;
for(int n=0;n<chunks;n++){
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#endif
}
}
return off_node_bytes;
}
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
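
The two poll loops implement the pipelining described in the comment above: a send-side packet stays in state InterNodeXmit until its device-to-host copy event completes, at which point the MPI_Isend is posted and it moves to InterNodeXmitISend; a receive-side packet stays in InterNodeRecv until MPI_Test succeeds, after which the host-to-device copy is queued and it becomes InterNodeReceiveHtoD. A stripped-down send-side sketch, assuming a simplified Packet struct rather than Grid's CommsRequest_t:

#include <mpi.h>
#include <vector>

enum State { Xmit, XmitISend };          // mirrors InterNodeXmit / InterNodeXmitISend
struct Packet {
  State state;
  bool  dtoh_done;                       // stands in for acceleratorEventIsComplete(ev)
  void *host_buf; int bytes, dest, tag;
  MPI_Request req;
};

void PollDtoH(std::vector<Packet> &list, MPI_Comm comm) {
  int pending;
  do {
    pending = 0;
    for (auto &p : list) {
      if (p.state != Xmit) continue;
      if (p.dtoh_done) {                 // DtoH finished: start the send while other copies drain
        MPI_Isend(p.host_buf, p.bytes, MPI_CHAR, p.dest, p.tag, comm, &p.req);
        p.state = XmitISend;
      } else {
        pending++;                       // keep polling until every staged copy has completed
      }
    }
  } while (pending);
}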
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
@ -644,69 +710,89 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
// static int printed;
// if((printed<8) && this->IsBoss() ) {
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
// printed++;
// }
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
// Find the send in the prepared list
int list_idx=-1;
for(int idx = 0; idx<list.size();idx++){
if ( (list[idx].device_buf==xmit)
&&(list[idx].PacketType==InterNodeXmit)
&&(list[idx].bytes==xbytes) ) {
list_idx = idx;
host_xmit = list[idx].host_buf;
}
}
assert(list_idx != -1); // found it
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[list_idx].req = xrq; // Update the MPI request in the list
off_node_bytes+=xbytes;
#endif
} else {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
std::vector<MPI_Request> MpiRequests(nreq);
std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
for(int r=0;r<nreq;r++){
MpiRequests[r] = list[r].req;
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
}
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
assert(ierr==0);
for(int r=0;r<nreq;r++){
if ( list[r].PacketType==InterNodeRecv ) {
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
}
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
acceleratorCopySynchronise(); // Complete all pending copy transfers
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
this->StencilBarrier();
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
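
The barrier placement follows from the GET-versus-PUT distinction threaded through this file: with NVLINK_GET defined the receiver pulls data out of the sender's shared-memory window, so once its own copy has completed its data is ready and only the single node barrier is needed; on the PUT path the sender writes into the receiver's window, and the extra StencilBarrier is required so each rank knows its neighbours have finished filling its receive buffers. In outline (a simplified contrast of the data flow, not Grid's actual call sequence):

// PUT (NVLINK_GET undefined):            GET (NVLINK_GET defined):
//   sender:  copy(xmit -> shm_of(dest))    receiver: copy(shm_of(from) -> recv)
//   both:    extra StencilBarrier so the   receiver: completion of its own copy
//            receiver knows its buffer               already guarantees the data
//            has been filled                         is ready; no extra barrier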
////////////////////////////////////////////
@ -715,6 +801,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void)
{
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@ -722,11 +809,13 @@ void CartesianCommunicator::StencilBarrier(void)
//}
void CartesianCommunicator::Barrier(void)
{
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
@ -745,6 +834,7 @@ void CartesianCommunicator::BarrierWorld(void){
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
@ -767,6 +857,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.

View File

@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return xbytes+rbytes;
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,

View File

@ -50,12 +50,30 @@ typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
@ -119,7 +137,7 @@ public:
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};

View File

@ -542,12 +542,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
printf("Host buffer allocate for GPU non-aware MPI\n");
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes);
//#else
// bcopy(src,dest,bytes);
//#endif
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
}
ShmBufferFreeAll();
@ -989,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
//SharedMemoryTest();
// SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@ -1011,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
}
}
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}
void *SharedMemory::ShmBuffer(int rank)

View File

@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
// acceleratorCopyToDevice(src,dest,bytes);
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@ -68,7 +68,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
#if 1
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
@ -125,7 +125,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0;
@ -156,16 +160,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
@ -226,12 +243,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
@ -283,11 +304,22 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
xbytes+=bytes;
grid->Barrier();
@ -311,234 +343,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
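
For non-accelerator-aware MPI, both Cshift paths above now bounce every message through host memory: gather into a device buffer, copy device-to-host, run the MPI exchange on the host buffers, then copy host-to-device into the receive buffer. A self-contained sketch of that bounce pattern; CUDA is assumed here purely for concreteness, whereas in the diff the copies are Grid's acceleratorCopyFromDevice / acceleratorCopyToDevice and the exchange is grid->SendToRecvFrom:

#include <mpi.h>
#include <cuda_runtime.h>
#include <vector>

void bounce_sendrecv(const void *d_send, void *d_recv, int bytes,
                     int to, int from, MPI_Comm comm) {
  std::vector<char> h_send(bytes), h_recv(bytes);
  cudaMemcpy(h_send.data(), d_send, bytes, cudaMemcpyDeviceToHost);   // DtoH stage
  MPI_Sendrecv(h_send.data(), bytes, MPI_CHAR, to,   0,
               h_recv.data(), bytes, MPI_CHAR, from, 0,
               comm, MPI_STATUS_IGNORE);                              // exchange on host buffers
  cudaMemcpy(d_recv, h_recv.data(), bytes, cudaMemcpyHostToDevice);   // HtoD stage
}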
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
}
}
#endif
NAMESPACE_END(Grid);
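The SIMD-split path above (in both the retained and the removed variants) maps each SIMD lane i to a peer lane and a possibly off-node rank using only integer arithmetic on the reduced coordinates. The toy program below replays that arithmetic for concrete numbers so the mapping is visible; Nsimd, permute_type, rd, pd and the shift are assumptions chosen for illustration, not values taken from a real grid.

#include <cstdio>

int main() {
  // Illustrative values only; in Grid these come from the GridBase object.
  const int Nsimd = 8, permute_type = 2;
  const int rd = 4, ld = 2 * rd, pd = 2;   // simd_layout==2, so ld = 2*rd
  const int sshift = 3;                    // checkerboard-adjusted shift
  const int x = 1;                         // outer coordinate of the plane being sent
  const int inner_bit = (Nsimd >> (permute_type + 1));

  for (int i = 0; i < Nsimd; i++) {
    int ic       = (i & inner_bit) ? 1 : 0;   // which half of the split direction
    int my_coor  = rd * ic + x;               // logical coordinate of this lane
    int nbr_coor = my_coor + sshift;
    int nbr_proc = (nbr_coor / ld) % pd;      // relative shift in processors
    int nbr_ic   = (nbr_coor % ld) / rd;      // inner coordinate of the peer
    int nbr_ox   = (nbr_coor % rd);           // outer coordinate of the peer
    int nbr_lane = (i & (~inner_bit));
    if (nbr_ic) nbr_lane |= inner_bit;
    printf("lane %d -> peer lane %d, off-node %d, peer outer coord %d\n",
           i, nbr_lane, nbr_proc, nbr_ox);
  }
  return 0;
}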

View File

@ -236,7 +236,7 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 0
#if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
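The flipped #if above routes scalar assignment through a one-element device buffer filled with acceleratorPut, so the value is resident in device memory before the site-filling kernel runs. A hedged round-trip sketch of the helper pair is below; it is only meaningful inside a Grid translation unit, and the double payload is made up.

// Illustration only: move one host object into device storage and read it back.
deviceVector<double> dval(1);           // device-resident, one element
double hval = 3.14;
acceleratorPut(dval[0], hval);          // copies sizeof(double) bytes host -> device
double back = acceleratorGet(dval[0]);  // device -> host round trip
assert(back == hval);                   // needs <cassert>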

View File

@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();

View File

@ -466,6 +466,12 @@ public:
static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<MpiCommsRequest_t> fwd_req;
std::vector<MpiCommsRequest_t> bwd_req;
@ -495,9 +501,16 @@ public:
t_gather+=usecond()-t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#endif
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
@ -508,9 +521,16 @@ public:
t_gather+= usecond() - t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#endif
t_comms+=usecond()-t;
}
@ -533,8 +553,13 @@ public:
t=usecond();
grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
@ -543,6 +568,11 @@ public:
t=usecond();
grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
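These hunks add the same host staging to the padded-cell halo exchange, but here the sends are started with SendToRecvFromBegin and only completed later, so the host-to-device copy of the received faces is deferred until after CommsComplete. A minimal sketch of that ordering with standard non-blocking MPI is below; the lambda-free stand-ins for the device copies and the buffer layout are assumptions for illustration.

// Sketch of the deferred-copy ordering: post transfers on host buffers,
// overlap other work, copy to the device only after completion.
#include <mpi.h>
#include <vector>

void staged_halo(const std::vector<double>& d_send, std::vector<double>& d_recv,
                 int to, int from, MPI_Comm comm)
{
  std::vector<double> h_send(d_send);            // stand-in for acceleratorCopyFromDevice
  std::vector<double> h_recv(d_recv.size());
  std::vector<MPI_Request> req(2);

  // Begin phase: post the transfers on host memory.
  MPI_Irecv(h_recv.data(), (int)h_recv.size(), MPI_DOUBLE, from, 0, comm, &req[0]);
  MPI_Isend(h_send.data(), (int)h_send.size(), MPI_DOUBLE, to,   0, comm, &req[1]);

  // ... overlap: gather the next face, start the opposite direction, etc. ...

  // Complete phase: only after MPI finishes is the host buffer copied to the device.
  MPI_Waitall(2, req.data(), MPI_STATUSES_IGNORE);
  d_recv.assign(h_recv.begin(), h_recv.end());   // stand-in for acceleratorCopyToDevice
}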

View File

@ -0,0 +1,196 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Copyright (C) 2020 - 2025
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion5D<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
int n_rhs;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);
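A hedged sketch of how the new 5d compact clover operator might be instantiated, using the grid factory functions and the CompactWilsonCloverFermion5DD typedef added later in this compare. The lattice size, Ls, seeds and coupling values are placeholders, not recommendations.

// Assumed-usage sketch only; parameter values are illustrative.
#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char** argv) {
  Grid_init(&argc, &argv);

  const int Ls = 8; // illustrative fifth-dimension extent
  GridCartesian*         UGrid   = SpaceTimeGrid::makeFourDimGrid(
      GridDefaultLatt(), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian* UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian*         FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian* FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  LatticeGaugeFieldD Umu(UGrid);
  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers({1, 2, 3, 4});
  SU<Nc>::HotConfiguration(RNG4, Umu);          // random start, illustration only

  RealD mass = 0.1, csw_r = 1.0, csw_t = 1.0, cF = 1.0; // placeholder couplings
  CompactWilsonCloverFermion5DD Dcc(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
                                    mass, csw_r, csw_t, cF);

  LatticeFermionD src(FGrid), res(FGrid);
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers({5, 6, 7, 8});
  gaussian(RNG5, src);
  Dcc.M(src, res);                              // apply the full operator once

  Grid_finalize();
}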

View File

@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;

View File

@ -484,6 +484,11 @@ public:
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
#ifdef NVLINK_GET
this->_grid->StencilBarrier(); // He can now get my local gather, I can get his
// Synch shared memory on a single node; could use an asynchronous barrier here and defer the check
// Or issue the barrier AFTER the DMA is running
#endif
}
};

View File

@ -91,13 +91,13 @@ public:
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

View File

@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const ImplParams& impl_p)
: WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&FourDimGrid), Triangle(&FourDimGrid)
, DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid)
, DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid)
, DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid)
, DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
, DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid)
, Tmp(&FiveDimGrid)
, BoundaryMask(&FiveDimGrid)
, BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
{
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
csw_r *= 0.5;
csw_t *= 0.5;
//if (clover_anisotropy.isAnisotropic)
// csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (fixedBoundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->fixedBoundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
} else {
MooeeInternal(in, out, DiagonalEven, TriangleEven);
}
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
} else {
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
}
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!fixedBoundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
out.Checkerboard() = in.Checkerboard();
conformable(in, out);
CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
double t0 = usecond();
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
// Initialize temporary variables
double t1 = usecond();
conformable(_Umu.Grid(), this->GaugeGrid());
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
double t3 = usecond();
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Instantiate the clover term
// - In case of the standard clover the mass term is added
// - In case of the exponential clover the clover term is exponentiated
double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Modify the clover term at the temporal boundaries in case of open boundary conditions
double t6 = usecond();
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
// Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
// TODO: For now this inversion is explicitly done on the CPU
double t7 = usecond();
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
pickCheckerboard(Odd, TriangleOdd, Triangle);
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);
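ImportGauge assembles the compact clover term from the six field-strength components produced by WilsonLoops::FieldStrength. A hedged LaTeX summary of the standard (non-exponential) helper path, with the factor 1/2 that the constructor folds into csw_r and csw_t written out explicitly:

C(x) = (4 + M_5)\,\mathbb{1}
  + \frac{c_{sw,r}}{2} \sum_{\mu<\nu\,\in\,\{x,y,z\}} \sigma_{\mu\nu} F_{\mu\nu}(x)
  + \frac{c_{sw,t}}{2} \sum_{k\,\in\,\{x,y,z\}} \sigma_{k t} F_{k t}(x),

after which CompactHelpers::ConvertLayout stores it as diagonal plus triangle fields and CloverHelpers::InvertClover produces the inverse used by MooeeInv.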

View File

@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -484,6 +485,54 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
Dhop(in,out,dag); // -0.5 is included
axpy(out,4.0-M5,in,out);
}
template <class Impl>
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(4.0 + M5);
out = scal * in;
}
template <class Impl>
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
out = (1.0/(4.0 + M5))*in;
}
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in,out);
}
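The even-odd helpers added above give the 5d Wilson operator its standard checkerboard block structure. A hedged LaTeX restatement of what the code implements (D_hop denotes the hopping term applied by DhopOE/DhopEO; the Schur-complement line is the usual consequence for red-black solvers, not code that appears in this diff):

M = \begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix},
\qquad M_{ee} = M_{oo} = (4+M_5)\,\mathbb{1},
\qquad M_{eo} = D_{\rm hop}^{eo},\; M_{oe} = D_{\rm hop}^{oe},

M_{ee}^{-1} = \frac{1}{4+M_5}\,\mathbb{1},
\qquad
\hat{M} = M_{oo} - M_{oe} M_{ee}^{-1} M_{eo}
        = (4+M_5) - \frac{D_{\rm hop}^{oe}\, D_{\rm hop}^{eo}}{4+M_5}.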
template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)

View File

@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
@ -517,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
// dependent on result of merge
// // dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}

View File

@ -0,0 +1,45 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master

View File

@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master

View File

@ -62,7 +62,7 @@ do
done
done
CC_LIST="CompactWilsonCloverFermionInstantiation"
CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"
for impl in $COMPACT_WILSON_IMPL_LIST
do

View File

@ -76,27 +76,27 @@ public:
return action;
};
virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
//extend Ta to include Lorentz indexes
RealD factor_p = c_plaq/RealD(Nc)*0.5;
RealD factor_r = c_rect/RealD(Nc)*0.5;
GridBase *grid = Umu.Grid();
GridBase *grid = U.Grid();
std::vector<GaugeLinkField> U (Nd,grid);
std::vector<GaugeLinkField> Umu (Nd,grid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
}
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);
GaugeLinkField dSdU_mu(grid);
GaugeLinkField staple(grid);
for (int mu=0; mu < Nd; mu++){
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}

View File

@ -73,20 +73,23 @@ public:
// extend Ta to include Lorentz indexes
RealD factor = 0.5 * beta / RealD(Nc);
GridBase *grid = U.Grid();
GaugeLinkField Umu(U.Grid());
GaugeLinkField dSdU_mu(U.Grid());
GaugeLinkField dSdU_mu(grid);
std::vector<GaugeLinkField> Umu(Nd, grid);
for (int mu = 0; mu < Nd; mu++) {
Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
}
Umu = PeekIndex<LorentzIndex>(U, mu);
for (int mu = 0; mu < Nd; mu++) {
// Staple in direction mu
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}
}
private:
RealD beta;
};
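With the renaming untangled, WilsonGaugeAction::deriv computes the usual plaquette force: the link times its staple sum, traceless-antihermitian projected. A hedged LaTeX restatement (Sigma_mu is the output of WilsonLoops::Staple; the prefactor matches factor = 0.5*beta/Nc in the code):

\frac{\partial S}{\partial U_\mu(x)} \;\to\; dSdU_\mu(x)
  = \frac{\beta}{2 N_c}\,\mathrm{Ta}\!\left[ U_\mu(x)\, \Sigma_\mu(x) \right],
\qquad
\Sigma_\mu(x) = \sum_{\nu \neq \mu} \left( \text{upper and lower staples in the } \mu\nu \text{ plane} \right).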

View File

@ -111,8 +111,8 @@ public:
};
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@ -75,7 +75,7 @@ public:
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng, smr;
this->build_filenames(traj, Params, config, rng);
this->build_filenames(traj, Params, config, smr, rng);
GridBase *grid = SmartConfig.get_U(false).Grid();
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
@ -102,7 +102,7 @@ public:
if ( Params.saveSmeared ) {
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(smr);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
_IldgWriter.close();
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
@ -118,8 +118,8 @@ public:
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@ -107,8 +107,8 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@ -62,15 +62,15 @@ accelerator_inline int stencilIndex(int mu, int nu) {
/*! @brief structure holding the link treatment */
struct SmearingParameters{
SmearingParameters(){}
struct HISQSmearingParameters{
HISQSmearingParameters(){}
Real c_1; // 1 link
Real c_naik; // Naik term
Real c_3; // 3 link
Real c_5; // 5 link
Real c_7; // 7 link
Real c_lp; // 5 link Lepage
SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: c_1(c1),
c_naik(cnaik),
c_3(c3),
@ -86,7 +86,7 @@ class Smear_HISQ : public Gimpl {
private:
GridCartesian* const _grid;
SmearingParameters _linkTreatment;
HISQSmearingParameters _linkTreatment;
public:
@ -117,7 +117,7 @@ public:
// IN--u_thin
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
SmearingParameters lt = this->_linkTreatment;
HISQSmearingParameters lt = this->_linkTreatment;
auto grid = this->_grid;
// Create a padded cell of extra padding depth=1 and fill the padding.

View File

@ -252,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
out = in;
RealD taus = 0.;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(0,taus,out);
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
auto start = std::chrono::high_resolution_clock::now();
evolve_step(out, taus);
@ -336,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
RealD taus = 0.;
RealD eps = init_epsilon;
unsigned int step = 0;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(step,taus,out);
do{
int step_success = evolve_step_adaptive(out, taus, eps);
step += step_success; //step will not be incremented if the integration step fails
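Since both drivers now call every registered measurement at flow time zero, a hook added to the functions container sees the unflowed gauge field before the first integration step. A hedged registration sketch follows; the WilsonFlow constructor argument order, addMeasurement and energyDensityPlaquette are assumed to follow the usual Grid conventions and should be checked against your Grid version, and Umu is an existing gauge field.

// Assumed-usage sketch only: register a callback that now also fires at step 0, tau = 0.
WilsonFlow<PeriodicGimplR> wflow(/*epsilon=*/0.01, /*Nstep=*/100, /*meas_interval=*/1);
wflow.addMeasurement(1, [](int step, RealD tau, const LatticeGaugeField &U) {
  std::cout << GridLogMessage << "step " << step << " tau " << tau
            << " plaquette energy density "
            << WilsonFlow<PeriodicGimplR>::energyDensityPlaquette(tau, U) << std::endl;
});
LatticeGaugeField Uflow(Umu.Grid());
wflow.smear(Uflow, Umu);   // the first callback invocation is the new tau = 0 measurement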

View File

@ -292,19 +292,21 @@ public:
//////////////////////////////////////////////////
// the sum over all nu-oriented staples for nu != mu on each site
//////////////////////////////////////////////////
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {
GridBase *grid = Umu.Grid();
std::vector<GaugeMat> U(Nd, grid);
std::vector<GaugeMat> Umu(Nd, U.Grid());
for (int d = 0; d < Nd; d++) {
U[d] = PeekIndex<LorentzIndex>(Umu, d);
Umu[d] = PeekIndex<LorentzIndex>(U, d);
}
Staple(staple, U, mu);
Staple(staple, Umu, mu);
}
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
staple = Zero();
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {
autoView(staple_v, staple, AcceleratorWrite);
accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
staple_v[i] = Zero();
});
for (int nu = 0; nu < Nd; nu++) {
@ -318,12 +320,12 @@ public:
// |
// __|
//
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[nu], nu,
Umu[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
mu);
// __
@ -333,8 +335,8 @@ public:
//
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu], nu,
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
Gimpl::CovShiftBackward(Umu[nu], nu,
Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
}
}
}

View File

@ -363,12 +363,16 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so no one is running slow and still using the receive buffer
// But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@ -376,8 +380,15 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
// std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
// std::cout << "Communicate CopySynch "<<std::endl;
// _grid->Barrier();
acceleratorCopySynchronise();
// Starts intranode
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl;
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@ -385,6 +396,7 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
FlightRecorder::StepLog("Communicate begin has finished");
// Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){
@ -395,7 +407,14 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Complete "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Start communicate complete");
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
// std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
@ -428,6 +447,7 @@ public:
Communicate();
CommsMergeSHM(compress);
CommsMerge(compress);
accelerator_barrier();
}
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@ -483,6 +503,9 @@ public:
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
// accelerator_barrier();
//////////////////////////////////
// I will overwrite my send buffers
//////////////////////////////////
_grid->StencilBarrier();// Synch shared memory on a single node
assert(source.Grid()==_grid);
@ -496,7 +519,11 @@ public:
HaloGatherDir(source,compress,point,face_idx);
}
accelerator_barrier(); // All my local gathers are complete
// _grid->StencilBarrier();// Synch shared memory on a single node
#ifdef NVLINK_GET
_grid->StencilBarrier(); // He can now get my local gather, I can get his
// Synch shared memory on a single node; could use an asynchronous barrier here and defer the check
// Or issue the barrier AFTER the DMA is running
#endif
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
}
@ -535,6 +562,7 @@ public:
coalescedWrite(to[j] ,coalescedRead(from [j]));
});
acceleratorFenceComputeStream();
// Also fenced in WilsonKernels
}
}
@ -663,7 +691,7 @@ public:
}
}
}
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
@ -683,6 +711,7 @@ public:
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@ -774,8 +803,8 @@ public:
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
for(int ii=0;ii<npoints;ii++){

View File

@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr;
};
typedef int acceleratorEvent_t;
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
return 0;
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
//auto discard=cudaStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
inline int acceleratorIsCommunicable(void *ptr)
@ -323,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
cgh.parallel_for( \
sycl::nd_range<3>(global,local), \
[=] (sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \
[[sycl::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
@ -343,11 +357,28 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
///////
// Asynch event interface
///////
typedef sycl::event acceleratorEvent_t;
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
ev.wait();
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
{
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
@ -358,8 +389,10 @@ inline int acceleratorIsCommunicable(void *ptr)
else return 0;
#endif
return 1;
}
#endif
//////////////////////////////////////////////
@ -459,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipMallocHost((void **)&ptr,bytes);
auto err = hipHostMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@ -492,23 +525,35 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
typedef int acceleratorEvent_t;
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
return 0;
}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
// auto discard=hipStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
#endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
@ -545,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
#undef GRID_SIMT
typedef int acceleratorEvent_t;
inline void acceleratorMem(void)
{
/*
@ -565,8 +612,13 @@ inline void acceleratorMem(void)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
inline void acceleratorEventWait(acceleratorEvent_t ev){}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@ -655,9 +707,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
acceleratorCopySynchronise();
}
template<class T> void acceleratorPut(T& dev,T&host)
template<class T> void acceleratorPut(T& dev,const T&host)
{
acceleratorCopyToDevice(&host,&dev,sizeof(T));
acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
}
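A minimal usage sketch (not part of the diff) of the event-returning copy interface and the const-qualified acceleratorPut shown above. It assumes the Grid headers declare everything used here; copy_example, hostBuf, N and flag are illustrative names only.
#include <cassert>
#include <Grid/Grid.h>
using namespace Grid;
void copy_example(double *hostBuf, size_t N)
{
  double *devBuf = (double *) acceleratorAllocDevice(N*sizeof(double));
  // The Asynch copies now hand back an acceleratorEvent_t completion handle
  // which the caller waits on, rather than relying on a global synchronise.
  acceleratorEvent_t ev = acceleratorCopyToDeviceAsynch(hostBuf, devBuf, N*sizeof(double));
  // ... independent host-side work could overlap with the transfer here ...
  acceleratorEventWait(ev);
  assert(acceleratorEventIsComplete(ev));
  // acceleratorPut now accepts a const host object and copies it into device storage.
  const int flag = 1;
  int *devFlag = (int *) acceleratorAllocDevice(sizeof(int));
  acceleratorPut(*devFlag, flag);
  acceleratorFreeDevice(devFlag);
  acceleratorFreeDevice(devBuf);
}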
template<class T> T acceleratorGet(T& dev)
{

View File

@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define thread_critical DO_PRAGMA(omp critical)
#ifdef GRID_OMP
inline void thread_bcopy(void *from, void *to,size_t bytes)
inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
uint64_t *ufrom = (uint64_t *)from;
const uint64_t *ufrom = (const uint64_t *)from;
uint64_t *uto = (uint64_t *)to;
assert(bytes%8==0);
uint64_t words=bytes/8;
@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
});
}
#else
inline void thread_bcopy(void *from, void *to,size_t bytes)
inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
bcopy(from,to,bytes);
}

View File

@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
Grid_default_latt,
Grid_default_mpi);
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
FlightRecorder::PrintEntireLog = 1;
FlightRecorder::ChecksumComms = 1;
FlightRecorder::ChecksumCommsSend=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@ -631,12 +638,11 @@ void Grid_debug_handler_init(void)
sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGUSR2,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
// feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
// sigaction(SIGFPE,&sa,NULL);
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
@ -651,3 +657,4 @@ void Grid_debug_handler_init(void)
}
NAMESPACE_END(Grid);

View File

@ -50,7 +50,7 @@ namespace Grid{
int64_t index64;
IndexFromCoorReversed(coor,index64,dims);
if ( index64>=2*1024*1024*1024LL ){
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
// std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
}
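// index is narrowed to a 32-bit int just below, so the linearised coordinate must stay below 2^31.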
assert(index64<2*1024*1024*1024LL);
index = (int) index64;

View File

@ -66,6 +66,7 @@ namespace Grid{
};
}
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.writeScidacFieldRecord(in,record,0); // Lexico
WR.close();
#endif
// What is the appropriate way to throw error?
@ -107,8 +108,18 @@ int main(int argc, char **argv) {
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
#if 0
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
#else
// Don't require Grid format RNGs
FieldMetaData header;
std::string file, filesmr;
file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);
NerscIO::readConfiguration(Umu,header,file);
#endif
std::cout << std::setprecision(15);
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
@ -116,6 +127,7 @@ int main(int argc, char **argv) {
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@ -165,33 +177,48 @@ int main(int argc, char **argv) {
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
//Plq = coeff * Plq;
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(qfield,tfile);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(qfield,tfile);
std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
{
// PeriodicGimplR::GaugeField Ucopy = U;
// NerscIO::writeConfiguration(Ucopy,ufile);
}
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
RealD T = real( sum(qfield) );
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
RealD E0 = real(peekSite(R,scoor));
RealD T0 = real(peekSite(qfield,scoor));
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
});
int t=WFPar.maxTau;
WF.smear(Uflow, Umu);
// NerscIO::writeConfiguration(Uflow,filesmr);
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;
std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold

View File

@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@ -220,7 +227,6 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main

View File

@ -24,14 +24,22 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@ -220,6 +228,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main

View File

@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning HMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@ -220,6 +227,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main

View File

@ -52,7 +52,7 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
int Ls=8;
int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;

View File

@ -492,17 +492,18 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t no = 50;
uint64_t ni = 100;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo);
for(uint64_t j=0;j<ni;j++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond();
t_time[i] = t1-t0;
}
@ -520,11 +521,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
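// Each timed sample now spans ni kernel calls, so the rates above are rescaled by ni
// and the per-call time reported below is timestat.mean/ni.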
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -535,6 +536,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@ -654,17 +656,19 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t no = 50;
uint64_t ni = 100;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
t0=usecond();
Ds.DhopEO(src_o,r_e,DaggerNo);
for(uint64_t j=0;j<ni;j++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond();
t_time[i] = t1-t0;
}
@ -675,11 +679,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@ -689,6 +693,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@ -792,19 +797,18 @@ public:
Dc.M(src,r);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t ni = 100;
uint64_t no = 50;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dc.M(src,r);
t1=usecond();
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
double t0=usecond();
for(uint64_t j=0;j<ni;j++){
Dc.M(src,r);
}
double t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
@ -814,20 +818,21 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
}
@ -872,7 +877,7 @@ int main (int argc, char ** argv)
int do_dslash=1;
int sel=4;
std::vector<int> L_list({8,12,16,24});
std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1;
std::vector<double> clover;

View File

@ -151,7 +151,7 @@ AC_ARG_ENABLE([tracing],
case ${ac_TRACING} in
nvtx)
AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
LIBS="${LIBS} -lnvToolsExt64_1"
LIBS="${LIBS} -lnvToolsExt"
;;
roctx)
AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])

View File

@ -93,10 +93,13 @@ int main(int argc, char ** argv)
Real coeff = (width*width) / Real(4*Iterations);
chi=kronecker;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(chi,psi);
chi = chi - coeff*psi;
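// Renormalise chi to unit norm after each smearing iteration.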
RealD n2 = norm2(chi);
chi = chi * (1.0/std::sqrt(n2));
}
std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;

View File

@ -32,15 +32,9 @@ export MPICH_OFI_NIC_POLICY=GPU
# Local vol 16.16.16.32
#
VOL 128.64.128.96
MPI 4.4.4.3
NPROC 192
mpiexec -np 192 -ppn 12 -envall ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 4.4.4.3 --grid 128.64.128.96 --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap
LX=32
LX=16
LY=16
LZ=32
LZ=16
LT=32
NX=2

View File

@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

View File

@ -1,18 +1,19 @@
#Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../../configure \
../configure \
--enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \

View File

@ -0,0 +1,22 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@ -0,0 +1,16 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
#module load cce/15.0.1
module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

View File

@ -30,14 +30,10 @@ source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
for vol in 32.32.32.64
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 64.64.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
done

View File

@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@ -1,12 +1,25 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
module load emacs
module load PrgEnv-gnu
module load rocm/6.0.0
module load cray-mpich
module load gmp
module load cce/15.0.1
module load rocm/5.3.0
module load cray-fftw
module load craype-accel-amd-gfx90a
#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
#echo spack load c-lime
#spack load c-lime
#module load emacs
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH

View File

@ -0,0 +1,273 @@
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 4
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.309000 s : Testing with full communication
Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.313000 s : Grid Layout
Grid : Message : 0.313000 s : Global lattice size : 32 32 64 64
Grid : Message : 0.319000 s : OpenMP threads : 4
Grid : Message : 0.320000 s : MPI tasks : 1 1 2 2
Grid : Message : 0.129590 s : Initialising 4d RNG
Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.942440 s : Initialising 5d RNG
Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 1 device 0 bus id: 0019:01:00.0
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
Grid : Message : 43.893114 s : Drawing gauge field
Grid : Message : 54.574150 s : Random gauge initialised
Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 54.580032 s : Setting up Cshift based reference
Grid : Message : 60.407451 s : *****************************************************************
Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 60.407470 s : *****************************************************************
Grid : Message : 60.407471 s : *****************************************************************
Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 60.407473 s : * Vectorising space-time by 8
Grid : Message : 60.407475 s : * VComplex size is 64 B
Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 60.407480 s : *****************************************************************
Grid : Message : 61.102178 s : Called warmup
Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
Grid : Message : 62.177198 s : mflop/s = 24721998.6
Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 71.328862 s : ----------------------------------------------------------------
Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 71.328885 s : ----------------------------------------------------------------
Grid : Message : 71.328886 s : Called DwDag
Grid : Message : 71.328887 s : norm dag result 4.12810493
Grid : Message : 71.329493 s : norm dag ref 4.12810493
Grid : Message : 71.331967 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 71.803650 s : src_e0.500003185
Grid : Message : 71.819727 s : src_o0.499996882
Grid : Message : 71.821991 s : *********************************************************
Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 71.821995 s : * Vectorising space-time by 8
Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.822003 s : *********************************************************
Grid : Message : 72.377054 s : Deo mflop/s = 24065467
Grid : Message : 72.377071 s : Deo mflop/s per rank 6016366.75
Grid : Message : 72.377074 s : Deo mflop/s per node 24065467
Grid : Message : 72.624877 s : r_e2.06377678
Grid : Message : 72.625198 s : r_o2.06381058
Grid : Message : 72.625507 s : res4.12758736
Grid : Message : 73.759140 s : norm diff 0
Grid : Message : 73.868204 s : norm diff even 0
Grid : Message : 73.907201 s : norm diff odd 0
Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414582 s : Testing without internode communication
Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414586 s : Grid Layout
Grid : Message : 74.414586 s : Global lattice size : 32 32 64 64
Grid : Message : 74.414594 s : OpenMP threads : 4
Grid : Message : 74.414595 s : MPI tasks : 1 1 2 2
Grid : Message : 74.679364 s : Initialising 4d RNG
Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 74.759525 s : Initialising 5d RNG
Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 119.252016 s : Drawing gauge field
Grid : Message : 129.919846 s : Random gauge initialised
Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 129.923611 s : Setting up Cshift based reference
Grid : Message : 135.522878 s : *****************************************************************
Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 135.522901 s : * Vectorising space-time by 8
Grid : Message : 135.522903 s : * VComplex size is 64 B
Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 135.522908 s : *****************************************************************
Grid : Message : 136.151202 s : Called warmup
Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
Grid : Message : 137.224748 s : mflop/s = 24755806
Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
Grid : Message : 137.224753 s : mflop/s per node = 24755806
Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 146.451686 s : ----------------------------------------------------------------
Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 146.451710 s : ----------------------------------------------------------------
Grid : Message : 146.451712 s : Called DwDag
Grid : Message : 146.451714 s : norm dag result 4.12810493
Grid : Message : 146.452323 s : norm dag ref 4.12810493
Grid : Message : 146.454799 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 146.940894 s : src_e0.500003185
Grid : Message : 146.953676 s : src_o0.499996882
Grid : Message : 146.955927 s : *********************************************************
Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 146.955932 s : * Vectorising space-time by 8
Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 146.955941 s : *********************************************************
Grid : Message : 147.511975 s : Deo mflop/s = 24036256.5
Grid : Message : 147.511989 s : Deo mflop/s per rank 6009064.13
Grid : Message : 147.511991 s : Deo mflop/s per node 24036256.5
Grid : Message : 147.522100 s : r_e2.06377678
Grid : Message : 147.522433 s : r_o2.06381058
Grid : Message : 147.522745 s : res4.12758736
Grid : Message : 148.229848 s : norm diff 0
Grid : Message : 149.233474 s : norm diff even 0
Grid : Message : 149.235815 s : norm diff odd 0
Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960990 s : Testing without intranode communication
Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960995 s : Grid Layout
Grid : Message : 149.960995 s : Global lattice size : 32 32 64 64
Grid : Message : 149.961003 s : OpenMP threads : 4
Grid : Message : 149.961004 s : MPI tasks : 1 1 2 2
Grid : Message : 150.155810 s : Initialising 4d RNG
Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 150.973420 s : Initialising 5d RNG
Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 193.933765 s : Drawing gauge field
Grid : Message : 204.611551 s : Random gauge initialised
Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 204.615265 s : Setting up Cshift based reference
Grid : Message : 210.117788 s : *****************************************************************
Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 210.117809 s : *****************************************************************
Grid : Message : 210.117810 s : *****************************************************************
Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 210.117813 s : * Vectorising space-time by 8
Grid : Message : 210.117814 s : * VComplex size is 64 B
Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 210.117819 s : *****************************************************************
Grid : Message : 210.714641 s : Called warmup
Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
Grid : Message : 211.892252 s : mflop/s = 22568003.2
Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 220.751375 s : ----------------------------------------------------------------
Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 220.751409 s : ----------------------------------------------------------------
Grid : Message : 220.751411 s : Called DwDag
Grid : Message : 220.751412 s : norm dag result 4.12810493
Grid : Message : 220.753307 s : norm dag ref 4.12810493
Grid : Message : 220.755796 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 221.697800 s : src_e0.500003185
Grid : Message : 221.890920 s : src_o0.499996882
Grid : Message : 221.913430 s : *********************************************************
Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 221.913480 s : * Vectorising space-time by 8
Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 221.913550 s : *********************************************************
Grid : Message : 221.645213 s : Deo mflop/s = 24114032
Grid : Message : 221.645228 s : Deo mflop/s per rank 6028508.01
Grid : Message : 221.645231 s : Deo mflop/s per node 24114032
Grid : Message : 221.656021 s : r_e2.06377678
Grid : Message : 221.656389 s : r_o2.06381058
Grid : Message : 221.656698 s : res4.12758736
Grid : Message : 222.110075 s : norm diff 0
Grid : Message : 222.857692 s : norm diff even 0
Grid : Message : 222.875763 s : norm diff odd 0
Grid : Message : 223.598127 s : *******************************************
Grid : Message : 223.598145 s : ******* Grid Finalize ******
Grid : Message : 223.598146 s : *******************************************

View File

@ -0,0 +1,286 @@
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.838000 s : Testing with full communication
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.840000 s : Grid Layout
Grid : Message : 0.840000 s : Global lattice size : 64 64 64 64
Grid : Message : 0.846000 s : OpenMP threads : 4
Grid : Message : 0.846000 s : MPI tasks : 2 2 2 2
Grid : Message : 0.165970 s : Initialising 4d RNG
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.960410 s : Initialising 5d RNG
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
local rank 1 device 0 bus id: 0019:01:00.0
Grid : Message : 44.657270 s : Drawing gauge field
Grid : Message : 55.247733 s : Random gauge initialised
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 55.253053 s : Setting up Cshift based reference
Grid : Message : 62.191747 s : *****************************************************************
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 62.191768 s : *****************************************************************
Grid : Message : 62.191769 s : *****************************************************************
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 62.191769 s : * Vectorising space-time by 8
Grid : Message : 62.191770 s : * VComplex size is 64 B
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 62.191772 s : *****************************************************************
Grid : Message : 62.857568 s : Called warmup
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
Grid : Message : 65.582120 s : mflop/s = 48306525
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 75.122153 s : ----------------------------------------------------------------
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 75.122167 s : ----------------------------------------------------------------
Grid : Message : 75.122167 s : Called DwDag
Grid : Message : 75.122167 s : norm dag result 4.12801829
Grid : Message : 75.123295 s : norm dag ref 4.12801829
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 75.605683 s : src_e0.500004005
Grid : Message : 75.617824 s : src_o0.499996067
Grid : Message : 75.620089 s : *********************************************************
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 75.620093 s : * Vectorising space-time by 8
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 75.620096 s : *********************************************************
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
Grid : Message : 76.749317 s : r_e2.06443136
Grid : Message : 76.749652 s : r_o2.06378451
Grid : Message : 76.749955 s : res4.12821587
Grid : Message : 77.198827 s : norm diff 0
Grid : Message : 77.981760 s : norm diff even 0
Grid : Message : 78.455900 s : norm diff odd 0
Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539337 s : Testing without internode communication
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539339 s : Grid Layout
Grid : Message : 78.539339 s : Global lattice size : 64 64 64 64
Grid : Message : 78.539347 s : OpenMP threads : 4
Grid : Message : 78.539348 s : MPI tasks : 2 2 2 2
Grid : Message : 78.798501 s : Initialising 4d RNG
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 78.879916 s : Initialising 5d RNG
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 124.586264 s : Drawing gauge field
Grid : Message : 135.338090 s : Random gauge initialised
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 135.341266 s : Setting up Cshift based reference
Grid : Message : 142.604280 s : *****************************************************************
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 142.604460 s : *****************************************************************
Grid : Message : 142.604470 s : *****************************************************************
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 142.604480 s : * Vectorising space-time by 8
Grid : Message : 142.604500 s : * VComplex size is 64 B
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 142.604520 s : *****************************************************************
Grid : Message : 142.686034 s : Called warmup
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
Grid : Message : 144.868559 s : mflop/s = 48706194.1
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 153.622978 s : ----------------------------------------------------------------
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 153.622995 s : ----------------------------------------------------------------
Grid : Message : 153.622995 s : Called DwDag
Grid : Message : 153.622996 s : norm dag result 4.12801829
Grid : Message : 153.623604 s : norm dag ref 4.12801829
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 154.148319 s : src_e0.500004005
Grid : Message : 154.151454 s : src_o0.499996067
Grid : Message : 154.153722 s : *********************************************************
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 154.153725 s : * Vectorising space-time by 8
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 154.153728 s : *********************************************************
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
Grid : Message : 155.217204 s : r_e2.06443136
Grid : Message : 155.217550 s : r_o2.06378451
Grid : Message : 155.217869 s : res4.12821587
Grid : Message : 155.673744 s : norm diff 0
Grid : Message : 156.463329 s : norm diff even 0
Grid : Message : 156.878866 s : norm diff odd 0
Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620764 s : Testing without intranode communication
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620766 s : Grid Layout
Grid : Message : 157.620766 s : Global lattice size : 64 64 64 64
Grid : Message : 157.620773 s : OpenMP threads : 4
Grid : Message : 157.620774 s : MPI tasks : 2 2 2 2
Grid : Message : 157.671479 s : Initialising 4d RNG
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 157.755651 s : Initialising 5d RNG
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 202.465158 s : Drawing gauge field
Grid : Message : 213.214546 s : Random gauge initialised
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 213.217711 s : Setting up Cshift based reference
Grid : Message : 219.662772 s : *****************************************************************
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 219.662787 s : *****************************************************************
Grid : Message : 219.662788 s : *****************************************************************
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 219.662789 s : * Vectorising space-time by 8
Grid : Message : 219.662790 s : * VComplex size is 64 B
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 219.662791 s : *****************************************************************
Grid : Message : 220.425592 s : Called warmup
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
Grid : Message : 222.536267 s : mflop/s = 50365105.5
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 232.135901 s : ----------------------------------------------------------------
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 232.135916 s : ----------------------------------------------------------------
Grid : Message : 232.135917 s : Called DwDag
Grid : Message : 232.135918 s : norm dag result 4.12801829
Grid : Message : 232.151938 s : norm dag ref 4.12801829
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 232.630529 s : src_e0.500004005
Grid : Message : 232.643197 s : src_o0.499996067
Grid : Message : 232.645527 s : *********************************************************
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 232.645532 s : * Vectorising space-time by 8
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 232.645535 s : *********************************************************
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
Grid : Message : 233.791552 s : r_e2.06443136
Grid : Message : 233.791899 s : r_o2.06378451
Grid : Message : 233.792204 s : res4.12821587
Grid : Message : 234.230783 s : norm diff 0
Grid : Message : 235.162780 s : norm diff even 0
Grid : Message : 235.291950 s : norm diff odd 0
Grid : Message : 235.765411 s : *******************************************
Grid : Message : 235.765424 s : ******* Grid Finalize ******
Grid : Message : 235.765425 s : *******************************************

View File

@ -0,0 +1,57 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > dwf.1node.perf
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.1node.perf

View File

@ -0,0 +1,57 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.4node.perf


@ -0,0 +1,16 @@
export CXX=nvcc
export OPENMPI=/p/software/default/stages/2025/software/OpenMPI/5.0.5-NVHPC-24.9-CUDA-12/
export LDFLAGS="-cudart shared -L${OPENMPI}/lib"
export CXXFLAGS="-ccbin clang++ -gencode arch=compute_90,code=sm_90 -std=c++17 -cudart shared -lcublas -lmpi -I${OPENMPI}/include"
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$CLIME \
--disable-gparity \
--disable-fermion-reps \
--disable-unified


@ -0,0 +1,9 @@
CLIME=$HOME/install/
module load Clang
module load CUDA
module load FFTW
module load OpenSSL
module load MPFR
module load NVHPC
module load UCX
module load OpenMPI

systems/WorkArounds.txt

@ -0,0 +1,206 @@
The purpose of this file is to collate all non-obvious known magic shell variables
and compiler flags required for either correctness or performance on various systems.
A repository of work-arounds.
Contents:
1. Interconnect + MPI
2. Compilation
3. Profiling
************************
* 1. INTERCONNECT + MPI
************************
--------------------------------------------------------------------
MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
--------------------------------------------------------------------
export OMPI_MCA_io=romio321
--------------------------------------
ROMIO fails with > 2GB per-node reads (32-bit issue)
--------------------------------------
Use a later MPICH:
https://github.com/paboyle/Grid/issues/381
https://github.com/pmodels/mpich/commit/3a479ab0
--------------------------------------------------------------------
Slingshot (Frontier and Perlmutter): libfabric slow-down
and physical memory fragmentation
--------------------------------------------------------------------
export FI_MR_CACHE_MONITOR=disabled
or
export FI_MR_CACHE_MONITOR=kdreg2
--------------------------------------------------------------------
Perlmutter
--------------------------------------------------------------------
export MPICH_RDMA_ENABLED_CUDA=1
export MPICH_GPU_IPC_ENABLED=1
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
export MPICH_GPU_NO_ASYNC_MEMCPY=0
--------------------------------------------------------------------
Frontier/LumiG
--------------------------------------------------------------------
Hiding ROCR_VISIBLE_DEVICES triggers the SDMA engines to be used for GPU-GPU transfers:
cat << EOF > select_gpu
#!/bin/bash
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
export GPU_MAP=(0 1 2 3 7 6 5 4)
export NUMA_MAP=(3 3 1 1 2 2 0 0)
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
export HIP_VISIBLE_DEVICES=\$GPU
unset ROCR_VISIBLE_DEVICES
echo RANK \$SLURM_LOCALID using GPU \$GPU
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./select_gpu
srun ./select_gpu BINARY
--------------------------------------------------------------------
Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
--------------------------------------------------------------------
Mellanox + A100 correctness (Tursa, Booster, Leonardo)
--------------------------------------------------------------------
export UCX_MEMTYPE_CACHE=n
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
https://github.com/pmodels/mpich/issues/7302
--enable-cuda-aware-mpi=no
--enable-unified=no
These select Grid's internal D-H-H-D pipeline mode, so device memory is never passed to MPI.
Do not use SVM
Ideally use MPICH with fix to issue 7302:
https://github.com/pmodels/mpich/pull/7312
Ideally:
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
Alternatives:
export MPIR_CVAR_NOLOCAL=1
export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
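A minimal job-script sketch gathering the options above in one place (module loads and the remaining configure options are site specific and omitted):
# Configure Grid so that no device memory is handed to MPI:
#   ../../configure ... --enable-cuda-aware-mpi=no --enable-unified=no
# Preferred, with an MPICH containing the fix for issue 7302:
export MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
# Alternatives if that MPICH is not available:
# export MPIR_CVAR_NOLOCAL=1
# export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000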
--------------------------------------------------------------------
MPICH/Aurora/PVC correctness and performance
--------------------------------------------------------------------
Broken:
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
This gives good performance without requiring
--enable-cuda-aware-mpi=no
but there is an open issue reported by James Osborn:
https://github.com/pmodels/mpich/issues/7139
Possibly resolved, but it is unclear whether the fix is in the installed software yet.
************************
* 2. COMPILATION
************************
--------------------------------------------------------------------
G++ compiler breakage / graveyard
--------------------------------------------------------------------
9.3.0, 10.3.1,
https://github.com/paboyle/Grid/issues/290
https://github.com/paboyle/Grid/issues/264
Working (-) Broken (X):
4.9.0 -
4.9.1 -
5.1.0 X
5.2.0 X
5.3.0 X
5.4.0 X
6.1.0 X
6.2.0 X
6.3.0 -
7.1.0 -
8.0.0 (HEAD) -
https://github.com/paboyle/Grid/issues/100
--------------------------------------------------------------------
AMD GPU nodes :
--------------------------------------------------------------------
Multiple ROCm versions are broken; use 5.3.0.
The breakage manifests as wrong results in fp32.
https://github.com/paboyle/Grid/issues/464
--------------------------------------------------------------------
Aurora/PVC
--------------------------------------------------------------------
SYCL ahead-of-time compilation (fixes rare runtime JIT errors and gives a faster runtime, PB)
SYCL slow link and relocatable code issues (Christoph Lehner)
The large register file option is required for good fp64 performance:
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"
--------------------------------------------------------------------
Aurora/PVC useful extra options
--------------------------------------------------------------------
Host only sanitizer:
-Xarch_host -fsanitize=leak
-Xarch_host -fsanitize=address
Deterministic MPI reduction:
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
************************
* 3. PROFILING (VISUAL PROFILE TOOLS)
************************
--------------------------------------------------------------------
Frontier/rocprof
--------------------------------------------------------------------
--------------------------------------------------------------------
Aurora/unitrace
--------------------------------------------------------------------
--------------------------------------------------------------------
Tursa/nsight-sys
--------------------------------------------------------------------


@ -1,2 +1,14 @@
CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=c++-13 MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
CXX=mpicxx ../../configure \
--enable-simd=GEN \
--enable-comms=mpi-auto \
--enable-Sp=yes \
--enable-unified=yes \
--prefix /Users/peterboyle/QCD/vtk/Grid/install \
--with-lime=$CLIME \
--with-openssl=$OPENSSL \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-debug


@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=1
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#for vol in 24.24.24.24 32.32.32.32 48.48.48.96
for vol in 48.48.48.96
do
srun -N1 -n1 ./benchmarks/Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid $vol --dslash-asm --shm 8192 > $vol.1node.thr$thr
done
#srun -N1 -n1 ./benchmarks/Benchmark_usqcd --mpi 1.1.1.1 --grid $vol > usqcd.1node.thr$thr
done


@ -0,0 +1,36 @@
#!/bin/bash
#SBATCH --partition lqcd
#SBATCH --time=00:50:00
#SBATCH -A lqcdtest
#SBATCH -q lqcd
#SBATCH --exclusive
#SBATCH --nodes=2
#SBATCH -w genoahost001,genoahost003,genoahost050,genoahost054
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=64
#SBATCH --qos lqcd
source sourceme.sh
export PLACES=(1:16:4 1:32:2 0:64:1);
export THR=(16 32 64)
nodes=2
mpi=1.1.1.2
for t in 2
do
export OMP_NUM_THREADS=${THR[$t]}
export OMP_PLACES=${PLACES[$t]}
export thr=${THR[$t]}
#srun -N$nodes -n$nodes ./benchmarks/Benchmark_usqcd --mpi $mpi --grid 32.32.32.32 > usqcd.n$nodes.thr$thr
for vol in 64.64.64.128
do
srun -N$nodes -n$nodes ./benchmarks/Benchmark_dwf_fp32 --mpi $mpi --grid $vol --dslash-asm --comms-overlap --shm 8192 > $vol.n$nodes.overlap.thr$thr
done
done


@ -0,0 +1,29 @@
spack load c-lime
spack load fftw
spack load hdf5+cxx
export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
../../configure \
--enable-comms=mpi-auto \
--enable-unified=yes \
--enable-shm=shmopen \
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--disable-fermion-reps \
--disable-gparity \
CXX=clang++ \
MPICXX=mpicxx \
LIBS=-llime \
LDFLAGS=-L$CLIME/lib/ \
CXXFLAGS="-std=c++17 -fPIE"


@ -0,0 +1,5 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi/4.1.8
spack load c-lime


@ -0,0 +1,17 @@
../../src/Grid/configure \
--prefix /home/pab/NPR/install \
--enable-comms=mpi-auto \
--enable-simd=AVX2 \
--enable-shm=none \
--enable-debug \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-gparity \
--disable-fermion-reps \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17 "


@ -0,0 +1,28 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@12
spack load autoconf%clang@12.0.1
spack load automake%clang@12.0.1
spack load c-lime%clang@12.0.1
spack load fftw%clang@12.0.1
spack load gmp%clang@12.0.1
spack load mpfr%clang@12.0.1
spack load openmpi%clang@12.0.1
spack load openssl%clang@12.0.1
spack load hdf5+cxx%clang@12.0.1
spack load cmake%clang@12.0.1
export FFTW=`spack find --paths fftw%clang@12.0.1 | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx%clang@12.0.1 | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime%clang@12.0.1 | grep ^c-lime | awk '{print $2}' `
export MPFR=`spack find --paths mpfr%clang@12.0.1 | grep ^mpfr | awk '{print $2}' `
export LLVM=`spack find --paths llvm@12 | grep ^llvm | awk '{print $2}' `
export OPENSSL=`spack find --paths openssl%clang@12.0.1 | grep openssl | awk '{print $2}' `
export GMP=`spack find --paths gmp%clang@12.0.1 | grep ^gmp | awk '{print $2}' `
export TCLAP=`spack find --paths tclap%clang@12.0.1 | grep ^tclap | awk '{print $2}' `
export LD_LIBRARY_PATH=${TCLAP}/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$MPFR/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$GMP/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$FFTW/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$LLVM/lib/x86_64-unknown-linux-gnu/:$LD_LIBRARY_PATH
ulimit -s 81920


@ -0,0 +1,19 @@
cd
git clone https://github.com/spack/spack.git
source $HOME/spack/share/spack/setup-env.sh
spack install llvm@12
spack install autoconf%clang@12.0.1
spack install automake%clang@12.0.1
spack install c-lime%clang@12.0.1
spack install fftw%clang@12.0.1
spack install gmp%clang@12.0.1
spack install mpfr%clang@12.0.1
spack install openmpi%clang@12.0.1
spack install openssl%clang@12.0.1
spack install hdf5+cxx%clang@12.0.1
spack install cmake%clang@12.0.1
spack install tclap%clang@12.0.1
spack install emacs%clang@12.0.1


@ -62,7 +62,7 @@ int VerifyOnDevice(const FermionField &res, FermionField &ref)
if (((random()&0xF)==0)&&injection) {
uint64_t sF = random()%(NN);
int lane=0;
printf("Error injection site %ld on rank %d\n",sF,res.Grid()->ThisRank());
printf("Error injection site %ld on rank %d\n",(long)sF,res.Grid()->ThisRank());
auto vv = acceleratorGet(res_v[sF]);
double *dd = (double *)&vv;
*dd=M_PI;


@ -195,8 +195,8 @@ int main (int argc, char ** argv)
int Nk=nrhs;
int Nm=Nk*3;
int Nk=36;
int Nm=144;
// int Nk=36;
// int Nm=144;
int Nstop=Nk;
int Nconv_test_interval=1;


@ -47,20 +47,20 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl;
// std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
}
void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl;
// std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(in,tmp);
_Mat.Mdag(tmp,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
// _Mat.M(in,tmp);
// _PV.Mdag(tmp,out);
@ -83,14 +83,14 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl;
// std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl;
// std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(tmp,out);
_Mat.Mdag(in,tmp);
@ -98,7 +98,7 @@ public:
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
Op(in,tmp);
AdjOp(tmp,out);


@ -54,6 +54,7 @@ const RealD M5 = 1.8;
int main(int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
@ -106,6 +107,6 @@ int main(int argc, char** argv)
Meofa.refresh(Umu,sRNG, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
#endif
return 0;
}


@ -56,6 +56,7 @@ const RealD M5 = 1.8;
int main(int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
@ -106,6 +107,6 @@ int main(int argc, char** argv)
Meofa.refresh(Umu, sRNG, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
#endif
return 0;
}


@ -33,6 +33,7 @@ using namespace std;
using namespace Grid;
// This is to optimize the SIMD
/*
template<class vobj> void gpermute(vobj & inout,int perm){
vobj tmp=inout;
if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;}
@ -40,7 +41,7 @@ template<class vobj> void gpermute(vobj & inout,int perm){
if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;}
if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;}
}
*/
int main (int argc, char ** argv)
{


@ -153,7 +153,7 @@ public:
t=usecond();
{
autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View();
auto gStencil_v = gStencil.View(AcceleratorRead);
autoView( Ug_mu_v , Ug_mu, AcceleratorRead);
autoView( Ug_nu_v , Ug_nu, AcceleratorRead);
@ -389,7 +389,7 @@ public:
GeneralLocalStencil gStencil(ggrid,shifts);
{
autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View();
auto gStencil_v = gStencil.View(AcceleratorRead);
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
size_t vsize = Nd*sizeof(GaugeViewType);


@ -83,6 +83,7 @@ std::vector<RealD> jack_stats(const std::vector<RealD>& data)
int main(int argc, char **argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
// Initialize spacetime grid
@ -206,4 +207,5 @@ int main(int argc, char **argv)
std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl;
Grid_finalize();
#endif
}


@ -85,6 +85,7 @@ std::vector<RealD> jack_stats(const std::vector<RealD>& data)
int main(int argc, char **argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
// Initialize spacetime grid
@ -215,4 +216,5 @@ int main(int argc, char **argv)
std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl;
Grid_finalize();
#endif
}


@ -35,6 +35,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
@ -244,4 +245,5 @@ int main (int argc, char ** argv)
std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize();
#endif
}


@ -38,6 +38,7 @@ typedef typename FermionAction::FermionField FermionField;
int main (int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
Coordinate latt_size = GridDefaultLatt();
@ -173,4 +174,5 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize();
#endif
}


@ -35,6 +35,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
@ -204,4 +205,5 @@ int main (int argc, char ** argv)
assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize();
#endif
}


@ -32,6 +32,7 @@ using namespace std;
using namespace Grid;
//Here we test the G-parity action and force between the 1f (doubled-lattice) and 2f approaches
#ifdef ENABLE_GPARITY
void copyConjGauge(LatticeGaugeFieldD &Umu_1f, const LatticeGaugeFieldD &Umu_2f, const int nu){
@ -444,3 +445,7 @@ int main (int argc, char ** argv)
assert(0);
}
}
#else
int main (int argc, char ** argv){};
#endif


@ -32,6 +32,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
@ -155,4 +156,5 @@ int main (int argc, char ** argv)
std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize();
#endif
}


@ -30,9 +30,10 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Grid.h>
#ifdef ENABLE_GPARITY
using namespace std;
using namespace Grid;
;
typedef GparityWilsonImplD FermionImplPolicyD;
typedef GparityMobiusEOFAFermionD FermionActionD;
@ -231,3 +232,7 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize();
}
#else
int main(int argc,char ** argv) { return 0;};
#endif


@ -31,14 +31,14 @@ See the full license in the file "LICENSE" in the top level distribution directo
using namespace std;
using namespace Grid;
;
typedef GparityWilsonImplR FermionImplPolicy;
typedef GparityMobiusEOFAFermionD FermionAction;
typedef typename FermionAction::FermionField FermionField;
int main (int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
Coordinate latt_size = GridDefaultLatt();
@ -171,4 +171,5 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize();
#endif
}


@ -30,7 +30,7 @@
using namespace Grid;
#ifdef ENABLE_GPARITY
template<typename FermionField2f, typename FermionField1f>
void copy2fTo1fFermionField(FermionField1f &out, const FermionField2f &in, int gpdir){
@ -255,3 +255,6 @@ int main(int argc, char **argv) {
} // main
#else
int main(int argc, char **argv){};
#endif


@ -30,6 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
int main(int argc, char **argv) {
#ifdef ENABLE_GPARITY
using namespace Grid;
;
@ -139,7 +140,7 @@ int main(int argc, char **argv) {
Grid_finalize();
#endif
} // main


@ -55,13 +55,13 @@ namespace Grid{
};
struct SmearingParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(SmearingParameters,
struct HmcSmearingParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(HmcSmearingParameters,
double, rho,
Integer, Nsmear)
template <class ReaderClass >
SmearingParameters(Reader<ReaderClass>& Reader){
HmcSmearingParameters(Reader<ReaderClass>& Reader){
read(Reader, "StoutSmearing", *this);
}
@ -213,7 +213,7 @@ int main(int argc, char **argv) {
// Reset performance counters
if (ApplySmearing){
SmearingParameters SmPar(Reader);
HmcSmearingParameters SmPar(Reader);
//double rho = 0.1; // smearing parameter
//int Nsmear = 3; // number of smearing levels
Smear_Stout<HMCWrapper::ImplPolicy> Stout(SmPar.rho);


@ -0,0 +1,14 @@
<?xml version="1.0"?>
<grid>
<LanczosParameters>
<mass>0.00107</mass>
<M5>1.8</M5>
<Ls>48</Ls>
<Nstop>10</Nstop>
<Nk>15</Nk>
<Np>85</Np>
<ChebyLow>0.003</ChebyLow>
<ChebyHigh>60</ChebyHigh>
<ChebyOrder>201</ChebyOrder>
</LanczosParameters>
</grid>


@ -35,6 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
#ifdef ENABLE_GPARITY
using namespace std;
using namespace Grid;
@ -378,7 +380,8 @@ void runTest(const Options &opt){
//Note: because we rely upon physical properties we must use a "real" gauge configuration
int main (int argc, char ** argv) {
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
GridLogIRL.TimingMode(1);
@ -482,4 +485,8 @@ int main (int argc, char ** argv) {
Grid_finalize();
}
#else
int main(int argc, char **argv){};
#endif


@ -0,0 +1,428 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_G5R5.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
From Duo and Bob's Chirality study
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
//typedef WilsonFermionD FermionOp;
typedef DomainWallFermionD FermionOp;
typedef typename DomainWallFermionD::FermionField FermionField;
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
}
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, M5 ,
Integer, Ls,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
int Ls=16;
RealD M5=1.8;
RealD mass = 0.01;
mass=LanParams.mass;
Ls=LanParams.Ls;
M5=LanParams.M5;
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
// GridCartesian* FGrid = UGrid;
// GridRedBlackCartesian* FrbGrid = UrbGrid;
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nk = 20;
int Nstop = Nk;
int Np = 80;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
//while ( mass > - 5.0){
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
// Gamma5R5HermitianLinearOperator
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (G5R5Herm);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
std::cout << " #evecs " << evec.size() << std::endl;
std::cout << " Nconv " << Nconv << std::endl;
std::cout << " Nm " << Nm << std::endl;
if ( Nconv > evec.size() ) Nconv = evec.size();
#if 0
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
// RealD eMe,eMMe;
for (int i = 0; i < Nstop ; i++) {
// tmp = g5*evec[i];
dot = innerProduct(evec[i],evec[i]);
// G5R5(tmp,evec[i]);
G5R5Herm.HermOpAndNorm(evec[i],tmp,eMe,eMMe);
std::cout <<"Norm "<<M5<<" "<< mass << " : " << i << " " << real(dot) << " " << imag(dot) << " "<< eMe << " " <<eMMe<< std::endl ;
for (int j = 0; j < Nstop ; j++) {
dot = innerProduct(tmp,evec[j]);
std::cout <<"G5R5 "<<M5<<" "<< mass << " : " << i << " " <<j<<" " << real(dot) << " " << imag(dot) << std::endl ;
}
}
// src = evec[0]+evec[1]+evec[2];
// mass += -0.1;
#endif
//**********************************************************************
//orthogonalization
//calculate the matrix
cout << "Start orthogonalization " << endl;
cout << "calculate the matrix element" << endl;
vector<LatticeFermion> G5R5Mevec(Nconv, FGrid);
vector<LatticeFermion> finalevec(Nconv, FGrid);
vector<RealD> eMe(Nconv), eMMe(Nconv);
for(int i = 0; i < Nconv; i++){
cout << "calculate the matrix element["<<i<<"]" << endl;
G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
cout << eMe << endl;
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl;
vector<vector<ComplexD>> VevecG5R5Mevec(Nconv);
Eigen::MatrixXcd evecG5R5Mevec = Eigen::MatrixXcd::Zero(Nconv, Nconv);
for(int i = 0; i < Nconv; i++){
VevecG5R5Mevec[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
VevecG5R5Mevec[i][j] = innerProduct(evec[i], G5R5Mevec[j]);
evecG5R5Mevec(i, j) = VevecG5R5Mevec[i][j];
}
}
//calculate eigenvector
cout << "Eigen solver" << endl;
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(evecG5R5Mevec);
vector<RealD> eigeneval(Nconv);
vector<vector<ComplexD>> eigenevec(Nconv);
for(int i = 0; i < Nconv; i++){
eigeneval[i] = eigensolver.eigenvalues()[i];
eigenevec[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
eigenevec[i][j] = eigensolver.eigenvectors()(i, j);
}
}
//rotation
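// Rotate the Lanczos basis by the eigenvectors of the projected matrix:
//   finalevec[i] = sum_j U(j,i) * evec[j],
// where U diagonalises <evec_i| G5R5 M |evec_j> computed above, giving
// approximate eigenvectors of the G5R5-Hermitian operator in this subspace.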
cout << "Do rotation" << endl;
for(int i = 0; i < Nconv; i++){
finalevec[i] = finalevec[i] - finalevec[i];
for(int j = 0; j < Nconv; j++){
finalevec[i] = eigenevec[j][i]*evec[j] + finalevec[i];
}
}
//normalize again;
for(int i = 0; i < Nconv; i++){
RealD tmp_RealD = norm2(finalevec[i]);
tmp_RealD = 1./pow(tmp_RealD, 0.5);
finalevec[i] = finalevec[i]*tmp_RealD;
}
//check
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
//**********************************************************************
//sort the eigenvectors
vector<LatticeFermion> finalevec_copy(Nconv, FGrid);
for(int i = 0; i < Nconv; i++){
finalevec_copy[i] = finalevec[i];
}
vector<RealD> eMe_copy(eMe);
for(int i = 0; i < Nconv; i++){
eMe[i] = fabs(eMe[i]);
eMe_copy[i] = eMe[i];
}
sort(eMe_copy.begin(), eMe_copy.end());
for(int i = 0; i < Nconv; i++){
for(int j = 0; j < Nconv; j++){
if(eMe[j] == eMe_copy[i]){
finalevec[i] = finalevec_copy[j];
}
}
}
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
cout << eMe << endl;
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl;
// vector<LatticeFermion> finalevec(Nconv, FGrid);
// temporary, until doing rotation
// for(int i = 0; i < Nconv; i++)
// finalevec[i]=evec[i];
//**********************************************************************
//calculate chirality matrix
vector<LatticeFermion> G5evec(Nconv, FGrid);
vector<vector<ComplexD>> chiral_matrix(Nconv);
vector<vector<RealD>> chiral_matrix_real(Nconv);
for(int i = 0; i < Nconv; i++){
// G5evec[i] = G5evec[i] - G5evec[i];
G5evec[i] = Zero();
for(int j = 0; j < Ls/2; j++){
axpby_ssp(G5evec[i], 1., finalevec[i], 0., G5evec[i], j, j);
}
for(int j = Ls/2; j < Ls; j++){
axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
}
}
for(int i = 0; i < Nconv; i++){
Ddwf.M(finalevec[i], G5R5Mevec[i]);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|Ddwf|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){
RealD t1,t2;
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], t1, t2);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|G5R5 M|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){
chiral_matrix_real[i].resize(Nconv);
chiral_matrix[i].resize(Nconv);
std::string evfile("./evec_density");
evfile = evfile+"_"+std::to_string(i);
auto evdensity = localInnerProduct(finalevec[i],finalevec[i] );
writeFile(evdensity,evfile);
for(int j = 0; j < Nconv; j++){
chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
std::cout <<" chiral_matrix_real signed "<<i<<" "<<j<<" "<< real(chiral_matrix[i][j]) << std::endl;
chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
if ( chiral_matrix_real[i][j] > 0.8 ) {
auto g5density = localInnerProduct(finalevec[i], G5evec[j]);
std::string chfile("./chiral_density_");
chfile = chfile +std::to_string(i)+"_"+std::to_string(j);
writeFile(g5density,chfile);
}
}
}
for(int i = 0; i < Nconv; i++){
if(chiral_matrix[i][i].real() < 0.){
chiral_matrix_real[i][i] = -1. * chiral_matrix_real[i][i];
}
}
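// Emit a standalone matplotlib script that draws the chirality matrix
// |<i| Gamma5 |j>| as a 3d bar (lego) plot.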
FILE *fp = fopen("lego-plot.py","w"); assert(fp!=NULL);
#define PYTHON_LINE(A) fprintf(fp,A"\n");
PYTHON_LINE("import matplotlib.pyplot as plt");
PYTHON_LINE("import numpy as np");
PYTHON_LINE("");
PYTHON_LINE("fig = plt.figure()");
PYTHON_LINE("ax = fig.add_subplot(projection='3d')");
PYTHON_LINE("");
PYTHON_LINE("x, y = np.random.rand(2, 100) * 4");
fprintf(fp,"hist, xedges, yedges = np.histogram2d(x, y, bins=%d, range=[[0, %d], [0, %d]])\n",Nconv,Nconv-1,Nconv-1);
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays for the anchor positions of the 16 bars");
PYTHON_LINE("xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing=\"ij\")");
PYTHON_LINE("xpos = xpos.ravel()");
PYTHON_LINE("ypos = ypos.ravel()");
PYTHON_LINE("zpos = 0");
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays with the dimensions for the 16 bars.");
PYTHON_LINE("dx = dy = 0.5 * np.ones_like(zpos)");
PYTHON_LINE("dz = np.array([");
for(int i = 0; i < Nconv; i++){
fprintf(fp,"\t[ ");
for(int j = 0; j < Nconv; j++){
fprintf(fp,"%lf ",chiral_matrix_real[i][j]);
if(j<Nconv-1) fprintf(fp,",");
else fprintf(fp," ");
}
fprintf(fp,"]");
if(i<Nconv-1) fprintf(fp,",\n");
else fprintf(fp,"\n");
}
PYTHON_LINE("\t])");
PYTHON_LINE("dz = dz.ravel()");
PYTHON_LINE("ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')");
PYTHON_LINE("plt.show()");
fclose(fp);
Grid_finalize();
}


@ -29,11 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
;
template<typename Action>
struct Setup{};
#ifdef ENABLE_GPARITY
template<>
struct Setup<GparityMobiusFermionF>{
static GparityMobiusFermionF* getAction(LatticeGaugeFieldF &Umu,
@ -47,16 +47,24 @@ struct Setup<GparityMobiusFermionF>{
return new GparityMobiusFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
}
};
#endif
template<>
struct Setup<DomainWallFermionF>{
static DomainWallFermionF* getAction(LatticeGaugeFieldF &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
RealD mass=0.00054;
RealD M5=1.8;
return new DomainWallFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
}
};
template<>
struct Setup<DomainWallFermionD>{
static DomainWallFermionD* getAction(LatticeGaugeField &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
RealD mass=0.00054;
RealD M5=1.8;
return new DomainWallFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
return new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
}
};
@ -168,7 +176,9 @@ int main (int argc, char ** argv)
}
if(action == "GparityMobius"){
#ifdef ENABLE_GPARITY
run<GparityMobiusFermionF>();
#endif
}else if(action == "DWF"){
run<DomainWallFermionF>();
}else if(action == "Mobius"){


@ -555,6 +555,7 @@ int main (int argc, char ** argv) {
double c = (args.mobius_scale - bmc)/2.; // c = 1/2 [ (b+c) - (b-c) ]
if(is_gparity){
#ifdef ENABLE_GPARITY
GparityWilsonImplD::ImplParams Params = setupGparityParams(args.GparityDirs);
readConfiguration<ConjugateGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field
@ -564,7 +565,10 @@ int main (int argc, char ** argv) {
}else if(action_s == "Mobius"){
GparityMobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params);
run(action, config, args);
}
}
#else
assert(0);
#endif
}else{
WilsonImplD::ImplParams Params = setupParams();
readConfiguration<PeriodicGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field


@ -0,0 +1,278 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
#if 0
template<typename Field>
class RationalHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
// OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
RealD _massDen, _massNum;
FunctionHermOp(LinearOperatorBase<Field>& linop, RealD massDen,RealD massNum)
: _Linop(linop) ,_massDen(massDen),_massNum(massNum) {};
void operator()(const Field& in, Field& out) {
// _poly(_Linop,in,out);
}
};
#endif
template<class Matrix,class Field>
class InvG5LinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _num;
RealD _Tol;
Integer _MaxIt;
Gamma g5;
public:
InvG5LinearOperator(Matrix &Mat,RealD num): _Mat(Mat),_num(num), _Tol(1e-12),_MaxIt(10000), g5(Gamma::Algebra::Gamma5) {};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0);
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
assert(0);
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
assert(0);
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
Field tmp(in.Grid());
MdagMLinearOperator<Matrix,Field> denom(_Mat);
ConjugateGradient<Field> CG(_Tol,_MaxIt);
_Mat.M(in,tmp);
tmp += _num*in;
_Mat.Mdag(tmp,out);
CG(denom,out,tmp);
out = g5*tmp;
}
};
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, resid,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 5;
int Nk = 10;
int Np = 90;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
resid=LanParams.resid;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
InvG5LinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator,-2.); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
// InvHermOp<FermionField> Op(WilsonOperator,HermOp);
PlainHermOp<FermionField> Op (HermOp);
// PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
}
Grid_finalize();
}


@ -0,0 +1,211 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 10;
int Nk = 20;
int Np = 80;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
// Chebyshev<FermionField> Cheby(0.5, 60., 31);
// RealD, ChebyLow,
// RealD, ChebyHigh,
// Integer, ChebyOrder)
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
}
Grid_finalize();
}

Some files were not shown because too many files have changed in this diff.