1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-10-16 06:04:43 +01:00

Compare commits

..

130 Commits

Author SHA1 Message Date
Peter Boyle
c7b74db317 Default dimensions fixed 2025-10-09 14:57:22 -04:00
Peter Boyle
0ce201efbe IcosahedralVerted() checks 2025-10-09 13:35:16 -04:00
Peter Boyle
6d8a3d8bb2 Config 2025-10-09 13:30:16 -04:00
Peter Boyle
7dfd207ebb Need to protect pole operations to only take place on IcosahedralVertices mesh 2025-10-08 15:18:31 -04:00
Peter Boyle
3a65a096f2 Nd verbose 2025-10-07 18:49:00 -04:00
Peter Boyle
85b2bd4c93 Beginnings of S2xR 2025-10-07 16:11:06 -04:00
Peter Boyle
35e10a1159 Changes for Nd=3 2025-10-03 12:17:13 -04:00
d418f78352 Making running on Aurora more debuggable 2025-05-23 20:58:16 +00:00
25163998a0 Makes SYCL compiler happy 2025-05-23 20:57:11 +00:00
Peter Boyle
dc546aaa4b Updated config options for BNL cluster 2025-05-13 18:44:47 -04:00
Peter Boyle
5364d580c9 Output chirality, eigenvector density files and python source lego plot 2025-05-13 18:44:47 -04:00
Peter Boyle
2a9a6347e3 Do not require Grid format RNGs and also to the 5Li reporting 2025-05-13 18:44:47 -04:00
Peter Boyle
cfdb56f314 Run measurements at t=0 too 2025-05-13 18:44:46 -04:00
Peter Boyle
b517e88db3 Update README 2025-05-13 16:49:21 -04:00
bb317aba8d Lattice = for sycl 2025-05-13 12:50:58 +00:00
644cc6647e JSON update 2025-05-13 12:50:58 +00:00
72397ce23b SYCL interface change 2025-05-13 12:50:58 +00:00
Peter Boyle
d60a80c098 Fixes and visualisation 2025-04-29 18:04:23 -04:00
Peter Boyle
bb8b6d9d73 Fix 2025-04-29 18:04:04 -04:00
Peter Boyle
677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
Peter Boyle
be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
Peter Boyle
df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
Peter Boyle
21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
Peter Boyle
dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
Peter Boyle
ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGaugeAction deriv
2025-04-24 14:39:10 -04:00
Peter Boyle
c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
Peter Boyle
6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
Peter Boyle
b75809ed61 Update README 2025-04-24 14:27:22 -04:00
Peter Boyle
ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
Peter Boyle
6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
Peter Boyle
233150d93f Bug fix for no accelerator aware MPI, thanks Shuhei for finding it. 2025-04-24 11:40:46 -04:00
Peter Boyle
7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
Chulwoo Jung
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
Chulwoo Jung
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
Christoph Lehner
96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
Christoph Lehner
7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
Peter Boyle
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
Peter Boyle
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
Peter Boyle
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
Peter Boyle
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
Peter Boyle
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
Peter Boyle
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
Peter Boyle
938c47480f Updated compile on frontier.
Unsatisfactory hacks
2025-04-04 18:35:06 -04:00
Peter Boyle
3811d19298 Fence 2025-04-04 18:35:06 -04:00
Peter Boyle
83a3ab6b6f Barrier -- not sure 100% this was needed 2025-04-04 18:35:05 -04:00
Peter Boyle
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
Peter Boyle
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
Peter Boyle
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
Peter Boyle
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
Peter Boyle
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
Peter Boyle
9eae8fca5d Size output 2025-04-04 18:35:05 -04:00
Peter Boyle
882a217074 Example of Useful prerequisite installs with spack 2025-03-26 11:28:53 -04:00
Mashy Green
e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
Mashy Green
d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
Peter Boyle
199818bd6c Merge pull request #475 from lehner/feature-aurora
Sync with GPT on Aurora
2025-03-13 08:55:55 -04:00
Christoph Lehner
fe66c7ca30 verbosity 2025-03-13 12:49:36 +00:00
Christoph Lehner
e9177e4af3 Blas compatibility 2025-03-13 08:48:23 +00:00
Christoph Lehner
d15a6c5933 Merge branch 'develop' of https://github.com/paboyle/Grid into feature-aurora 2025-03-13 07:29:55 +00:00
25ab9325e7 Use hostVector but remove construct resize 2025-03-11 15:02:32 +00:00
19f9378b98 Should work on Aurora now 2025-03-11 13:50:43 +00:00
Mashy Green
785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
Mashy Green
1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
Mashy Green
0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
Christoph Lehner
9ffd1ed4ce Merged 2025-03-08 15:30:08 +00:00
Peter Boyle
3d014864e2 Making LLVM happy 2025-03-06 14:19:25 -05:00
1d22841811 Working on aurora, GPT issue turned up is fixed 2025-03-06 03:20:18 +00:00
Peter Boyle
a1cdda833f Update WorkArounds.txt 2025-03-05 14:04:23 -05:00
Peter Boyle
ad6db92690 Update WorkArounds.txt 2025-03-05 14:00:26 -05:00
Peter Boyle
e8ff9d8e50 Update WorkArounds.txt 2025-03-05 14:00:04 -05:00
Peter Boyle
795769c636 Update WorkArounds.txt 2025-03-05 13:50:41 -05:00
Peter Boyle
267a39d943 Update WorkArounds.txt 2025-03-05 13:49:43 -05:00
Peter Boyle
3624bd3d22 Update WorkArounds.txt 2025-03-05 13:45:09 -05:00
Peter Boyle
bc12dbbb38 Update WorkArounds.txt 2025-03-05 12:48:56 -05:00
Peter Boyle
eb8a008a8f Create WorkArounds.txt 2025-03-05 12:41:59 -05:00
c4d9aa1a21 Config command that makes GPT happier 2025-02-27 20:12:49 +00:00
6ae809ed40 Print not liked on GPT compile 2025-02-27 20:12:49 +00:00
Peter Boyle
311e2aab3f Update Accelerator.h 2025-02-26 11:42:52 -05:00
438dfbdb83 Only throw if there is a pending list entry in CommsComplete 2025-02-25 16:57:27 +00:00
b2ce760cf4 Verbose issue with GPT 2025-02-25 16:55:23 +00:00
Muhammad Asif
b1ba209696 Latest upstream with np-su3 patch and modified Sp_WilsonFunfFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging no-su3 patch
2025-02-24 11:38:42 +00:00
Muhammad Asif
cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
Mashy Green
717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
Mashy Green
98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
Mashy Green
fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regresion test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after revering too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
Mashy Green
d2dd8f54e2 Fixing after revering too much! 2025-02-17 17:32:27 +00:00
Mashy Green
7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce through Host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not put has kinder barrier sequence for NVLINK type access as when
GET is done, I can use it without barrier. Moves a barrier to a nicer
place, overlapped with DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issuing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing if doing "GET" not "PUT" is different.
This is somewhat better timing for Barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for GPU non aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not put ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not put for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform Memory manager of
deletion and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine grained scheduling is possible.
2025-02-04 19:27:26 +00:00
Ed Bennett
8729c46169 add clover energy density measurement to default WilsonFlow measurements 2025-02-03 14:27:55 +00:00
Ed Bennett
09f81fe7c3 don't force energy density measurement to be every wilson flow iteration 2025-02-03 14:27:45 +00:00
Ed Bennett
1876e5b7c0 correct tests/smearing/WilsonFlow to use non-adaptive flow and use correct interface 2025-02-03 14:27:29 +00:00
Mashy Green
355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
b50fb34e71 Perf on Aurora 2025-02-01 18:39:34 +00:00
de84d730ff Fastest run config on Aurora to date 2025-02-01 18:08:40 +00:00
Peter Boyle
c74d11e3d7 PVdagM MG 2025-02-01 11:04:13 -05:00
Christoph Lehner
84cab5e6e7 no comms and log cleanup 2025-02-01 16:37:21 +01:00
c4fc972fec Merge branch 'feature/deprecate-uvm' into develop 2025-01-31 16:32:36 +00:00
Mashy Green
4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
Mashy Green
aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
Chulwoo Jung
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
Chulwoo Jung
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
Peter Boyle
3f3661a86f Heading towards PVdagM multigrid 2025-01-17 14:33:35 +00:00
Chulwoo Jung
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
Chulwoo Jung
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
Mashy Green
d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
Mashy Green
e99d42404e Removing the regresion test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
Mashy Green
3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
Mashy Green
47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
Peter Boyle
5a4f9bf2e3 Force the ROCM version 2024-10-29 18:12:31 -04:00
Peter Boyle
f617468e04 Update Lattice_base.h 2024-10-11 10:39:16 -04:00
Peter Boyle
ee4046fe92 Added a dimension ordered column sum based reduction for scalar.
Removes dependence on MPI_Allreduce and allows for work around on
systems where this is bollox.
2024-09-27 09:26:03 -04:00
Peter Boyle
2a9cfeb9ea New files 2024-09-26 14:23:29 -04:00
Peter Boyle
1147b8ea40 Cheby poly setup 2024-09-26 14:20:32 -04:00
Peter Boyle
3f9119b39d Remove vectors used for the power spectrum table in paper 2024-09-26 14:19:41 -04:00
Peter Boyle
35e8225abd Verbose control 2024-09-26 14:18:35 -04:00
Peter Boyle
bdbfbb7a14 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-09-26 14:05:45 -04:00
Peter Boyle
f7d4be8d96 Calculate bytes correctly 2024-09-26 14:04:44 -04:00
Ed Bennett
8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
149 changed files with 8128 additions and 806 deletions

View File

@@ -37,6 +37,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/gparity/Gparity.h>
#include <Grid/qcd/spin/Pauli.h> // depends on Gparity
#include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore);

View File

@@ -191,7 +191,7 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
std::cout << "CPU view" << std::endl;
//std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -215,7 +215,7 @@ public:
else if ( sign == forward ) div = 1.0;
else assert(0);
std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,7 +229,7 @@ public:
}
// Barrel shift and collect global pencil
std::cout << GridLogPerformance<<"Making pencil" << std::endl;
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@@ -251,7 +251,7 @@ public:
}
}
std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@@ -274,7 +274,7 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@@ -291,7 +291,7 @@ public:
}
result = result*div;
std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif

View File

@@ -277,6 +277,38 @@ public:
assert(0);
}
};
// Linear operator wrapper that forwards to a wrapped Matrix and adds
// shift*in to the result of the diagonal, forward and adjoint applications.
// The Hermitian entry points are not supported and trip assert(0).
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
  Matrix &_Mat;    // wrapped matrix, held by reference
  RealD   _shift;  // coefficient of the added shift*in term

  // out <- out + _shift * in ; shared by OpDiag, Op and AdjOp.
  void AddShift(const Field &in, Field &out) {
    out = out + _shift * in;
  }

public:
  ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft)
    : _Mat(Mat), _shift(shft)
  {}

  // Support for coarsening to a multigrid
  // Diagonal term: Mdiag followed by the shift.
  void OpDiag (const Field &in, Field &out) {
    _Mat.Mdiag(in,out);
    AddShift(in,out);
  }
  // Single directional term: forwarded to Mdir, no shift is added here.
  void OpDir (const Field &in, Field &out,int dir,int disp) {
    _Mat.Mdir(in,out,dir,disp);
  }
  // All directional terms: forwarded to MdirAll, no shift is added here.
  void OpDirAll (const Field &in, std::vector<Field> &out) {
    _Mat.MdirAll(in,out);
  }

  // out = M in + shift * in
  void Op (const Field &in, Field &out) {
    _Mat.M(in,out);
    AddShift(in,out);
  }
  // out = Mdag in + shift * in
  void AdjOp (const Field &in, Field &out) {
    _Mat.Mdag(in,out);
    AddShift(in,out);
  }

  // Hermitian interface is not provided by this operator.
  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) {
    assert(0);
  }
  void HermOp(const Field &in, Field &out) {
    assert(0);
  }
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several

View File

@@ -269,7 +269,9 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;

View File

@@ -208,8 +208,8 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -367,28 +367,67 @@ public:
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
@@ -414,8 +453,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpB!=GridBLAS_OP_T);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -514,28 +553,70 @@ public:
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
@@ -661,29 +742,41 @@ public:
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
} );
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
}
@@ -809,28 +902,40 @@ public:
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);

View File

@@ -144,11 +144,11 @@ public:
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
@@ -242,16 +242,16 @@ public:
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
@@ -358,17 +358,17 @@ public:
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};

View File

@@ -63,7 +63,12 @@ class TwoLevelCGmrhs
GridStopWatch SmoothTimer;
GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most opertor functions
TwoLevelCGmrhs(RealD tol,
Integer maxit,
@@ -74,6 +79,12 @@ class TwoLevelCGmrhs
MaxIterations(maxit),
_FineLinop(FineLinop),
_Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{
grid = fine;
};
@@ -81,8 +92,8 @@ class TwoLevelCGmrhs
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
SolveSingleSystem(src,x);
// SolvePrecBlockCG(src,x);
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -657,6 +668,8 @@ public:
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
// this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
@@ -669,6 +682,7 @@ public:
this->SmoothTimer.Stop();
}
#endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) {
@@ -705,9 +719,11 @@ public:
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop();
this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) {
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
}
// this->zzz=out[0];
this->FineTimer.Stop();
}
};

View File

@@ -245,9 +245,10 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@@ -255,6 +256,7 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);

View File

@@ -74,7 +74,7 @@ public:
void operator() (const Field &src, Field &psi){
psi=Zero();
// psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;

View File

@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
#pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid);
inline RealD AggregatePowerLaw(RealD x)
@@ -95,7 +97,7 @@ public:
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
ConjugateGradient<FineField> CG(1.0e-3,400,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
@@ -108,7 +110,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]);
@@ -124,6 +126,53 @@ public:
}
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Fill subspace[0..nn) by filtering normalised Gaussian noise through a
// PrecGeneralisedConjugateResidualNonHermitian solve with a trivial preconditioner.
// Each vector is passed through the solver twice and renormalised after every pass.
// The matrix element <v|Op|v> is logged before and after filtering.
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
  const int npass = 2; // solve + renormalise rounds applied to every vector

  RealD inv_norm;
  TrivialPrecon<FineField> simple_fine;
  // NOTE(review): leading arguments are presumably (tolerance, max iterations) - confirm against the solver
  PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);

  FineField noise(FineGrid);
  FineField src(FineGrid);
  FineField guess(FineGrid);
  FineField Mn(FineGrid);

  for(int vec=0;vec<nn;vec++){

    subspace[vec] = Zero();

    // Fresh unit-norm random vector
    gaussian(RNG,noise);
    inv_norm = std::pow(norm2(noise),-0.5);
    noise    = noise*inv_norm;

    DiracOp.Op(noise,Mn);
    std::cout<<GridLogMessage << "noise ["<<vec<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;

    for(int pass=0;pass<npass;pass++){
      // void operator() (const Field &src, Field &psi){
#if 1
      // Solve with the current vector as source, starting from a zero guess
      std::cout << GridLogMessage << " inverting on noise "<<std::endl;
      src   = noise;
      guess = Zero();
      GCR(src,guess);
      subspace[vec] = guess;
#else
      // Alternative: zero source with the current vector as the initial guess
      std::cout << GridLogMessage << " inverting on zero "<<std::endl;
      src   = Zero();
      guess = noise;
      GCR(src,guess);
      subspace[vec] = guess;
#endif
      // Renormalise the solver output; it feeds the next pass
      noise    = subspace[vec];
      inv_norm = std::pow(norm2(noise),-0.5);
      noise    = noise*inv_norm;
    }

    DiracOp.Op(noise,Mn);
    std::cout<<GridLogMessage << "filtered["<<vec<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;

    subspace[vec] = noise;
  }
}
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
@@ -160,14 +209,21 @@ public:
int b =0;
{
ComplexD ip;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
@@ -213,8 +269,18 @@ public:
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
ComplexD ip;
hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++;
}
@@ -228,6 +294,70 @@ public:
}
assert(b==nn);
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Build nn subspace vectors from repeated Chebyshev filtering of a single noise vector.
//
//  subspace[0] : Chebyshev(lo1,hi,orderfilter) applied to normalised Gaussian noise.
//  subspace[n] : Chebyshev(lo2,hi,orderstep) applied to subspace[n-1], then
//                Gram-Schmidt orthogonalised against subspace[0..n-1] and normalised.
//
// After each vector is stored, the norm of hermop applied to it and the norm of the
// vector itself are logged.
//
// Parameters:
//   RNG          parallel random number generator used for the initial noise
//   hermop       operator handed to the Chebyshev filters
//   nn           number of subspace vectors to produce
//   hi           upper bound of both Chebyshev windows
//   lo1          lower bound of the initial filter window
//   orderfilter  polynomial order of the initial filter
//   lo2          lower bound of the stepping filter window
//   orderstep    polynomial order of the stepping filter
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
				     int nn,
				     double hi,
				     double lo1,
				     int orderfilter,
				     double lo2,
				     int orderstep)
{
  RealD scale;

  FineField noise(FineGrid);
  FineField Mn(FineGrid);
  FineField tmp(FineGrid);

  // New normalised noise
  gaussian(RNG,noise);
  scale = std::pow(norm2(noise),-0.5);
  noise=noise*scale;

  std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;

  // Initial matrix element
  hermop.Op(noise,Mn);
  std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

  int b =0;
  {
    // Filter
    // Report the order actually used to build this filter (orderfilter);
    // the message previously printed orderstep, which belongs to the loop below.
    std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderfilter<<std::endl;
    Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
    Cheb(hermop,noise,Mn);

    // normalise
    scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
    subspace[b]   = Mn;

    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
  }

  // Generate a full sequence of Chebyshevs
  for(int n=1;n<nn;n++){
    std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
    Chebyshev<FineField> Cheb(lo2,hi,orderstep);
    Cheb(hermop,subspace[n-1],Mn);

    // Gram-Schmidt: remove the components along all previously stored vectors
    for(int m=0;m<n;m++){
      ComplexD c = innerProduct(subspace[m],Mn);
      Mn = Mn - c*subspace[m];
    }

    // normalise
    scale = std::pow(norm2(Mn),-0.5);
    Mn=Mn*scale;
    subspace[n]=Mn;

    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
  }
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,

View File

@@ -441,8 +441,20 @@ public:
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//
// Convenience overload: forwards to the three-argument CoarsenOperator,
// passing the same aggregation as both the U and V arguments.
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
@@ -458,11 +470,9 @@ public:
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
// for(int s=0;s<Subspace.subspace.size();s++){
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
// }
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*Subspace.subspace[i];
phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;

View File

@@ -69,7 +69,7 @@ public:
}
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
@@ -175,10 +175,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
/*
template<class T> class vecView
{
protected:
@@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
#define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
NAMESPACE_END(Grid);

View File

@@ -9,6 +9,7 @@ static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...)
//#define mprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
@@ -109,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -119,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@@ -139,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return;
@@ -153,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
@@ -167,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@@ -182,7 +183,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
@@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
@@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
if(bytes>=DeviceMaxBytes) {
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
}
assert(bytes<DeviceMaxBytes);
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
@@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
@@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else {
assert(0);
}
@@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n");
dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache);
}
@@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache);
} else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)

View File

@@ -31,5 +31,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cartesian/Cartesian_base.h>
#include <Grid/cartesian/Cartesian_full.h>
#include <Grid/cartesian/Cartesian_red_black.h>
#include <Grid/cartesian/CartesianCrossIcosahedron.h>
#endif

View File

@@ -0,0 +1,199 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/CartesianCrossIcosahedron.h
Copyright (C) 2025
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
// Selects which mesh elements a GridCartesianCrossIcosahedron stores.
// The constructor of that class allocates pole storage only for IcosahedralVertices.
enum IcosahedralMeshType {
IcosahedralVertices,
IcosahedralEdges
} ;
// Labels for the two poles.
enum NorthSouth {
North = 1,
South = 0
};
// Extent required of the last grid dimension; asserted in the
// GridCartesianCrossIcosahedron constructor.
const int num_icosahedron_tiles = 10;
/////////////////////////////////////////////////////////////////////////////////////////
// Cartesian grid whose leading two dimensions are the L x L patch coordinates and whose
// last dimension indexes the num_icosahedron_tiles patches of the icosahedron.
//
// For a vertex mesh (IcosahedralVertices) the grid additionally reserves storage for the
// north and south pole sites, appended after the regular outer sites on the rank that
// owns each pole.  For an edge mesh no pole storage exists and all pole queries return 0.
/////////////////////////////////////////////////////////////////////////////////////////
class GridCartesianCrossIcosahedron: public GridCartesian {

public:

  // Which mesh elements (vertices or edges) this grid stores
  IcosahedralMeshType meshType;

  IcosahedralMeshType MeshType(void) { return meshType; };

  /////////////////////////////////////////////////////////////////////////
  // Constructor takes a parent grid and possibly subdivides communicator.
  /////////////////////////////////////////////////////////////////////////
  /*
  GridCartesian(const Coordinate &dimensions,
		const Coordinate &simd_layout,
		const Coordinate &processor_grid,
		const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
  {
    assert(0); // No subdivision
  }
  GridCartesian(const Coordinate &dimensions,
		const Coordinate &simd_layout,
		const Coordinate &processor_grid,
		const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
  {
    assert(0); // No subdivision
  }
  */

  /////////////////////////////////////////////////////////////////////////
  // Construct from comm world
  //
  //  dimensions     : global lattice extents; the last one must equal num_icosahedron_tiles
  //  simd_layout    : SIMD decomposition; must be 1 in the two leading (patch) dimensions
  //  processor_grid : MPI decomposition; at most 2 ranks across the patch-index dimension
  //  _meshType      : vertex or edge mesh; vertex meshes get pole storage
  /////////////////////////////////////////////////////////////////////////
  GridCartesianCrossIcosahedron(const Coordinate &dimensions,
				const Coordinate &simd_layout,
				const Coordinate &processor_grid,
				IcosahedralMeshType _meshType) : GridCartesian(dimensions,simd_layout,processor_grid)
  {
    meshType = _meshType;

    assert(simd_layout[0]==1); // Force simd into perpendicular dimensions
    assert(simd_layout[1]==1); // to avoid pole storage complexity interacting with SIMD.
    assert(dimensions[_ndimension-1]==num_icosahedron_tiles);
    assert(processor_grid[_ndimension-1]<=2); // Keeps the patches that need a pole on the same node

    // allocate the pole storage if we are seeking vertex domain data
    if ( meshType == IcosahedralVertices ) {
      InitPoles();
    }
  }

  virtual ~GridCartesianCrossIcosahedron() = default;

  ////////////////////////////////////////////////
  // Pole bookkeeping.
  // Default to "no pole on this rank": InitPoles() is only run for vertex meshes, so
  // without these initialisers an edge mesh would return indeterminate values from
  // the ownsXPole()/XPoleOsite()/XPoleOsites() queries below.
  ////////////////////////////////////////////////
  int hasNorthPole    = 0; // 1 if this rank stores the north pole
  int hasSouthPole    = 0; // 1 if this rank stores the south pole
  int northPoleOsite  = 0; // first outer-site index of the north pole storage
  int southPoleOsite  = 0; // first outer-site index of the south pole storage
  int northPoleOsites = 0; // number of outer sites reserved for the north pole
  int southPoleOsites = 0; // number of outer sites reserved for the south pole

  ////////////////////////////////////////////////
  // Use to decide if a given grid is icosahedral
  ////////////////////////////////////////////////
  virtual int isIcosahedral(void)       override { return 1;}
  virtual int isIcosahedralVertex(void) override { return meshType==IcosahedralVertices;}
  virtual int isIcosahedralEdge  (void) override { return meshType==IcosahedralEdges;}
  virtual int ownsNorthPole(void)   const override { return hasNorthPole; };
  virtual int NorthPoleOsite(void)  const override { return northPoleOsite; };
  virtual int NorthPoleOsites(void) const override { return northPoleOsites; };
  virtual int ownsSouthPole(void)   const override { return hasSouthPole; };
  virtual int SouthPoleOsite(void)  const override { return southPoleOsite; };
  virtual int SouthPoleOsites(void) const override { return southPoleOsites; };

  ////////////////////////////////////////////////
  // Grow the site counts to hold the two poles and record, for this rank, whether it
  // owns each pole and where in the outer-site index range the pole data lives.
  // Called once from the constructor for vertex meshes; each call grows the counts again.
  ////////////////////////////////////////////////
  void InitPoles(void)
  {
    int Ndm1 = _ndimension-1;

    ///////////////////////
    // Add the extra pole storage
    ///////////////////////
    // Vertices = 1x LxLx D1...Dn + 2.D1...Dn
    // Start after the LxL and don't include the 10 patch dim
    int OrthogSize = 1;
    for (int d = 2; d < Ndm1; d++) {
      OrthogSize *= _gdimensions[d];
    }
    _fsites += OrthogSize*2;
    _gsites += OrthogSize*2;

    // Simd reduced sizes are multiplied up.
    // If the leading LxL are simd-ized, the vector objects will contain "redundant" lanes
    // which should contain identical north (south) pole data
    OrthogSize = 1;
    for (int d = 2; d < Ndm1; d++) {
      OrthogSize *= _rdimensions[d];
    }

    // Grow the local volume to hold pole data on the single rank that owns each pole:
    //   south pole : last rank in x, first rank in y, first rank in the patch dimension
    //   north pole : first rank in x, last rank in y, last rank in the patch dimension
    // SIMD must be placed in the orthogonal directions.
    Coordinate pcoor = this->ThisProcessorCoor();
    Coordinate pgrid = this->ProcessorGrid();

    const int xdim=0;
    const int ydim=1;
    /*
     *
     *      /\/\/\/\/\
     *     /\/\/\/\/\/
     *     \/\/\/\/\/
     *
     *      y
     *     /
     *     \x
     *
     * Labelling patches as   5 6 7 8 9
     *                       0 1 2 3 4
     *
     * Will ban distribution of the patch dimension by more than 2.
     *
     * Hence all 5 patches associated with the pole must have the
     * appropriate "corner" of the patch L^2 located on the SAME rank.
     */
    if( (pcoor[xdim]==pgrid[xdim]-1) && (pcoor[ydim]==0) && (pcoor[Ndm1]==0) ){
      hasSouthPole   =1;
      southPoleOsite =this->_osites;
      southPoleOsites=OrthogSize;
      this->_osites += OrthogSize;
    } else {
      hasSouthPole   =0;
      southPoleOsites=0;
      southPoleOsite =0;
    }

    // The north pole is appended after any south pole storage on the same rank
    if( (pcoor[xdim]==0) && (pcoor[ydim]==pgrid[ydim]-1) && (pcoor[Ndm1]==pgrid[Ndm1]-1) ){
      hasNorthPole   =1;
      northPoleOsite =this->_osites;
      northPoleOsites=OrthogSize;
      this->_osites += OrthogSize;
    } else {
      hasNorthPole   =0;
      northPoleOsites=0;
      northPoleOsite =0;
    }

    std::cout << "Icosahedral vertex field volume " << this->_osites<<std::endl;
    std::cout << "Icosahedral south pole offset " << this->southPoleOsite<<std::endl;
    std::cout << "Icosahedral north pole offset " << this->northPoleOsite<<std::endl;
    std::cout << "Icosahedral south pole size " << this->southPoleOsites<<std::endl;
    std::cout << "Icosahedral north pole size " << this->northPoleOsites<<std::endl;
  };
};
NAMESPACE_END(Grid);

View File

@@ -86,10 +86,22 @@ public:
public:
// Icosahedral decisions
// Base-class defaults: every query below returns 0.
// GridCartesianCrossIcosahedron overrides all of them.
virtual int isIcosahedral(void) { return 0;}
virtual int isIcosahedralVertex(void) { return 0;}
virtual int isIcosahedralEdge (void) { return 0;}
virtual int ownsNorthPole(void) const { return 0; };
virtual int ownsSouthPole(void) const { return 0; };
virtual int NorthPoleOsite(void) const { return 0; };
virtual int SouthPoleOsite(void) const { return 0; };
// NOTE(review): the two *PoleOsites defaults also print "base osites" to std::cout on
// every call - looks like a debugging aid for spotting unintended base-class calls; confirm.
virtual int NorthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
virtual int SouthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
////////////////////////////////////////////////////////////////
// Checkerboarding interface is virtual and overridden by
// GridCartesian / GridRedBlackCartesian
////////////////////////////////////////////////////////////////
virtual int CheckerBoarded(int dim) =0;
virtual int CheckerBoard(const Coordinate &site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
@@ -176,6 +188,8 @@ public:
}
return permute_type;
}
////////////////////////////////////////////////////////////////
// Array sizing queries
////////////////////////////////////////////////////////////////

View File

@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
@@ -127,7 +129,7 @@ public:
void GlobalSumVector(ComplexD *c,int N);
void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
@@ -147,7 +149,8 @@ public:
sizeof(obj),d*100+p);
}
CommsComplete(list);
if (!list.empty()) // avoid triggering assert in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
@@ -192,6 +195,11 @@ public:
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,

View File

@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
@@ -259,32 +260,39 @@ CartesianCommunicator::~CartesianCommunicator()
}
////////////////////////////////////////////////////////////////////////////
// Global sums over the communicator.
// Every routine records a FlightRecorder step before reducing in place.
// With USE_GRID_REDUCTION the float/double sums go through GlobalSumP2P;
// otherwise, and for the integer sums always, MPI_Allreduce is used.
////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f)
{
  FlightRecorder::StepLog("GlobalSumP2P");
  GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  FlightRecorder::StepLog("GlobalSumP2P");
  GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f)
{
  FlightRecorder::StepLog("AllReduce");
  const int status = MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(status==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  FlightRecorder::StepLog("AllReduce");
  const int status = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(status==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u)
{
  FlightRecorder::StepLog("AllReduce");
  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(status==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u)
{
  FlightRecorder::StepLog("AllReduce");
  const int status = MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(status==0);
}
// Element-wise sum of an array of N values
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N)
{
  FlightRecorder::StepLog("AllReduceVector");
  const int status = MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
  assert(status==0);
}
@@ -362,8 +370,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes)
{
std::vector<MpiCommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor;
int ierr;
@@ -379,9 +385,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@@ -399,6 +402,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
@@ -440,8 +445,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
}
// This is a NVLINK PUT
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
@@ -450,9 +462,11 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
return off_node_bytes;
@@ -461,7 +475,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
if (nreq==0) return;
@@ -561,53 +575,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
#else
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
const int chunks=1;
for(int n=0;n<chunks;n++){
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#endif
}
}
return off_node_bytes;
}
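/*
 * In the interest of better pipelining, the two polling routines below overlap
 * copies with MPI traffic:
 *  - PollIRecv polls each posted receive and starts its host-to-device copy
 *    as soon as that receive has completed;
 *  - PollDtoH polls each device-to-host copy and starts the matching
 *    MPI_Isend as soon as that copy has completed.
 */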
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
@@ -644,69 +710,89 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
// static int printed;
// if((printed<8) && this->IsBoss() ) {
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
// printed++;
// }
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
// Find the send in the prepared list
int list_idx=-1;
for(int idx = 0; idx<list.size();idx++){
if ( (list[idx].device_buf==xmit)
&&(list[idx].PacketType==InterNodeXmit)
&&(list[idx].bytes==xbytes) ) {
list_idx = idx;
host_xmit = list[idx].host_buf;
}
}
assert(list_idx != -1); // found it
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[list_idx].req = xrq; // Update the MPI request in the list
off_node_bytes+=xbytes;
#endif
} else {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
std::vector<MPI_Request> MpiRequests(nreq);
std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
for(int r=0;r<nreq;r++){
MpiRequests[r] = list[r].req;
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
assert(ierr==0);
}
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
assert(ierr==0);
for(int r=0;r<nreq;r++){
if ( list[r].PacketType==InterNodeRecv ) {
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
}
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
acceleratorCopySynchronise(); // Complete all pending copy transfers
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
this->StencilBarrier();
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
@@ -715,6 +801,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
/*
 * Barrier over the ShmComm communicator.
 * Logs a "NodeBarrier" step with the FlightRecorder before entering the
 * barrier, and checks the MPI return code in the same way as Barrier().
 */
void CartesianCommunicator::StencilBarrier(void)
{
  FlightRecorder::StepLog("NodeBarrier");
  int ierr = MPI_Barrier(ShmComm);
  assert(ierr==0);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -722,11 +809,13 @@ void CartesianCommunicator::StencilBarrier(void)
//}
/*
 * Barrier over `communicator`.
 * A "GridBarrier" step is logged with the FlightRecorder first; a non-zero
 * MPI return code trips the assert.
 */
void CartesianCommunicator::Barrier(void)
{
  FlightRecorder::StepLog("GridBarrier");
  const int rc = MPI_Barrier(communicator);
  assert(rc==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -745,6 +834,7 @@ void CartesianCommunicator::BarrierWorld(void){
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -767,6 +857,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.

View File

@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{
assert(0);
}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
@@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
return 2.0*bytes;
}
// Stub: nothing to poll in this implementation, so the request list is left untouched.
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
}
// Stub: nothing to poll in this implementation, so the request list is left untouched.
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
}
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
@@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return xbytes+rbytes;
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,

View File

@@ -50,12 +50,30 @@ typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
/*
* Enable state transitions as each packet flows.
*/
// Stage of a stencil communication packet. The stencil send/recv routines rewrite
// CommsRequest_t::PacketType as a packet advances from one stage to the next.
enum PacketType_t {
FaceGather, // NOTE(review): not referenced by the communicator code shown alongside this header -- confirm intended use
InterNodeXmit, // off-node send whose staging copy is polled via its event; MPI_Isend not yet posted
InterNodeRecv, // off-node receive with an MPI request outstanding; polled with MPI_Test
IntraNodeXmit, // on-node send done as an asynchronous device-to-device copy into a translated shared-memory buffer
IntraNodeRecv, // on-node receive done as an asynchronous device-to-device copy out of a translated shared-memory buffer
InterNodeXmitISend, // off-node send after MPI_Isend has been posted; its request still has to be completed
InterNodeReceiveHtoD // off-node receive that completed on the host and whose host-to-device copy has been issued
};
/*
*Package arguments needed for various actions along packet flow
*/
// One entry per packet in a stencil communication request list.
typedef struct {
PacketType_t PacketType; // current stage of this packet (see PacketType_t)
void *host_buf; // host buffer handed to MPI; set to NULL for intranode packets
void *device_buf; // device buffer associated with the packet
int dest; // destination rank used when MPI_Isend is posted
int tag; // MPI tag used when MPI_Isend is posted; set to -1 for intranode packets
int commdir; // index into communicator_halo[] selecting the communicator for the send
unsigned long bytes; // payload size in bytes
acceleratorEvent_t ev; // event for the asynchronous accelerator copy; tested with acceleratorEventIsComplete
MpiCommsRequest_t req; // MPI request handle of the posted non-blocking operation
} CommsRequest_t;
#endif
@@ -119,7 +137,7 @@ public:
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};

View File

@@ -542,12 +542,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
printf("Host buffer allocate for GPU non-aware MPI\n");
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
@@ -916,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes);
//#else
// bcopy(src,dest,bytes);
//#endif
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@@ -959,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
}
ShmBufferFreeAll();
@@ -989,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
//SharedMemoryTest();
// SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@@ -1011,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
}
}
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}
void *SharedMemory::ShmBuffer(int rank)

View File

@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
// acceleratorCopyToDevice(src,dest,bytes);
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
assert(!rhs.Grid()->isIcosahedral());
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
@@ -68,7 +70,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
#if 1
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
@@ -125,7 +127,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0;
@@ -156,16 +162,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
@@ -226,12 +245,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
@@ -283,11 +306,22 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
xbytes+=bytes;
grid->Barrier();
@@ -311,234 +345,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
#else
// Circular shift of rhs into ret along `dimension` by `shift`, for a dimension that is
// split across processors (comm_dim==1) and not split across SIMD lanes (simd_layout==1).
// cbmask selects the checkerboard(s) handled; 0x3 means both, otherwise half the plane is moved.
// Planes that stay on this rank are copied locally; the others are gathered, exchanged with
// SendToRecvFrom through buffers obtained from ShmBufferMalloc, and scattered into ret.
// Timings and bandwidth are printed when Cshift_verbose is set.
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
// Preconditions: this routine only handles the non-SIMD, multi-processor case with 0 <= shift < fd
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
// Accumulated timings (microseconds from usecond()) and bytes sent
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
// Gather/scatter staging vectors; static so they persist across calls
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
// Release all previous Shm buffer allocations, then take one send and one receive buffer
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
// sx: source plane index; comm_proc: relative processor offset holding that plane (0 => this rank)
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
// a single checkerboard moves only half of the plane
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
// stage the gathered plane into the Shm send buffer, exchange, then copy the result back out
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
// Circular shift of rhs into ret along `dimension` by `shift`, for a dimension that is split
// across processors (comm_dim==1) and across two SIMD lanes (simd_layout==2).
// Each plane is extracted into one scalar buffer per SIMD lane; for every lane the neighbour
// lane/processor is computed, off-rank data is exchanged with SendToRecvFrom through buffers
// obtained from ShmBufferMalloc, and the per-lane buffers are merged back into ret.
// Timings and bandwidth are printed when Cshift_verbose is set.
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
// Preconditions: multi-processor dimension with a SIMD layout of exactly 2 and 0 <= shift < fd
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
// Accumulated timings (microseconds from usecond()) and bytes sent; tcopy is never updated here
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
// One scalar staging buffer per SIMD lane; static so they persist across calls
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
// Release all previous Shm buffer allocations, then take one send and one receive buffer
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
// inner_bit is the lane-index bit tested to obtain this lane's inner coordinate (ic) along the dimension
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
// source lane lives on another rank: exchange lane nbr_lane's buffer and receive into lane i
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
// source lane is on this rank: point straight at the locally extracted buffer
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
}
}
#endif
NAMESPACE_END(Grid);

View File

@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
assert(!rhs.Grid()->isIcosahedral());
Lattice<vobj> ret(rhs.Grid());
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
Cshift_local(ret,rhs,dimension,shift);

View File

@@ -236,7 +236,7 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 0
#if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
@@ -373,14 +373,17 @@ public:
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
typedef typename vobj::scalar_object sobj;
for(int64_t g=0;g<o.Grid()->_gsites;g++){
uint64_t gsites=1;
uint64_t polesites=0;
for(int d=0;d<o.Grid()->_ndimension;d++) gsites *= o.Grid()->_gdimensions[d];
for(int64_t g=0;g<gsites;g++){
Coordinate gcoor;
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
sobj ss;
peekSite(ss,o,gcoor);
stream<<"[";
stream<<"["<< g<<" : ";
for(int d=0;d<gcoor.size();d++){
stream<<gcoor[d];
if(d!=gcoor.size()-1) stream<<",";
@@ -388,6 +391,41 @@ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Latti
stream<<"]\t";
stream<<ss<<std::endl;
}
if ( o.Grid()->isIcosahedral() ) {
uint64_t psites=1;
Coordinate perpdims;
for(int d=2;d<o.Grid()->_ndimension-1;d++){
int pd=o.Grid()->_gdimensions[d];
psites*=pd;
perpdims.push_back(pd);
}
for(uint64_t p=0;p<psites;p++){
sobj ss;
Coordinate orthog;
Lexicographic::CoorFromIndex(orthog,p,perpdims);
peekPole(ss,o,orthog,South);
stream<<"[ SouthPole : ";
for(int d=0;d<orthog.size();d++){
stream<<orthog[d];
if(d!=orthog.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
for(uint64_t p=0;p<psites;p++){
sobj ss;
Coordinate orthog;
Lexicographic::CoorFromIndex(orthog,p,perpdims);
peekPole(ss,o,orthog,North);
stream<<"[ NorthPole : ";
for(int d=0;d<orthog.size();d++){
stream<<orthog[d];
if(d!=orthog.size()-1) stream<<",";
}
stream<<"]\t";
stream<<ss<<std::endl;
}
}
return stream;
}

View File

@@ -34,22 +34,86 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
typedef typename iobj::scalar_type scalar_type;
typedef typename iobj::vector_type vector_type;
l=Zero();
GridBase *grid = l.Grid();
int Nsimd = grid->iSites();
autoView(l_v, l, CpuWrite);
thread_for( o, grid->oSites(), {
vector_type vI;
Coordinate gcoor;
ExtractBuffer<scalar_type> mergebuf(Nsimd);
for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu];
int cartesian_vol = grid->oSites();
if ( grid->isIcosahedral() ) {
cartesian_vol = cartesian_vol - grid->NorthPoleOsites()-grid->SouthPoleOsites();
}
{
autoView(l_v, l, CpuWrite);
thread_for( o, cartesian_vol, {
vector_type vI;
Coordinate gcoor;
ExtractBuffer<scalar_type> mergebuf(Nsimd);
for(int i=0;i<grid->iSites();i++){
grid->RankIndexToGlobalCoor(grid->ThisRank(),o,i,gcoor);
mergebuf[i]=(Integer)gcoor[mu];
}
merge<vector_type,scalar_type>(vI,mergebuf);
l_v[o]=vI;
});
}
if (grid->isIcosahedralVertex()) {
uint64_t psites=1;
Coordinate perpdims;
typename iobj::scalar_object ss;
for(int d=2;d<grid->_ndimension-1;d++){
int pd=grid->_gdimensions[d];
psites*=pd;
perpdims.push_back(pd);
}
merge<vector_type,scalar_type>(vI,mergebuf);
l_v[o]=vI;
});
for(uint64_t p=0;p<psites;p++){
Coordinate orthog;
Lexicographic::CoorFromIndex(orthog,p,perpdims);
int icoor;
if ( mu>=2 && mu < grid->_ndimension-1) {
icoor = orthog[mu-2];
} else {
icoor = -1;
}
ss=scalar_type(icoor);
pokePole(ss,l,orthog,South);
pokePole(ss,l,orthog,North);
}
}
};
/*
 * Zero the lattice, then write the scalar value one into the selected pole
 * (`pole`) for every combination of coordinates in dimensions 2 .. Nd-2.
 * Requires an icosahedral-vertex grid: asserts on it, and does nothing beyond
 * the zeroing when asserts are disabled and the grid is of another kind.
 */
template<class iobj> inline void LatticePole(Lattice<iobj> &l,NorthSouth pole)
{
  using sobj        = typename iobj::scalar_object;
  using scalar_type = typename iobj::scalar_type;

  GridBase *grid = l.Grid();

  l=Zero();

  assert(grid->isIcosahedralVertex());
  if (!grid->isIcosahedralVertex()) return;

  // Extents of the dimensions orthogonal to the sphere, and their product
  Coordinate perpdims;
  uint64_t npoles=1;
  for(int d=2;d<grid->_ndimension-1;d++){
    const int extent=grid->_gdimensions[d];
    perpdims.push_back(extent);
    npoles*=extent;
  }

  // Value written to every pole site
  scalar_type one(1.0);
  sobj unit;
  unit=one;

  for(uint64_t p=0;p<npoles;p++){
    Coordinate orthog;
    Lexicographic::CoorFromIndex(orthog,p,perpdims);
    pokePole(unit,l,orthog,pole);
  }
}
NAMESPACE_END(Grid);

View File

@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
autoView( l_v , l, CpuRead);
extract(l_v[odx],buf);
s = buf[idx];
@@ -151,6 +151,136 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
return;
};
// zero for south pole, one for north pole
template<class vobj,class sobj>
void peekPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
s=Zero();
GridBase *grid=l.Grid();
assert(grid->isIcosahedral());
assert(grid->isIcosahedralVertex());
int Nsimd = grid->Nsimd();
int rank;
int Ndm1 = grid->_ndimension-1;
Coordinate pgrid = grid->ProcessorGrid();
const int xdim=0;
const int ydim=1;
const int pdim=Ndm1;
int64_t pole_osite;
int64_t pole_isite;
Coordinate rdims;
Coordinate idims;
Coordinate ocoor;
Coordinate icoor;
Coordinate pcoor(grid->_ndimension);
for(int d=2;d<Ndm1;d++){
int dd=d-2;
rdims.push_back(grid->_rdimensions[d]);
idims.push_back(grid->_simd_layout[d]);
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
pcoor[d] = orthog[dd]/grid->_ldimensions[d];
}
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
int64_t osite;
if(isNorth == North){
pcoor[xdim] = 0;
pcoor[ydim] = pgrid[ydim]-1;
pcoor[Ndm1] = pgrid[Ndm1]-1;
osite = pole_osite + grid->NorthPoleOsite();
} else {
pcoor[xdim] = pgrid[xdim]-1;
pcoor[ydim] = 0;
pcoor[Ndm1] = 0;
osite = pole_osite + grid->SouthPoleOsite();
}
rank = grid->RankFromProcessorCoor(pcoor);
if ( rank == grid->ThisRank() ) {
ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite);
extract(l_v[osite],buf);
s = buf[pole_isite];
}
grid->Broadcast(rank,s);
return;
};
/*
 * Write a scalar object into a pole of an icosahedral-vertex lattice.
 *
 *   s       : value to store; it is first broadcast from the boss rank
 *   l       : lattice to modify
 *   orthog  : global coordinates in dimensions 2 .. Nd-2 (orthog[d-2] for dimension d)
 *   isNorth : North selects the north pole, anything else the south pole
 *
 * Only the rank owning the pole modifies its data.  Because of the broadcast,
 * all ranks must call this collectively.
 */
template<class vobj,class sobj>
void pokePole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
{
  GridBase *grid=l.Grid();

  assert(grid->isIcosahedral());
  assert(grid->isIcosahedralVertex());

  grid->Broadcast(grid->BossRank(),s);

  int Nsimd = grid->Nsimd();
  int rank;
  int Ndm1 = grid->_ndimension-1;
  Coordinate pgrid = grid->ProcessorGrid();
  const int xdim=0;
  const int ydim=1;

  int64_t pole_osite;
  int64_t pole_isite;
  Coordinate rdims;
  Coordinate idims;
  Coordinate ocoor;
  Coordinate icoor;
  Coordinate pcoor(grid->_ndimension,0);

  // Split each orthogonal global coordinate into processor, SIMD (inner) and outer parts
  for(int d=2;d<Ndm1;d++){
    int dd = d-2;
    rdims.push_back(grid->_rdimensions[d]);
    idims.push_back(grid->_simd_layout[d]);
    icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
    ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
    pcoor[d] = orthog[dd]/grid->_ldimensions[d];
  }
  Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
  Lexicographic::IndexFromCoor(icoor,pole_isite,idims);

  // Select the processor coordinate and the outer-site offset for the requested pole
  int64_t osite;
  if(isNorth ==North){
    pcoor[xdim] = 0;
    pcoor[ydim] = pgrid[ydim]-1;
    pcoor[Ndm1] = pgrid[Ndm1]-1;
    osite = pole_osite + grid->NorthPoleOsite();
  } else {
    pcoor[xdim] = pgrid[xdim]-1;
    pcoor[ydim] = 0;
    pcoor[Ndm1] = 0;
    osite = pole_osite + grid->SouthPoleOsite();
  }
  rank = grid->RankFromProcessorCoor(pcoor);

  // extract-modify-merge cycle is easiest way and this is not perf critical
  if ( rank == grid->ThisRank() ) {
    ExtractBuffer<sobj> buf(Nsimd);
    autoView( l_v , l, CpuWrite);
    extract(l_v[osite],buf);
    buf[pole_isite] = s;
    merge(l_v[osite],buf);
  }
  return;
};
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
@@ -179,7 +309,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
for(int w=0;w<words;w++){
pt[w] = getlane(vp[w],idx);
}
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
return;
};
template<class vobj,class sobj>

View File

@@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();

View File

@@ -466,6 +466,12 @@ public:
static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<MpiCommsRequest_t> fwd_req;
std::vector<MpiCommsRequest_t> bwd_req;
@@ -495,9 +501,16 @@ public:
t_gather+=usecond()-t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#endif
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
@@ -508,9 +521,16 @@ public:
t_gather+= usecond() - t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#endif
t_comms+=usecond()-t;
}
@@ -533,8 +553,13 @@ public:
t=usecond();
grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
@@ -543,6 +568,11 @@ public:
t=usecond();
grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();

View File

@@ -49,7 +49,7 @@ static constexpr int Tm = 7;
static constexpr int Nc=Config_Nc;
static constexpr int Ns=4;
static constexpr int Nd=4;
static constexpr int Nd=Config_Nd;
static constexpr int Nhs=2; // half spinor
static constexpr int Nds=8; // double stored gauge field
static constexpr int Ngp=2; // gparity index range
@@ -75,6 +75,7 @@ static constexpr int InverseYes=1;
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
const int SpinorIndex = 2;
const int PauliIndex = 2; //TensorLevel counts from the bottom!
template<typename T> struct isSpinor {
static constexpr bool value = (SpinorIndex==T::TensorLevel);
};

View File

@@ -0,0 +1,196 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Copyright (C) 2020 - 2025
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion5D<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
// Return a pointer to the boundary mask whose layout matches `in`:
// the full-grid mask when the field's grid is not checkerboarded,
// otherwise the odd or even mask according to the field's checkerboard.
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
  // Full (non red-black) grid: only one mask applies.
  if (!in.Grid()->_isCheckerBoarded) {
    return &this->BoundaryMask;
  }
  // Checkerboarded grid: pick by parity.
  return (in.Checkerboard() == Odd) ? &this->BoundaryMaskOdd
                                    : &this->BoundaryMaskEven;
}
// Apply the boundary mask matching f's grid/checkerboard to f in place.
// The mask is selected by getCorrectMaskField and handed to
// CompactHelpers::ApplyBoundaryMask.
template<class Field>
void ApplyBoundaryMask(Field& f) {
  const MaskField* m = getCorrectMaskField(f);
  assert(m != nullptr); // checked once; the original asserted the same condition twice
  CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
int n_rhs;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);

View File

@@ -123,10 +123,10 @@ public:
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
peekLocalSite(ScalarUmu, Umu_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
for (int mu = 0; mu < Nd; mu++) ScalarUds(mu) = ScalarUmu(mu);
peekLocalSite(ScalarUmu, Uadj_v, lcoor);
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
for (int mu = 0; mu < Nd; mu++) ScalarUds(mu + Nd) = ScalarUmu(mu);
pokeLocalSite(ScalarUds, Uds_v, lcoor);
});

View File

@@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@@ -84,6 +85,15 @@ NAMESPACE_CHECK(DomainWall);
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
NAMESPACE_CHECK(Overlap);
///////////////////////////////////////////////////////////////////////////////
// Two spin wilson fermion based
///////////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
NAMESPACE_CHECK(TwoSpinWilson);
///////////////////////////////////////////////////////////////////////////////
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
///////////////////////////////////////////////////////////////////////////////
@@ -164,12 +174,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;

View File

@@ -41,8 +41,9 @@ NAMESPACE_CHECK(Compressor);
NAMESPACE_CHECK(FermionOperatorImpl);
#include <Grid/qcd/action/fermion/FermionOperator.h>
NAMESPACE_CHECK(FermionOperator);
#include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
#include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
#include <Grid/qcd/action/fermion/StaggeredKernels.h> //used by all wilson type fermions
#include <Grid/qcd/action/fermion/TwoSpinWilsonKernels.h> //used for 3D fermions, pauli in place of Dirac
NAMESPACE_CHECK(Kernels);
#endif

View File

@@ -180,6 +180,12 @@ NAMESPACE_CHECK(ImplGparityWilson);
#include <Grid/qcd/action/fermion/StaggeredImpl.h>
NAMESPACE_CHECK(ImplStaggered);
/////////////////////////////////////////////////////////////////////////////
// Two component spinor Wilson action for 3d / Boston
/////////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/TwoSpinWilsonImpl.h>
NAMESPACE_CHECK(ImplTwoSpinWilson);
/////////////////////////////////////////////////////////////////////////////
// Single flavour one component spinors with colour index. 5d vec
/////////////////////////////////////////////////////////////////////////////

View File

@@ -274,7 +274,7 @@ public:
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
Uds_v[ss](0)(mu+Nd) = Utmp_v[ss]();
});
}
Utmp = Uconj;
@@ -286,7 +286,7 @@ public:
autoView( Uds_v , Uds, CpuWrite);
autoView( Utmp_v, Utmp, CpuWrite);
thread_foreach(ss,Utmp_v,{
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
Uds_v[ss](1)(mu+Nd) = Utmp_v[ss]();
});
}
}
@@ -320,7 +320,7 @@ public:
}
Uconj = conjugate(*Upoke);
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + Nd);
}
}

View File

@@ -36,6 +36,8 @@ public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 16;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
};
template <class Impl>

View File

@@ -40,6 +40,8 @@ public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
const int npoint = 16;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
};
template<class Impl>

View File

@@ -36,6 +36,8 @@ public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 8;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
};
template <class Impl>

View File

@@ -141,9 +141,9 @@ public:
Udag = Udag *phases;
InsertGaugeField(Uds,U,mu);
InsertGaugeField(Uds,Udag,mu+4);
InsertGaugeField(Uds,Udag,mu+Nd);
// PokeIndex<LorentzIndex>(Uds, U, mu);
// PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
// PokeIndex<LorentzIndex>(Uds, Udag, mu + Nd);
// 3 hop based on thin links. Crazy huh ?
U = PeekIndex<LorentzIndex>(Uthin, mu);
@@ -156,7 +156,7 @@ public:
UUUdag = UUUdag *phases;
InsertGaugeField(UUUds,UUU,mu);
InsertGaugeField(UUUds,UUUdag,mu+4);
InsertGaugeField(UUUds,UUUdag,mu+Nd);
}
}

View File

@@ -0,0 +1,175 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
// Non-template base holding the static stencil description that
// TwoSpinWilsonFermion3plus1D<Impl> inherits. Only declarations live here;
// the static members and the Make* functions are defined outside this header.
class TwoSpinWilsonFermion3plus1DStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static const std::vector<int> directions;
static const std::vector<int> displacements;
// Number of stencil points.
// NOTE(review): presumably +/- one hop in each of three directions -- confirm against the definitions.
static constexpr int npoint = 6;
// NOTE(review): presumably the builders used to initialise directions/displacements -- confirm in the implementation file.
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
};
template<class Impl>
class TwoSpinWilsonFermion3plus1D : public TwoSpinWilsonKernels<Impl>, public TwoSpinWilsonFermion3plus1DStatic
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef TwoSpinWilsonKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
int Dirichlet;
Coordinate Block;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _ThreeDimGrid ;}
GridBase *GaugeRedBlackGrid(void) { return _ThreeDimRedBlackGrid ;}
GridBase *FermionGrid(void) { return _FourDimGrid;}
GridBase *FermionRedBlackGrid(void) { return _FourDimRedBlackGrid;}
// full checkerboard operations; leave unimplemented as abstract for now
virtual void M (const FermionField &in, FermionField &out){assert(0);};
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; declared here, bodies are not in this header
virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
// These can be overridden by fancy 5d chiral action
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
// void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
// Implement the non-hermitian hopping term; half cb or both
// Implement s-diagonal DW
void DW (const FermionField &in, FermionField &out,int dag);
void Dhop (const FermionField &in, FermionField &out,int dag);
void DhopOE(const FermionField &in, FermionField &out,int dag);
void DhopEO(const FermionField &in, FermionField &out,int dag);
void DhopComms (const FermionField &in, FermionField &out);
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
// add a DhopComm
// -- suboptimal interface will presently trigger multiple comms.
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
void DhopDirComms(const FermionField &in);
void DhopDirCalc(const FermionField &in, FermionField &out,int point);
///////////////////////////////////////////////////////////////
// New methods added
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(StencilImpl & st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalSerialComms(StencilImpl & st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
GridCartesian &ThreeDimGrid,
GridRedBlackCartesian &ThreeDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams());
// Dirichlet block hook. The body is empty, so the `block` argument is
// ignored and no member of this class is modified by this call.
virtual void DirichletBlock(const Coordinate & block)
{
}
// DoubleStore
void ImportGauge(const GaugeField &_Umu);
///////////////////////////////////////////////////////////////
// Data members required to support the functionality
///////////////////////////////////////////////////////////////
public:
// Add these to the support from Wilson
GridBase *_ThreeDimGrid;
GridBase *_ThreeDimRedBlackGrid;
GridBase *_FourDimGrid;
GridBase *_FourDimRedBlackGrid;
double M5;
int Ls;
//Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
};
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,222 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonImpl.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////
// Single flavour two-component spinors with colour index
/////////////////////////////////////////////////////////////////////////////
template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
class TwoSpinWilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public:
static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental;
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
INHERIT_GIMPL_TYPES(Gimpl);
//Necessary?
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
typedef typename Options::_Coeff_t Coeff_t;
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
typedef iImplSpinor<Simd> SiteSpinor;
typedef iImplPropagator<Simd> SitePropagator;
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
typedef iImplHalfCommSpinor<Simd> SiteHalfCommSpinor;
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
typedef Lattice<SiteSpinor> FermionField;
typedef Lattice<SitePropagator> PropagatorField;
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
typedef SimpleCompressor<SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams;
typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
typedef const typename StencilImpl::View_type StencilView;
ImplParams Params;
TwoSpinWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
};
// Multiply the spinor chi by the mu-th entry of the doubled gauge field U
// and store the result in phi: phi() = U(mu) * chi().
// U(mu) is fetched through coalescedRead before the multiply.
template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U,
const _Spinor &chi,
int mu)
{
auto UU = coalescedRead(U(mu));
mult(&phi(), &UU, &chi());
}
// Overload of multLink taking a stencil entry and stencil view.
// SE and St are not used here: the call forwards straight to the
// four-argument multLink(phi,U,chi,mu).
template<class _Spinor>
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U,
const _Spinor &chi,
int mu,
StencilEntry *SE,
StencilView &St)
{
multLink(phi,U,chi,mu);
}
// Field-level version of multLink: for every outer site sss of out's grid,
// compute multLink(tmp, Umu[sss], phi(sss), mu) and write tmp to out[sss].
// out is opened for AcceleratorWrite, phi and Umu for AcceleratorRead.
template<class _SpinorField>
inline void multLinkField(_SpinorField & out,
const DoubledGaugeField &Umu,
const _SpinorField & phi,
int mu)
{
const int Nsimd = SiteHalfSpinor::Nsimd();
autoView( out_v, out, AcceleratorWrite);
autoView( phi_v, phi, AcceleratorRead);
autoView( Umu_v, Umu, AcceleratorRead);
// Per-site working type, deduced from what coalescedRead returns for an out site.
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
calcSpinor tmp;
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
coalescedWrite(out_v[sss],tmp);
});
}
template <class ref>
static accelerator_inline void loadLinkElement(Simd &reg, ref &memory)
{
reg = memory;
}
// Fill the doubled gauge field Uds from Umu on GaugeGrid.
// For each mu < Nd:
//   - index mu      receives the link U_mu, multiplied by the twist phase (if
//                   any) and by the boundary phase on the last global slice;
//   - index mu + Nd receives adj(Cshift(U_mu, mu, -1)), multiplied by the
//                   conjugate boundary phase on the first global slice.
// Both Uds and Umu must be conformable with GaugeGrid.
inline void DoubleStore(GridBase *GaugeGrid,
DoubledGaugeField &Uds,
const GaugeField &Umu)
{
typedef typename Simd::scalar_type scalar_type;
conformable(Uds.Grid(), GaugeGrid);
conformable(Umu.Grid(), GaugeGrid);
GaugeLinkField U(GaugeGrid);
GaugeLinkField tmp(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid);
////////////////////////////////////////////////////
// apply any boundary phase or twists
////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
////////// boundary phase /////////////
auto pha = Params.boundary_phases[mu];
scalar_type phase( real(pha),imag(pha) );
int L = GaugeGrid->GlobalDimensions()[mu];
int Lmu = L - 1;
LatticeCoordinate(coor, mu);
U = PeekIndex<LorentzIndex>(Umu, mu);
// apply any twists
RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
if ( theta != 0.0) {
scalar_type twphase(::cos(theta),::sin(theta));
U = twphase*U;
// NOTE(review): the message prints the boundary `phase`, not `twphase` -- confirm this is intended.
std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
}
// Forward link: boundary phase only where the mu coordinate equals L-1.
tmp = where(coor == Lmu, phase * U, U);
PokeIndex<LorentzIndex>(Uds, tmp, mu);
// Backward link: built from the (twisted, not boundary-phased) U; the
// conjugate phase is applied where the mu coordinate equals 0.
U = adj(Cshift(U, mu, -1));
U = where(coor == 0, conjugate(phase) * U, U);
PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
}
}
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
GaugeLinkField link(mat.Grid());
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
PokeIndex<LorentzIndex>(mat,link,mu);
}
inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
mat = outerProduct(B,A);
}
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
mat = TraceIndex<SpinIndex>(P);
}
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
{
for (int mu = 0; mu < Nd; mu++)
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
}
// Accumulate into the mu-th link of `mat` the colour outer product of Btilde
// and Atilde, summed over the s-direction and over the spin components of the
// site spinor.
//   mat     gauge-field force, written at Lorentz index mu
//   Btilde  left fermion field (supplies the s extent)
//   Atilde  right fermion field
//   mu      Lorentz index of mat to write
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
{
  // Extent of dimension 0 of the fermion grid, used as the s-direction length.
  int Ls=Btilde.Grid()->_fdimensions[0];
  autoView( mat_v , mat, AcceleratorWrite);
  {
    const int Nsimd = SiteSpinor::Nsimd();
    autoView( Btilde_v , Btilde, AcceleratorRead);
    autoView( Atilde_v , Atilde, AcceleratorRead);
    accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
      int sU=sss;
      typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
      ColorMatrixType sum;
      zeroit(sum);
      for(int s=0;s<Ls;s++){
        // Fermion site index: s runs fastest within each gauge site sU.
        int sF = s+Ls*sU;
        // This Impl's SiteSpinor is iVector<..., Nhs>, so the spin sum must
        // stop at Nhs; looping to Ns would index past the end of the spinor.
        for(int spn=0;spn<Nhs;spn++){ //sum over spin
          auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
          auto aa = coalescedRead(Atilde_v[sF]()(spn) );
          auto op = outerProduct(bb,aa);
          sum = sum + op;
        }
      }
      coalescedWrite(mat_v[sU](mu)(), sum);
    });
  }
}
};
typedef TwoSpinWilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplR; // Real.. whichever prec
typedef TwoSpinWilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplF; // Float
typedef TwoSpinWilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD; // Double
typedef TwoSpinWilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD2; // Double
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,84 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonKernels.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper routines that implement Wilson stencil for a single site.
// Common to both the WilsonFermion and WilsonFermion5D
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Static hopping-term kernels for the two-spin Wilson operator, layered on
// FermionOperator<Impl>. This header only declares the entry points; their
// definitions are not visible here.
template<class Impl> class TwoSpinWilsonKernels : public FermionOperator<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
public:
// Dhop over Nsite sites with Ls s-slices; interior/exterior flags default to 1.
// NOTE(review): presumably these select the local and halo contributions -- confirm in the implementation.
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;
// Overload taking a list of ids instead of the interior/exterior flags.
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
uint64_t *ids);
// Daggered counterpart of the first DhopKernel overload (same parameters).
static void DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;
// Variant writing into a vector of output fields.
static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
// Single direction/displacement variant selected by dirdisp and gamma.
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
private:
// Per-site helpers working on views; only X, Y and Z (plus/minus) variants are declared.
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
public:
// Forwards the implementation parameters to the FermionOperator base.
TwoSpinWilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
};
NAMESPACE_END(Grid);

View File

@@ -484,6 +484,11 @@ public:
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
#ifdef NVLINK_GET
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
#endif
}
};

View File

@@ -38,6 +38,8 @@ public:
static int MortonOrder;
static const std::vector<int> directions;
static const std::vector<int> displacements;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
static const int npoint = 8;
};

View File

@@ -62,6 +62,8 @@ public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static constexpr int npoint = 8;
static std::vector<int> MakeDirections(void);
static std::vector<int> MakeDisplacements(void);
};
template<class Impl>
@@ -91,13 +93,13 @@ public:
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac

View File

@@ -166,7 +166,7 @@ public:
U = adj(Cshift(U, mu, -1));
U = where(coor == 0, conjugate(phase) * U, U);
PokeIndex<LorentzIndex>(Uds, U, mu + 4);
PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
}
}

View File

@@ -56,7 +56,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
Frbgrid,
Ugrid,
Urbgrid,
4.0,p)
Nd*1.0,p)
{
update(_mass,_mu);
@@ -83,7 +83,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
out.Checkerboard() = in.Checkerboard();
//axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
for (int s=0;s<(int)this->mass.size();s++) {
ComplexD a = 4.0+this->mass[s];
ComplexD a = Nd*1.0+this->mass[s];
ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(out,a,in,b,in,s,s);
}
@@ -92,7 +92,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
virtual void MooeeDag(const FermionField &in, FermionField &out) {
out.Checkerboard() = in.Checkerboard();
for (int s=0;s<(int)this->mass.size();s++) {
ComplexD a = 4.0+this->mass[s];
ComplexD a = Nd*1.0+this->mass[s];
ComplexD b(0.0,-this->mu[s]);
axpbg5y_ssp(out,a,in,b,in,s,s);
}
@@ -101,7 +101,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
for (int s=0;s<(int)this->mass.size();s++) {
RealD m = this->mass[s];
RealD tm = this->mu[s];
RealD mtil = 4.0+this->mass[s];
RealD mtil = Nd*1.0+this->mass[s];
RealD sq = mtil*mtil+tm*tm;
ComplexD a = mtil/sq;
ComplexD b(0.0, -tm /sq);
@@ -112,7 +112,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
for (int s=0;s<(int)this->mass.size();s++) {
RealD m = this->mass[s];
RealD tm = this->mu[s];
RealD mtil = 4.0+this->mass[s];
RealD mtil = Nd*1.0+this->mass[s];
RealD sq = mtil*mtil+tm*tm;
ComplexD a = mtil/sq;
ComplexD b(0.0,tm /sq);
@@ -126,7 +126,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
this->Dhop(in, out, DaggerNo);
FermionField tmp(out.Grid());
for (int s=0;s<(int)this->mass.size();s++) {
ComplexD a = 4.0+this->mass[s];
ComplexD a = Nd*1.0+this->mass[s];
ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(tmp,a,in,b,in,s,s);
}

View File

@@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
NAMESPACE_BEGIN(Grid);
// Constructor.
// Forwards the gauge field, the four grids, the mass and impl_p to WilsonBase, stores the
// clover coefficients csw_r, csw_t and cF, allocates the clover diagonal/triangle fields
// (plain, inverse, and even/odd variants) on the FourDim grids and the temporary and
// boundary-mask fields on the FiveDim grids, then imports the gauge field.
// fixedBoundaries is true when impl_p.boundary_phases[Nd-1] compares equal to 0.0.
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const ImplParams& impl_p)
: WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&FourDimGrid), Triangle(&FourDimGrid)
, DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid)
, DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid)
, DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid)
, DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
, DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid)
, Tmp(&FiveDimGrid)
, BoundaryMask(&FiveDimGrid)
, BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
{
// This implementation is only valid for this exact combination of compile-time sizes
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
// Both clover coefficients are stored pre-multiplied by 1/2 (MDeriv multiplies by 2.0 again)
csw_r *= 0.5;
csw_t *= 0.5;
//if (clover_anisotropy.isAnisotropic)
// csw_r /= clover_anisotropy.xi_0;
// Builds the clover fields; WilsonBase's constructor has already imported the gauge field once
ImportGauge(_Umu);
// The boundary masks are only built when fixed boundaries are in use
if (fixedBoundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
// Hopping term: delegates to WilsonBase::Dhop, then applies ApplyBoundaryMask to the
// result when fixedBoundaries is set.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
// Delegates to WilsonBase::DhopOE, then applies ApplyBoundaryMask to the result when
// fixedBoundaries is set.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
// Delegates to WilsonBase::DhopEO, then applies ApplyBoundaryMask to the result when
// fixedBoundaries is set.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
// Single-direction hop: delegates to WilsonBase::DhopDir with the given dir/disp, then
// applies ApplyBoundaryMask to the result when fixedBoundaries is set.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->fixedBoundaries) ApplyBoundaryMask(out);
}
// Delegates to WilsonBase::DhopDirAll, which fills the vector `out`; when fixedBoundaries
// is set, ApplyBoundaryMask is then applied to every element of `out`.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->fixedBoundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
// Full operator: combines the base-class hopping term with this class's Mooee term.
// The member field Tmp is used as scratch space, so it is overwritten on every call.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
// presumably out = 1.0*out + Tmp -- confirm against axpy's definition
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
// Daggered operator: combines the base-class hopping term (DaggerYes) with MooeeDag.
// The member field Tmp is used as scratch space, so it is overwritten on every call.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
// presumably out = 1.0*out + Tmp -- confirm against axpy's definition
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
// Delegates to WilsonBase::Meooe, then applies ApplyBoundaryMask to the result when
// fixedBoundaries is set.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
// Delegates to WilsonBase::MeooeDag, then applies ApplyBoundaryMask to the result when
// fixedBoundaries is set.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
// Applies the clover term to `in`, choosing the clover fields that match the layout of
// the input: the full-grid fields when the grid is not checkerboarded, otherwise the odd
// or even fields according to in.Checkerboard(). The boundary mask is applied afterwards
// when fixedBoundaries is set.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
  if(!in.Grid()->_isCheckerBoarded) {
    // full (non red-black) grid
    MooeeInternal(in, out, Diagonal, Triangle);
  } else if(in.Checkerboard() == Odd) {
    MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
  } else {
    MooeeInternal(in, out, DiagonalEven, TriangleEven);
  }

  if(fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
}
// Daggered clover term: forwards to Mooee unchanged.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
// Applies the inverse clover term to `in`, choosing the inverse clover fields that match
// the layout of the input: the full-grid fields when the grid is not checkerboarded,
// otherwise the odd or even fields according to in.Checkerboard(). The boundary mask is
// applied afterwards when fixedBoundaries is set.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
  if(!in.Grid()->_isCheckerBoarded) {
    // full (non red-black) grid
    MooeeInternal(in, out, DiagonalInv, TriangleInv);
  } else if(in.Checkerboard() == Odd) {
    MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
  } else {
    MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
  }

  if(fixedBoundaries) {
    ApplyBoundaryMask(out);
  }
}
// Daggered inverse clover term: forwards to MooeeInv unchanged.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
// Forwards to this class's DhopDir with the same arguments.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
// Forwards to this class's DhopDirAll with the same arguments.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
// Derivative of the full operator with respect to the gauge field.
//   force : output; overwritten with the hopping-term derivative plus the clover derivative
//   X, Y  : fermion fields entering the outer product; must live on the same grid as force
//   dag   : forwarded to DhopDeriv
// Not supported with fixed boundaries (asserts).
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
  assert(!fixedBoundaries); // TODO check for changes required for open bc

  // NOTE: code copied from original clover term
  conformable(X.Grid(), Y.Grid());
  conformable(X.Grid(), force.Grid());
  GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
  GaugeField clover_force(force.Grid());
  PropagatorField Lambda(force.Grid());

  // Guido: Here we are hitting some performance issues:
  // need to extract the components of the DoubledGaugeField
  // for each call
  // Possible solution
  // Create a vector object to store them? (cons: wasting space)
  std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());

  Impl::extractLinkField(U, this->Umu);

  force = Zero();
  // Derivative of the Wilson hopping term
  this->DhopDeriv(force, X, Y, dag);

  ///////////////////////////////////////////////////////////
  // Clover term derivative
  ///////////////////////////////////////////////////////////
  Impl::outerProductImpl(Lambda, X, Y);
  //std::cout << "Lambda:" << Lambda << std::endl;

  Gamma::Algebra sigma[] = {
      Gamma::Algebra::SigmaXY,
      Gamma::Algebra::SigmaXZ,
      Gamma::Algebra::SigmaXT,
      Gamma::Algebra::MinusSigmaXY,
      Gamma::Algebra::SigmaYZ,
      Gamma::Algebra::SigmaYT,
      Gamma::Algebra::MinusSigmaXZ,
      Gamma::Algebra::MinusSigmaYZ,
      Gamma::Algebra::SigmaZT,
      Gamma::Algebra::MinusSigmaXT,
      Gamma::Algebra::MinusSigmaYT,
      Gamma::Algebra::MinusSigmaZT};

  /*
    sigma_{\mu \nu}=
    | 0         sigma[0]  sigma[1]  sigma[2] |
    | sigma[3]    0       sigma[4]  sigma[5] |
    | sigma[6]  sigma[7]     0      sigma[8] |
    | sigma[9]  sigma[10] sigma[11]   0      |
  */

  // `count` walks the sigma table row by row, skipping the diagonal (mu == nu).
  // The table is laid out for exactly four directions (the constructor asserts Nd == 4).
  int count = 0;
  clover_force = Zero();
  for (int mu = 0; mu < 4; mu++)
  {
    force_mu = Zero();
    for (int nu = 0; nu < 4; nu++)
    {
      if (mu == nu)
        continue;

      // Planes containing the temporal direction use csw_t, all others csw_r.
      // The temporal index is Tdir; the previous test against 4 could never be
      // true for mu,nu in 0..3, so csw_t was silently ignored in the force.
      // The factor 2.0 undoes the 0.5 folded into csw_r/csw_t by the constructor.
      RealD factor;
      if (nu == Tdir || mu == Tdir)
      {
        factor = 2.0 * csw_t;
      }
      else
      {
        factor = 2.0 * csw_r;
      }
      PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
      Impl::TraceSpinImpl(lambda, Slambda);                   // traceSpin ok
      force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
      count++;
    }

    pokeLorentz(clover_force, U[mu] * force_mu, mu);
  }
  //clover_force *= csw;
  force += clover_force;
}
// Not implemented: aborts via assert(0) when called.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
// Not implemented: aborts via assert(0) when called.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
// Shared worker for Mooee/MooeeInv: applies the clover term given by the supplied
// diagonal and triangle fields to `in`, writing `out`.
// Requires in.Checkerboard() to be Odd or Even; `out` inherits the checkerboard of `in`.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
out.Checkerboard() = in.Checkerboard();
conformable(in, out);
// The kernel receives the site count of the clover field together with this->Ls
CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
}
// Imports the gauge field into the base class and rebuilds every clover field of this
// operator from it: field strengths -> clover term -> compact (diagonal/triangle) layout
// -> optional boundary modification -> inverse -> even/odd copies.
// Each stage is timed with usecond() and the timings are reported on GridLogDebug.
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
double t0 = usecond();
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
// Initialize temporary variables
double t1 = usecond();
conformable(_Umu.Grid(), this->GaugeGrid());
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
// (B* terms are weighted by csw_r, E* terms, which involve Tdir, by csw_t)
double t3 = usecond();
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Instantiate the clover term
// - In case of the standard clover the mass term is added
// - In case of the exponential clover the clover term is exponentiated
double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Modify the clover term at the temporal boundaries in case of open boundary conditions
double t6 = usecond();
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
// Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
// TODO: For now this inversion is explicitly done on the CPU
double t7 = usecond();
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
pickCheckerboard(Odd, TriangleOdd, Triangle);
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
// (usecond() differences are divided by 1e6 before printing)
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);

View File

@@ -240,7 +240,7 @@ void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::ve
this->ceo.resize(Ls);
for(int i=0; i<Ls; ++i){
this->bee[i] = 4.0 - this->M5 + 1.0;
this->bee[i] = Nd*1.0 - this->M5 + 1.0;
this->cee[i] = 1.0;
}

View File

@@ -0,0 +1,486 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion2plus1D.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/perfmon/PerfCount.h>
NAMESPACE_BEGIN(Grid);
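// (Nd+1)-dimensional fermion lattice: dimension 0 is the Ls direction, dimensions 1..Nd match the Nd-dimensional gauge grid.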
template<class Impl>
TwoSpinWilsonFermion3plus15D<Impl>::TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
GridCartesian &ThreeDimGrid,
GridRedBlackCartesian &ThreeDimRedBlackGrid,
RealD _M5,const ImplParams &p) :
Kernels(p),
_FourDimGrid (&FourDimGrid),
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
_ThreeDimGrid (&ThreeDimGrid),
_ThreeDimRedBlackGrid(&ThreeDimRedBlackGrid),
Stencil (_FourDimGrid,npoint,Even,directions,displacements,p),
StencilEven(_FourDimRedBlackGrid,npoint,Even,directions,displacements,p), // source is Even
StencilOdd (_FourDimRedBlackGrid,npoint,Odd ,directions,displacements,p), // source is Odd
M5(_M5),
Umu(_ThreeDimGrid),
UmuEven(_ThreeDimRedBlackGrid),
UmuOdd (_ThreeDimRedBlackGrid),
_tmp(&FourDimRedBlackGrid),
Dirichlet(0)
{
// some assertions
assert(FourDimGrid._ndimension==Nd+1);
assert(ThreeDimGrid._ndimension==Nd);
assert(ThreeDimRedBlackGrid._ndimension==Nd);
assert(FourDimRedBlackGrid._ndimension==Nd+1);
assert(FourDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
// extent of fifth dim and not spread out
Ls=FourDimGrid._fdimensions[0];
assert(FourDimRedBlackGrid._fdimensions[0]==Ls);
assert(FourDimGrid._processors[0] ==1);
assert(FourDimRedBlackGrid._processors[0] ==1);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<Nd;d++){
assert(FourDimGrid._processors[d+1] ==ThreeDimGrid._processors[d]);
assert(FourDimRedBlackGrid._processors[d+1] ==ThreeDimGrid._processors[d]);
assert(ThreeDimRedBlackGrid._processors[d] ==ThreeDimGrid._processors[d]);
assert(FourDimGrid._fdimensions[d+1] ==ThreeDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._fdimensions[d+1]==ThreeDimGrid._fdimensions[d]);
assert(ThreeDimRedBlackGrid._fdimensions[d] ==ThreeDimGrid._fdimensions[d]);
assert(FourDimGrid._simd_layout[d+1] ==ThreeDimGrid._simd_layout[d]);
assert(FourDimRedBlackGrid._simd_layout[d+1]==ThreeDimGrid._simd_layout[d]);
assert(ThreeDimRedBlackGrid._simd_layout[d] ==ThreeDimGrid._simd_layout[d]);
}
if ( p.dirichlet.size() == Nd+1) {
Coordinate block = p.dirichlet;
for(int d=0;d<Nd+1;d++) {
if ( block[d] ){
Dirichlet = 1;
std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
Block = block;
}
}
} else {
Coordinate block(Nd+1,0);
Block = block;
}
// Dimension zero of the five-d is the Ls direction
assert(FourDimRedBlackGrid._simd_layout[0]==1);
assert(FourDimGrid._simd_layout[0] ==1);
// Allocate the required comms buffer
ImportGauge(_Umu);
// Build lists of exterior only nodes
int LLs = FourDimGrid._rdimensions[0];
int vol3;
vol3=ThreeDimGrid.oSites();
Stencil.BuildSurfaceList(LLs,vol3);
vol3=ThreeDimRedBlackGrid.oSites();
StencilEven.BuildSurfaceList(LLs,vol3);
StencilOdd.BuildSurfaceList(LLs,vol3);
}
// Stores the gauge field scaled by -0.5 in doubled form (Umu) and extracts its even and
// odd checkerboards into UmuEven / UmuOdd.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::ImportGauge(const GaugeField &_Umu)
{
GaugeField HUmu(_Umu.Grid());
// The -0.5 factor is folded into the stored links here
HUmu = _Umu*(-0.5);
Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
pickCheckerboard(Even,UmuEven,Umu);
pickCheckerboard(Odd ,UmuOdd,Umu);
}
// Single-direction hop on the full grid.
//   dir5 : direction index on the fermion grid (which has the Ls dimension at index 0)
//   disp : displacement; 1 selects the first Nd stencil legs, any other value the second Nd
// Performs a halo exchange of `in` and then calls the single-direction kernel.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
{
int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
// we drop off the innermost fifth dimension
// assert( (disp==1)||(disp==-1) );
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
int skip = (disp==1) ? 0 : 1;
// dirdisp indexes the stencil leg; gamma indexes the opposite half of the 2*Nd range
int dirdisp = dir+skip*Nd;
int gamma = dir+(1-skip)*Nd;
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,compressor);
uint64_t Nsite = Umu.Grid()->oSites();
Kernels::DhopDirKernel(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out,dirdisp,gamma);
};
// All single-direction hops at once: one halo exchange of `in`, then Kernels::DhopDirAll
// writes the results into the vector `out`.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDirAll(const FermionField &in, std::vector<FermionField> &out)
{
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,compressor);
uint64_t Nsite = Umu.Grid()->oSites();
Kernels::DhopDirAll(Stencil,Umu,Stencil.CommBuf(),Ls,Nsite,in,out);
};
// Shared worker for the DhopDeriv* entry points.
//   st  : stencil to use; its grid must match the grids of A and B
//   U   : doubled gauge field matching the stencil
//   mat : output force, filled one direction at a time via Impl::InsertForce5D
//   A,B : fermion fields; B is halo-exchanged and hopped, A is used unchanged
//   dag : must be DaggerNo or DaggerYes
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
assert((dag==DaggerNo) ||(dag==DaggerYes));
conformable(st.Grid(),A.Grid());
conformable(st.Grid(),B.Grid());
Compressor compressor(dag);
FermionField Btilde(B.Grid());
FermionField Atilde(B.Grid());
st.HaloExchange(B,compressor);
Atilde=A;
// NOTE(review): LLs is computed but never used below; the kernel call passes Ls instead -- confirm intent
int LLs = B.Grid()->_rdimensions[0];
for (int mu = 0; mu < Nd; mu++) {
////////////////////////////////////////////////////////////////////////
// Flip gamma if dag
////////////////////////////////////////////////////////////////////////
int gamma = mu;
if (!dag) gamma += Nd;
////////////////////////
// Call the single hop
////////////////////////
int Usites = U.Grid()->oSites();
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
////////////////////////////
// spin trace outer product
////////////////////////////
Impl::InsertForce5D(mat, Btilde, Atilde, mu);
}
}
// Hopping-term derivative on the full grid: checks that A lives on FermionGrid() and
// matches B, copies A's checkerboard onto mat, and delegates to DerivInternal with the
// full stencil and gauge field.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDeriv(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A.Grid(),FermionGrid());
conformable(A.Grid(),B.Grid());
//conformable(GaugeGrid(),mat.Grid());// this is not general! leaving as a comment
mat.Checkerboard() = A.Checkerboard();
// mat.checkerboard = A.checkerboard;
DerivInternal(Stencil,Umu,mat,A,B,dag);
}
// Hopping-term derivative on the red-black grid for B Odd and A Even (both asserted).
// mat is marked Even; the work is done by DerivInternal with StencilOdd and UmuEven.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivEO(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A.Grid(),FermionRedBlackGrid());
conformable(A.Grid(),B.Grid());
assert(B.Checkerboard()==Odd);
assert(A.Checkerboard()==Even);
mat.Checkerboard() = Even;
DerivInternal(StencilOdd,UmuEven,mat,A,B,dag);
}
// Hopping-term derivative on the red-black grid for B Even and A Odd (both asserted).
// mat is marked Odd; the work is done by DerivInternal with StencilEven and UmuOdd.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
conformable(A.Grid(),FermionRedBlackGrid());
conformable(A.Grid(),B.Grid());
assert(B.Checkerboard()==Even);
assert(A.Checkerboard()==Odd);
mat.Checkerboard() = Odd;
DerivInternal(StencilEven,UmuOdd,mat,A,B,dag);
}
// Common entry point for the hopping term; currently always uses the serial-comms path
// (DhopInternalOverlappedComms is not called from here).
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternal(StencilImpl & st,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
DhopInternalSerialComms(st,U,in,out,dag);
}
// Hopping term with communication overlapped with computation.
// Sequence: gather halo -> start comms -> interior kernel -> complete comms -> merge ->
// exterior kernel.
//   st  : stencil matching the grid of `in`
//   U   : doubled gauge field matching the stencil
//   dag : DaggerYes selects the daggered kernel, anything else the plain kernel
//
// Fix: the exterior kernel calls passed an identifier `Opt` that is not declared anywhere in
// this function; they now use the same argument list as the interior calls. The unused
// local `len` was removed.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
                                                                     DoubledGaugeField & U,
                                                                     const FermionField &in, FermionField &out,int dag)
{
  GRID_TRACE("DhopInternalOverlappedComms");
  Compressor compressor(dag);

  int LLs = in.Grid()->_rdimensions[0];

  /////////////////////////////
  // Start comms  // Gather intranode and extra node differentiated??
  /////////////////////////////
  {
    //    std::cout << " TwoSpinWilsonFermion3plus1D gather " <<std::endl;
    GRID_TRACE("Gather");
    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
  }

  //  std::cout << " TwoSpinWilsonFermion3plus1D Communicate Begin " <<std::endl;
  std::vector<std::vector<CommsRequest_t> > requests;

#if 1
  /////////////////////////////
  // Overlap with comms
  /////////////////////////////
  st.CommunicateBegin(requests);
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif

  /////////////////////////////
  // do the compute interior
  // (trailing 1,0 -- presumably interior=1, exterior=0; confirm against the kernel signature)
  /////////////////////////////
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagInterior");
    Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  } else {
    GRID_TRACE("DhopInterior");
    Kernels::DhopKernel   (st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
  }

  //ifdef GRID_ACCELERATED
#if 0
  /////////////////////////////
  // Overlap with comms -- on GPU the interior kernel call is nonblocking
  /////////////////////////////
  st.CommunicateBegin(requests);
  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif

  /////////////////////////////
  // Complete comms
  /////////////////////////////
  //  std::cout << " TwoSpinWilsonFermion3plus1D Comms Complete " <<std::endl;
  st.CommunicateComplete(requests);
  //  traceStop(id);

  /////////////////////////////
  // do the compute exterior
  // (trailing 0,1 -- presumably interior=0, exterior=1; confirm against the kernel signature)
  /////////////////////////////
  {
    //    std::cout << " TwoSpinWilsonFermion3plus1D Comms Merge " <<std::endl;
    GRID_TRACE("Merge");
    st.CommsMerge(compressor);
  }

  //  std::cout << " TwoSpinWilsonFermion3plus1D Exterior " <<std::endl;
  if (dag == DaggerYes) {
    GRID_TRACE("DhopDagExterior");
    Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  } else {
    GRID_TRACE("DhopExterior");
    Kernels::DhopKernel   (st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
  }
  //  std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
}
// Hopping term with serial communication: the halo exchange of `in` completes first, then
// a single kernel call processes all sites.
//   dag : DaggerYes selects the daggered kernel, anything else the plain kernel
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopInternalSerialComms(StencilImpl & st,
DoubledGaugeField & U,
const FermionField &in,
FermionField &out,int dag)
{
GRID_TRACE("DhopInternalSerialComms");
Compressor compressor(dag);
// Extent of dimension 0 (the Ls direction) taken from the input field's grid
int LLs = in.Grid()->_rdimensions[0];
// std::cout << " TwoSpinWilsonFermion3plus1D Halo exch " <<std::endl;
{
GRID_TRACE("HaloExchange");
st.HaloExchangeOpt(in,compressor);
}
// std::cout << " TwoSpinWilsonFermion3plus1D Dhop " <<std::endl;
if (dag == DaggerYes) {
GRID_TRACE("DhopDag");
Kernels::DhopDagKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
} else {
GRID_TRACE("Dhop");
Kernels::DhopKernel(st,U,st.CommBuf(),LLs,U.oSites(),in,out);
}
// std::cout << " TwoSpinWilsonFermion3plus1D Done " <<std::endl;
}
// Hopping term from an Even source (asserted) to an Odd result, on the red-black grid.
// Uses StencilEven together with UmuOdd.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
conformable(in.Grid(),out.Grid()); // drops the cb check
assert(in.Checkerboard()==Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven,UmuOdd,in,out,dag);
}
// Hopping term from an Odd source (asserted) to an Even result, on the red-black grid.
// Uses StencilOdd together with UmuEven.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
conformable(in.Grid(),FermionRedBlackGrid()); // verifies half grid
conformable(in.Grid(),out.Grid()); // drops the cb check
assert(in.Checkerboard()==Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd,UmuEven,in,out,dag);
}
// Communication-only part of the full-grid hopping term: performs the halo exchange of
// `in` on the full stencil and sets out's checkerboard, but runs no kernel on `out`.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopComms(const FermionField &in, FermionField &out)
{
// The compressor is always built with dag = 0 here
int dag =0 ;
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
Compressor compressor(dag);
Stencil.HaloExchangeOpt(in,compressor);
}
// Compute-only part of the full-grid hopping term: calls the kernel on the full stencil
// without performing a halo exchange first. `ids` is forwarded to the kernel unchanged.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
{
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
int LLs = in.Grid()->_rdimensions[0];
Kernels::DhopKernel(Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
}
// Hopping term on the full grid: checks the grids, copies the checkerboard of `in` onto
// `out`, and delegates to DhopInternal with the full stencil and gauge field.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil,Umu,in,out,dag);
}
// Hopping term plus a diagonal term with coefficient (Nd - M5).
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
{
out.Checkerboard()=in.Checkerboard();
Dhop(in,out,dag); // -0.5 is included
// presumably out = (Nd - M5)*in + out -- confirm against axpy's definition
axpy(out,Nd*1.0-M5,in,out);
}
// Off-diagonal (even<->odd) part of the operator, undaggered.
// An Odd source is routed through DhopEO; any other source through DhopOE.
template <class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
  const bool sourceIsOdd = (in.Checkerboard() == Odd);
  if (!sourceIsOdd) {
    DhopOE(in, out, DaggerNo);
    return;
  }
  DhopEO(in, out, DaggerNo);
}
// Off-diagonal (even<->odd) part of the operator, daggered.
// An Odd source is routed through DhopEO; any other source through DhopOE.
template <class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
  const bool sourceIsOdd = (in.Checkerboard() == Odd);
  if (!sourceIsOdd) {
    DhopOE(in, out, DaggerYes);
    return;
  }
  DhopEO(in, out, DaggerYes);
}
// Diagonal part of the operator: out = (Nd + M5) * in, with out on the same checkerboard as in.
template <class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
// Build the coefficient in the field's own scalar type before multiplying
typename FermionField::scalar_type scal(Nd*1.0 + M5);
out = scal * in;
}
// Daggered diagonal part: forwards to Mooee unchanged.
template <class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
// Inverse of the diagonal part: out = in / (Nd + M5), with out on the same checkerboard as in.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
out = (1.0/(Nd*1.0 + M5))*in;
}
// Daggered inverse of the diagonal part: forwards to MooeeInv unchanged.
template<class Impl>
void TwoSpinWilsonFermion3plus1D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in,out);
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,441 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonKernels.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/FermionCore.h>
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
// The leg macros below expand inside a kernel body and use the caller's locals
// by name: st, in, buf, U, sF, sU, SE, ptype, lane, chi, Uchi, result (and nmu
// for the _EXT variant, dir/gamma for the DHOPDIR variants).

// One stencil leg, local or remote.
//  - looks up the stencil entry for direction Dir at site sF
//  - local neighbour: read (with optional permute) from `in` and apply spProj
//  - otherwise: read chi straight from the comms buffer `buf`, spProj is not applied
//  - multiply by the link with Impl::multLink and fold into `result` with Recon
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (SE->_is_local) { \
int perm= SE->_permute; \
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
spProj(chi,tmp); \
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
// Interior-only leg: contributes to `result` only when the neighbour is local;
// non-local legs are skipped entirely.
#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (SE->_is_local) { \
int perm= SE->_permute; \
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
spProj(chi,tmp); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \
} \
acceleratorSynchronise();
// Exterior-only leg: contributes only when the neighbour is NOT local, reading
// it from `buf` into a block-scoped chi, and increments nmu for each such leg.
// spProj is accepted for symmetry with the other macros but is not used.
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (!SE->_is_local ) { \
auto chi = coalescedRead(buf[SE->_offset],lane); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi); \
nmu++; \
} \
acceleratorSynchronise();
// Body of a single-direction hop. Unlike GENERIC_STENCIL_LEG it does not fetch
// SE itself (the caller must have done so) and passes the caller's lowercase
// `dir` variable to multLink; the Dir macro argument is not used in the body.
#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
if (SE->_is_local ) { \
int perm= SE->_permute; \
auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
spProj(chi,tmp); \
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
Recon(result, Uchi);
// Runs GENERIC_DHOPDIR_LEG_BODY only when the caller's `gamma` equals Dir.
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
if (gamma == Dir) { \
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \
}
////////////////////////////////////////////////////////////////////
// All legs kernels ; comms then compute
////////////////////////////////////////////////////////////////////
// Daggered site kernel covering all six legs (local and remote) of site sF.
// Each stencil direction is paired with the projector of the same name
// (Xp with pauliProjXp, ..., Zm with pauliProjZm).
// NOTE(review): DhopDagKernel launches GenericDhopSiteDag, whereas this
// definition is named DhopSiteDag - confirm which name the class declares.
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::DhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
// chi, Uchi, result, SE, ptype and lane are referenced by name inside the leg macros.
calcSpinor chi;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// `result` is not initialised here: the first leg is combined with pauliAssign,
// the remaining five with pauliAdd, so the leg order matters.
GENERIC_STENCIL_LEG(Xp,pauliProjXp,pauliAssign);
GENERIC_STENCIL_LEG(Yp,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG(Zp,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG(Xm,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG(Ym,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG(Zm,pauliProjZm,pauliAdd);
coalescedWrite(out[sF],result,lane);
};
// Non-daggered site kernel covering all six legs (local and remote) of site sF.
// Compared with the daggered kernel the pairing is swapped: stencil direction
// Xm uses pauliProjXp, Xp uses pauliProjXm, and likewise for Y and Z.
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
// chi, Uchi, result, SE, ptype and lane are referenced by name inside the leg macros.
calcSpinor chi;
// calcSpinor *chi_p;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// `result` is not initialised here: the first leg is combined with pauliAssign,
// the remaining five with pauliAdd, so the leg order matters.
GENERIC_STENCIL_LEG(Xm,pauliProjXp,pauliAssign);
GENERIC_STENCIL_LEG(Ym,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG(Zm,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG(Xp,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG(Yp,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG(Zp,pauliProjZm,pauliAdd);
coalescedWrite(out[sF], result,lane);
};
////////////////////////////////////////////////////////////////////
// Interior kernels
////////////////////////////////////////////////////////////////////
// Daggered interior kernel: sums only the legs whose neighbour is local and
// overwrites out[sF] with that partial sum (zero if no leg is local).
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
// chi, Uchi, result, SE, ptype and lane are referenced by name inside the leg macros.
calcSpinor chi;
// calcSpinor *chi_p;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// Any leg may be skipped, so start from zero and use pauliAdd throughout.
result=Zero();
GENERIC_STENCIL_LEG_INT(Xp,pauliProjXp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Yp,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Zp,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Xm,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG_INT(Ym,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG_INT(Zm,pauliProjZm,pauliAdd);
coalescedWrite(out[sF], result,lane);
};
// Non-daggered interior kernel: sums only the legs whose neighbour is local and
// overwrites out[sF] with that partial sum (zero if no leg is local).
// Stencil direction / projector pairing is the swapped one (Xm with pauliProjXp, ...).
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// chi, Uchi, result, SE, ptype and lane are referenced by name inside the leg macros.
calcSpinor chi;
// calcSpinor *chi_p;
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
// Any leg may be skipped, so start from zero and use pauliAdd throughout.
result=Zero();
GENERIC_STENCIL_LEG_INT(Xm,pauliProjXp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Ym,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Zm,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG_INT(Xp,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG_INT(Yp,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG_INT(Zp,pauliProjZm,pauliAdd);
coalescedWrite(out[sF], result,lane);
};
////////////////////////////////////////////////////////////////////
// Exterior kernels
////////////////////////////////////////////////////////////////////
// Daggered exterior kernel: sums only the legs whose neighbour is NOT local
// (read from `buf`) and adds that sum onto the value already stored in out[sF].
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
// calcSpinor *chi_p;
// No chi local here: GENERIC_STENCIL_LEG_EXT declares its own inside the leg.
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
// Number of non-local legs that contributed; incremented by the leg macro.
int nmu=0;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZm,pauliAdd);
// Touch out[sF] only if at least one remote leg contributed.
if ( nmu ) {
auto out_t = coalescedRead(out[sF],lane);
out_t = out_t + result;
coalescedWrite(out[sF],out_t,lane);
}
};
// Non-daggered exterior kernel: sums only the legs whose neighbour is NOT local
// (read from `buf`) and adds that sum onto the value already stored in out[sF].
// Stencil direction / projector pairing is the swapped one (Xm with pauliProjXp, ...).
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
// calcSpinor *chi_p;
// No chi local here: GENERIC_STENCIL_LEG_EXT declares its own inside the leg.
calcSpinor Uchi;
calcSpinor result;
StencilEntry *SE;
int ptype;
// Number of non-local legs that contributed; incremented by the leg macro.
int nmu=0;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
result=Zero();
GENERIC_STENCIL_LEG_EXT(Xm,pauliProjXp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Ym,pauliProjYp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Zm,pauliProjZp,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Xp,pauliProjXm,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Yp,pauliProjYm,pauliAdd);
GENERIC_STENCIL_LEG_EXT(Zp,pauliProjZm,pauliAdd);
// Touch out[sF] only if at least one remote leg contributed.
if ( nmu ) {
auto out_t = coalescedRead(out[sF],lane);
out_t = out_t + result;
coalescedWrite(out[sF],out_t,lane);
}
};
// Generates the member TwoSpinWilsonKernels<Impl>::DhopDir<Dir>: a single-leg
// hop at site sF. The stencil entry is looked up with the runtime argument
// `dir`, the projector is fixed by the macro argument spProj, and the result of
// that one leg is written to out[sF]. The six instantiations below all use
// pauliAssign, so `result` is fully defined by the single leg.
#define DhopDirMacro(Dir,spProj,spRecon) \
template <class Impl> accelerator_inline \
void TwoSpinWilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF, \
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
{ \
typedef decltype(coalescedRead(in[0])) calcSpinor; \
calcSpinor chi; \
calcSpinor result; \
calcSpinor Uchi; \
StencilEntry *SE; \
int ptype; \
const int Nsimd = SiteSpinor::Nsimd(); \
const int lane=acceleratorSIMTlane(Nsimd); \
\
SE = st.GetEntry(ptype, dir, sF); \
GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
coalescedWrite(out[sF], result,lane); \
}
// One kernel per leg: DhopDirXp, DhopDirYp, DhopDirZp, DhopDirXm, DhopDirYm, DhopDirZm.
DhopDirMacro(Xp,pauliProjXp,pauliAssign);
DhopDirMacro(Yp,pauliProjYp,pauliAssign);
DhopDirMacro(Zp,pauliProjZp,pauliAssign);
DhopDirMacro(Xm,pauliProjXm,pauliAssign);
DhopDirMacro(Ym,pauliProjYm,pauliAssign);
DhopDirMacro(Zm,pauliProjZm,pauliAssign);
// Single-leg hop with the projector chosen at run time.
// `dir` selects the stencil entry; `gamma` selects which of the six projectors
// is applied (each GENERIC_DHOPDIR_LEG fires only when gamma equals its Dir).
// NOTE(review): if gamma matches none of the six values, `result` is written to
// out[sF] without ever having been assigned - confirm callers guarantee a match.
template <class Impl> accelerator_inline
void TwoSpinWilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
{
typedef decltype(coalescedRead(in[0])) calcSpinor;
// chi, result, Uchi, SE, ptype and lane are referenced by name inside the leg macros.
calcSpinor chi;
calcSpinor result;
calcSpinor Uchi;
StencilEntry *SE;
int ptype;
const int Nsimd = SiteSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// Fetched once here; the leg macro body reuses SE rather than looking it up again.
SE = st.GetEntry(ptype, dir, sF);
GENERIC_DHOPDIR_LEG(Xp,pauliProjXp,pauliAssign);
GENERIC_DHOPDIR_LEG(Yp,pauliProjYp,pauliAssign);
GENERIC_DHOPDIR_LEG(Zp,pauliProjZp,pauliAssign);
GENERIC_DHOPDIR_LEG(Xm,pauliProjXm,pauliAssign);
GENERIC_DHOPDIR_LEG(Ym,pauliProjYm,pauliAssign);
GENERIC_DHOPDIR_LEG(Zm,pauliProjZm,pauliAssign);
coalescedWrite(out[sF], result,lane);
}
// Computes all six single-direction hops in one accelerator launch.
//
//   st, U  : stencil and doubled gauge field
//   buf    : unused here; the comms buffer is taken from st.CommBuf()
//   Ls     : extent of the innermost (s) index; sU = sF / Ls
//   Nsite  : number of gauge sites; the launch covers Nsite*Ls fermion sites
//   in     : source field
//   out    : one result field per leg, at least six entries
//
// Leg k (k = 0..5, ordered Xm,Ym,Zm,Xp,Yp,Zp) is looked up in the stencil with
// direction index k and its result is stored in out[k].
template <class Impl>
void TwoSpinWilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
                                             int Nsite, const FermionField &in, std::vector<FermionField> &out)
{
  // Six legs need six output slots.
  assert(out.size()>=6);

  autoView(U_v ,U,AcceleratorRead);
  autoView(in_v ,in,AcceleratorRead);
  autoView(st_v ,st,AcceleratorRead);

  // The output slot index must equal the direction index handed to the
  // per-leg kernel below (0..5). Slots 4,5,6 were used for the forward legs
  // before, which left out[3] unwritten and addressed out[6].
  autoView(out_Xm,out[0],AcceleratorWrite);
  autoView(out_Ym,out[1],AcceleratorWrite);
  autoView(out_Zm,out[2],AcceleratorWrite);
  autoView(out_Xp,out[3],AcceleratorWrite);
  autoView(out_Yp,out[4],AcceleratorWrite);
  autoView(out_Zp,out[5],AcceleratorWrite);

  auto CBp=st.CommBuf();
  accelerator_for(sss,Nsite*Ls,Simd::Nsimd(),{
    int sU=sss/Ls;
    int sF =sss;
    DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
    DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
    DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
    DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,3);
    DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,4);
    DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,5);
  });
}
// Applies one single-direction hop to the whole field.
//   dirdisp : stencil direction index, asserted to lie in [0,5]
//   gamma   : selects which DhopDir<Leg> kernel (and hence projector) is used
//   buf     : unused here; the comms buffer is taken from st.CommBuf()
// Any gamma outside the six handled cases trips assert(0).
template <class Impl>
void TwoSpinWilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
{
assert(dirdisp<=5);
assert(dirdisp>=0);
autoView(U_v ,U ,AcceleratorRead);
autoView(in_v ,in ,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v ,st ,AcceleratorRead);
auto CBp=st.CommBuf();
// One switch case per leg: launches over the Nsite gauge sites and loops over
// the Ls values of s inside each work item (sF = s + Ls*sU).
#define LoopBody(Dir) \
case Dir : \
accelerator_for(ss,Nsite,Simd::Nsimd(),{ \
for(int s=0;s<Ls;s++){ \
int sU=ss; \
int sF = s+Ls*sU; \
DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
} \
}); \
break;
switch(gamma){
LoopBody(Xp);
LoopBody(Yp);
LoopBody(Zp);
LoopBody(Xm);
LoopBody(Ym);
LoopBody(Zm);
default:
assert(0);
break;
}
#undef LoopBody
}
// Launch helpers for DhopKernel / DhopDagKernel. They use the enclosing
// function's st, buf, Ls, Nsite and the views st_v, U_v, in_v, out_v by name.

// Launch site kernel A over all Nsite*Ls fermion sites via accelerator_forNB;
// no barrier is issued here.
// NOTE(review): Nsite*Ls is multiplied as int before being stored in a uint64_t.
#define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
});
// Same launch followed by accelerator_barrier().
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
// Launch site kernel A only over the sites listed in st.surface_list, then
// issue accelerator_barrier().
#define KERNEL_CALL_EXT(A) \
const uint64_t sz = st.surface_list.size(); \
auto ptr = &st.surface_list[0]; \
accelerator_forNB( ss, sz, Simd::Nsimd(), { \
int sF = ptr[ss]; \
int sU = sF/Ls; \
TwoSpinWilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
}); \
accelerator_barrier();
// Non-daggered hopping term over the whole field.
// `interior` / `exterior` choose which legs are evaluated:
//   both set      : every leg of every site, followed by a barrier
//   interior only : local legs of every site, launched without a barrier
//   exterior only : remote legs of the surface sites, followed by a barrier
// Calling with neither flag set is a programming error.
template <class Impl>
void TwoSpinWilsonKernels<Impl>::DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
                                            int Ls, int Nsite, const FermionField &in, FermionField &out,
                                            int interior,int exterior)
{
  autoView(U_v , U,AcceleratorRead);
  autoView(in_v , in,AcceleratorRead);
  autoView(out_v,out,AcceleratorWrite);
  autoView(st_v , st,AcceleratorRead);

  // Full operator in one pass.
  if( interior && exterior ) {
    acceleratorFenceComputeStream();
    KERNEL_CALL(GenericDhopSite);
    return;
  }

  // Local legs only.
  if( interior ) {
    KERNEL_CALLNB(GenericDhopSiteInt);
    return;
  }

  // Remote legs only; dependent on result of merge.
  if( exterior ) {
    acceleratorFenceComputeStream();
    KERNEL_CALL_EXT(GenericDhopSiteExt);
    return;
  }

  assert(0 && " Kernel optimisation case not covered ");
}
// Daggered hopping term over the whole field.
// `interior` / `exterior` choose which legs are evaluated:
//   both set      : every leg of every site, followed by a barrier
//   interior only : local legs of every site, launched without a barrier
//   exterior only : remote legs of the surface sites, followed by a barrier
// Calling with neither flag set is a programming error.
template <class Impl>
void TwoSpinWilsonKernels<Impl>::DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
                                               int Ls, int Nsite, const FermionField &in, FermionField &out,
                                               int interior,int exterior)
{
  autoView(U_v ,U,AcceleratorRead);
  autoView(in_v ,in,AcceleratorRead);
  autoView(out_v,out,AcceleratorWrite);
  autoView(st_v ,st,AcceleratorRead);

  // Full operator in one pass.
  if( interior && exterior ) {
    acceleratorFenceComputeStream();
    KERNEL_CALL(GenericDhopSiteDag);
    return;
  }

  // Local legs only.
  if( interior ) {
    KERNEL_CALLNB(GenericDhopSiteDagInt);
    return;
  }

  // Remote legs only; dependent on result of merge.
  if( exterior ) {
    acceleratorFenceComputeStream();
    KERNEL_CALL_EXT(GenericDhopSiteDagExt);
    return;
  }

  assert(0 && " Kernel optimisation case not covered ");
}
// Retire all three launch helpers so none of them leaks out of this header;
// KERNEL_CALL_EXT was previously left defined.
#undef KERNEL_CALLNB
#undef KERNEL_CALL
#undef KERNEL_CALL_EXT
NAMESPACE_END(Grid);

View File

@@ -61,7 +61,7 @@ WilsonCloverFermion<Impl, CloverHelpers>::WilsonCloverFermion(GaugeField&
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
} else {
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
diag_mass = Nd*1.0 + _mass;
}
csw_t = _csw_t * 0.5;
@@ -297,9 +297,9 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
if (nu == (Nd-1) || mu == (Nd-1)) // This was a bug - surely mu/nu is NEVER 4 but rather (Nd-1)=3 ??
{
factor = 2.0 * csw_t;
}
@@ -307,9 +307,11 @@ void WilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField &force, const F
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
if ( mu < Nd && nu < Nd ) { // Allow to restrict range to Nd=3, but preserve orders of SigmaMuNu in table by counting ALL
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
}
count++;
}

View File

@@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -62,10 +63,10 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
Dirichlet(0)
{
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimGrid._ndimension==Nd+1);
assert(FourDimGrid._ndimension==Nd);
assert(FourDimRedBlackGrid._ndimension==Nd);
assert(FiveDimRedBlackGrid._ndimension==Nd+1);
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
// extent of fifth dim and not spread out
@@ -75,7 +76,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
assert(FiveDimRedBlackGrid._processors[0] ==1);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
for(int d=0;d<Nd;d++){
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
@@ -92,11 +93,13 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
if ( p.dirichlet.size() == Nd+1) {
Coordinate block = p.dirichlet;
if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
Dirichlet = 1;
std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
Block = block;
for(int d=0;d<Nd+1;d++) {
if ( block[d] ){
Dirichlet = 1;
std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
Block = block;
}
}
} else {
Coordinate block(Nd+1,0);
@@ -111,7 +114,7 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
assert(FiveDimGrid._simd_layout[0] ==nsimd);
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
for(int d=0;d<4;d++){
for(int d=0;d<Nd;d++){
assert(FourDimGrid._simd_layout[d]==1);
assert(FourDimRedBlackGrid._simd_layout[d]==1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
@@ -182,8 +185,8 @@ void WilsonFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,in
// assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t;
int skip = (disp==1) ? 0 : 1;
int dirdisp = dir+skip*4;
int gamma = dir+(1-skip)*4;
int dirdisp = dir+skip*Nd;
int gamma = dir+(1-skip)*Nd;
Compressor compressor(DaggerNo);
Stencil.HaloExchange(in,compressor);
@@ -482,7 +485,55 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
{
out.Checkerboard()=in.Checkerboard();
Dhop(in,out,dag); // -0.5 is included
axpy(out,4.0-M5,in,out);
axpy(out,Nd*1.0-M5,in,out);
}
// Off-diagonal (parity changing) part, no dagger.
// The checkerboard of the source selects which half-grid hop is called.
template <class Impl>
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
  const bool src_is_odd = (in.Checkerboard() == Odd);
  if (!src_is_odd) {
    DhopOE(in, out, DaggerNo);
    return;
  }
  DhopEO(in, out, DaggerNo);
}
// Off-diagonal (parity changing) part with DaggerYes.
// The checkerboard of the source selects which half-grid hop is called.
template <class Impl>
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
  const bool src_is_odd = (in.Checkerboard() == Odd);
  if (!src_is_odd) {
    DhopOE(in, out, DaggerYes);
    return;
  }
  DhopEO(in, out, DaggerYes);
}
// Site-diagonal term: out = (Nd + M5) * in, on the same checkerboard as `in`.
template <class Impl>
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
  typedef typename FermionField::scalar_type Coeff;

  out.Checkerboard() = in.Checkerboard();

  Coeff diag(Nd*1.0 + M5);
  out = diag * in;
}
// Daggered site-diagonal term; simply forwards to Mooee.
template <class Impl>
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();

  Mooee(in, out);
}
// Inverse of the site-diagonal term: out = in / (Nd + M5).
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();

  const auto inv_diag = 1.0/(Nd*1.0 + M5);
  out = inv_diag*in;
}
// Daggered inverse of the site-diagonal term; simply forwards to MooeeInv.
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
  out.Checkerboard() = in.Checkerboard();

  MooeeInv(in,out);
}
template<class Impl>
@@ -586,7 +637,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
F = eaLs * (one - Wea + (Wema - one) * mass*mass);
F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
F = F - abs(W) * sinha * 4.0 * mass;
F = F - abs(W) * sinha * (Nd* 1.0) * mass;
Bpp = (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
Bmm = (A/F) * (one - ea2Ls) * (one - Wea) * (one - mass*mass * one);

View File

@@ -63,7 +63,7 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
if (anisotropyCoeff.isAnisotropic){
diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
} else {
diag_mass = 4.0 + mass;
diag_mass = Nd*1.0 + mass;
}
int vol4;
@@ -354,8 +354,8 @@ void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int
Stencil.HaloExchange(in, compressor);
int skip = (disp == 1) ? 0 : 1;
int dirdisp = dir + skip * 4;
int gamma = dir + (1 - skip) * 4;
int dirdisp = dir + skip * Nd;
int gamma = dir + (1 - skip) * Nd;
DhopDirCalc(in, out, dirdisp, gamma, DaggerNo);
};
@@ -370,8 +370,8 @@ void WilsonFermion<Impl>::DhopDirAll(const FermionField &in, std::vector<Fermion
for(int disp=-1;disp<=1;disp+=2){
int skip = (disp == 1) ? 0 : 1;
int dirdisp = dir + skip * 4;
int gamma = dir + (1 - skip) * 4;
int dirdisp = dir + skip * Nd;
int gamma = dir + (1 - skip) * Nd;
DhopDirCalc(in, out[dirdisp], dirdisp, gamma, DaggerNo);
}

View File

@@ -97,7 +97,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
distance = st._distances[DIR]; \
sl = st._simd_layout[direction]; \
inplace_twist = 0; \
if(SE->_around_the_world && st.parameters.twists[DIR % 4]){ \
if(SE->_around_the_world && st.parameters.twists[DIR % Nd]){ \
if(sl == 1){ \
g = (F+1) % 2; \
}else{ \

View File

@@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
@@ -517,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
// dependent on result of merge
// // dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}

View File

@@ -0,0 +1,45 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@@ -32,8 +32,30 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
const std::vector<int> ImprovedStaggeredFermion5DStatic::directions(ImprovedStaggeredFermion5DStatic::MakeDirections());
const std::vector<int> ImprovedStaggeredFermion5DStatic::displacements(ImprovedStaggeredFermion5DStatic::MakeDisplacements());
// Direction table for the 4*Nd stencil points: four consecutive groups of Nd
// entries, each group listing dimensions 1..Nd (dimension 0 is never used).
std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDirections(void)
{
  std::vector<int> dirs(4*Nd);
  for(int group=0;group<4;group++){
    for(int mu=0;mu<Nd;mu++){
      dirs[group*Nd+mu] = mu+1;
    }
  }
  return dirs;
}
// Displacement table matching MakeDirections: four consecutive groups of Nd
// entries holding +1, -1, +3 and -3 respectively.
std::vector<int> ImprovedStaggeredFermion5DStatic::MakeDisplacements(void)
{
  const int hop[4] = {+1,-1,+3,-3};
  std::vector<int> disps(4*Nd);
  for(int group=0;group<4;group++){
    for(int mu=0;mu<Nd;mu++){
      disps[group*Nd+mu] = hop[group];
    }
  }
  return disps;
}
NAMESPACE_END(Grid);

View File

@@ -32,5 +32,26 @@ NAMESPACE_BEGIN(Grid);
// Build both stencil tables from Nd instead of hard-coding the four-dimensional
// lists; for Nd=4 the generated contents are identical to the previous literals.
const std::vector<int> ImprovedStaggeredFermionStatic::directions(ImprovedStaggeredFermionStatic::MakeDirections());
const std::vector<int> ImprovedStaggeredFermionStatic::displacements(ImprovedStaggeredFermionStatic::MakeDisplacements());
// Direction table for the 4*Nd stencil points: four consecutive groups of Nd
// entries, each group listing dimensions 0..Nd-1.
std::vector<int> ImprovedStaggeredFermionStatic::MakeDirections(void)
{
  std::vector<int> dirs(4*Nd);
  for(int group=0;group<4;group++){
    for(int mu=0;mu<Nd;mu++){
      dirs[group*Nd+mu] = mu;
    }
  }
  return dirs;
}
// Displacement table matching MakeDirections: four consecutive groups of Nd
// entries holding +1, -1, +3 and -3 respectively.
std::vector<int> ImprovedStaggeredFermionStatic::MakeDisplacements(void)
{
  const int hop[4] = {+1,-1,+3,-3};
  std::vector<int> disps(4*Nd);
  for(int group=0;group<4;group++){
    for(int mu=0;mu<Nd;mu++){
      disps[group*Nd+mu] = hop[group];
    }
  }
  return disps;
}
NAMESPACE_END(Grid);

View File

@@ -30,7 +30,27 @@ directory
NAMESPACE_BEGIN(Grid);
const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
//const std::vector<int> NaiveStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
//const std::vector<int> NaiveStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
const std::vector<int> NaiveStaggeredFermionStatic::directions(NaiveStaggeredFermionStatic::MakeDirections());
const std::vector<int> NaiveStaggeredFermionStatic::displacements(NaiveStaggeredFermionStatic::MakeDisplacements());
// Direction table for the naive staggered stencil: Nd forward points followed by
// Nd backward points, each group listing dimensions 0..Nd-1.
// Only one-hop terms exist here, so the table holds exactly 2*Nd entries; it was
// previously allocated with 4*Nd entries, leaving a zero-filled tail.
// NOTE(review): assumes the stencil is built with 2*Nd points - confirm npoint.
std::vector<int> NaiveStaggeredFermionStatic::MakeDirections(void)
{
  std::vector<int> directions(2*Nd);
  for(int d=0;d<Nd;d++){
    directions[d+Nd*0] = d; // forward hop along d
    directions[d+Nd*1] = d; // backward hop along d
  }
  return directions;
}
// Displacement table matching MakeDirections: Nd entries of +1 followed by Nd
// entries of -1, 2*Nd entries in total.
std::vector<int> NaiveStaggeredFermionStatic::MakeDisplacements(void)
{
  std::vector<int> displacements(2*Nd);
  for(int d=0;d<Nd;d++){
    displacements[d+Nd*0] =+1;
    displacements[d+Nd*1] =-1;
  }
  return displacements;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,61 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
NAMESPACE_BEGIN(Grid);
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::directions (TwoSpinWilsonFermion3plus1DStatic::MakeDirections());
const std::vector<int> TwoSpinWilsonFermion3plus1DStatic::displacements(TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements());
// Direction table for the 2*Nd stencil points: dimensions 1..Nd listed twice
// (dimension 0 is never used).
std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDirections (void)
{
  std::vector<int> dirs;
  dirs.reserve(2*Nd);
  for(int pass=0;pass<2;pass++){
    for(int mu=0;mu<Nd;mu++){
      dirs.push_back(mu+1);
    }
  }
  return dirs;
}
// Displacement table matching MakeDirections: Nd entries of +1 followed by Nd
// entries of -1.
std::vector<int> TwoSpinWilsonFermion3plus1DStatic::MakeDisplacements(void)
{
  // Start with every entry at +1, then overwrite the second half with -1.
  std::vector<int> disps(2*Nd,+1);
  for(int idx=Nd;idx<2*Nd;idx++){
    disps[idx] = -1;
  }
  return disps;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,40 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonFermion3plus1DImplementation.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class TwoSpinWilsonFermion3plus1D<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@@ -0,0 +1,40 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015, 2020
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/implementation/TwoSpinWilsonKernelsImplementation.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class TwoSpinWilsonKernels<IMPLEMENTATION>;
NAMESPACE_END(Grid);

View File

@@ -34,8 +34,28 @@ directory
NAMESPACE_BEGIN(Grid);
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int> WilsonFermion5DStatic::directions ({1,2,3,4, 1, 2, 3, 4});
const std::vector<int> WilsonFermion5DStatic::displacements({1,1,1,1,-1,-1,-1,-1});
const std::vector<int> WilsonFermion5DStatic::directions (WilsonFermion5DStatic::MakeDirections());
const std::vector<int> WilsonFermion5DStatic::displacements(WilsonFermion5DStatic::MakeDisplacements());
// Build the stencil direction table: 2*Nd entries, the first Nd for one
// orientation of each hop and the second Nd for the opposite one (the sign
// lives in the matching displacements table). Entry k holds (k mod Nd)+1,
// i.e. every direction index is offset by one.
// NOTE(review): the +1 presumably skips the innermost s-direction at index 0,
// as the file comment says the s-direction takes no part in the parity.
std::vector<int> WilsonFermion5DStatic::MakeDirections (void)
{
  std::vector<int> dirs;
  dirs.reserve(2*Nd);
  for(int leg=0; leg<2*Nd; leg++){
    dirs.push_back((leg % Nd) + 1);
  }
  return dirs;
}
// Build the stencil displacement table that pairs with the directions table:
// 2*Nd entries, +1 for the first Nd legs and -1 for the remaining Nd legs.
std::vector<int> WilsonFermion5DStatic::MakeDisplacements(void)
{
  // Start with every leg backward, then flip the first half to forward.
  std::vector<int> shifts(2*Nd, -1);
  for(int fwd=0; fwd<Nd; fwd++){
    shifts[fwd] = +1;
  }
  return shifts;
}
NAMESPACE_END(Grid);

View File

@@ -33,9 +33,27 @@ directory
NAMESPACE_BEGIN(Grid);
const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
const std::vector<int> WilsonFermionStatic::directions(WilsonFermionStatic::MakeDirections());
const std::vector<int> WilsonFermionStatic::displacements(WilsonFermionStatic::MakeDisplacements());
int WilsonFermionStatic::HandOptDslash;
// Build the stencil direction table: 2*Nd entries where entry k holds
// (k mod Nd), so each direction 0..Nd-1 appears once in the first half and
// once in the second half (the sign lives in the displacements table).
std::vector<int> WilsonFermionStatic::MakeDirections (void)
{
  std::vector<int> dirs;
  dirs.reserve(2*Nd);
  for(int leg=0; leg<2*Nd; leg++){
    dirs.push_back(leg % Nd);
  }
  return dirs;
}
// Build the stencil displacement table that pairs with the directions table:
// 2*Nd entries, +1 for the first Nd legs and -1 for the remaining Nd legs.
std::vector<int> WilsonFermionStatic::MakeDisplacements(void)
{
  // Start with every leg backward, then flip the first half to forward.
  std::vector<int> shifts(2*Nd, -1);
  for(int fwd=0; fwd<Nd; fwd++){
    shifts[fwd] = +1;
  }
  return shifts;
}
NAMESPACE_END(Grid);

View File

@@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master

View File

@@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master

View File

@@ -36,11 +36,16 @@ DWF_IMPL_LIST=" \
ZWilsonImplF \
ZWilsonImplD2 "
TWOSPIN_WILSON_IMPL_LIST=" \
TwoSpinWilsonImplF \
TwoSpinWilsonImplD "
GDWF_IMPL_LIST=" \
GparityWilsonImplF \
GparityWilsonImplD "
IMPL_LIST="$STAG_IMPL_LIST $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST"
IMPL_LIST="$STAG_IMPL_LIST $WILSON_IMPL_LIST $DWF_IMPL_LIST $GDWF_IMPL_LIST $TWOSPIN_WILSON_IMPL_LIST"
for impl in $IMPL_LIST
do
@@ -62,7 +67,7 @@ do
done
done
CC_LIST="CompactWilsonCloverFermionInstantiation"
CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"
for impl in $COMPACT_WILSON_IMPL_LIST
do
@@ -110,7 +115,12 @@ do
done
done
CC_LIST=" \
ImprovedStaggeredFermion5DInstantiation \
StaggeredKernelsInstantiation "
# Link the two-spinor Wilson instantiation templates into each implementation
# directory as <template><impl>.cc.
# CC_LIST holds bare template names: the loop appends ".cc.master" itself, so
# the entries must not carry that suffix (otherwise every link points at a
# non-existent "*.cc.master.cc.master" file and gets ".cc.master" in its name).
CC_LIST="TwoSpinWilsonFermion3plus1DInstantiation TwoSpinWilsonKernelsInstantiation"
for impl in $TWOSPIN_WILSON_IMPL_LIST
do
for f in $CC_LIST
do
  ln -f -s "../$f.cc.master" "$impl/$f$impl.cc"
done
done

View File

@@ -76,27 +76,27 @@ public:
return action;
};
virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
//extend Ta to include Lorentz indexes
RealD factor_p = c_plaq/RealD(Nc)*0.5;
RealD factor_r = c_rect/RealD(Nc)*0.5;
GridBase *grid = Umu.Grid();
GridBase *grid = U.Grid();
std::vector<GaugeLinkField> U (Nd,grid);
std::vector<GaugeLinkField> Umu (Nd,grid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
}
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);
GaugeLinkField dSdU_mu(grid);
GaugeLinkField staple(grid);
for (int mu=0; mu < Nd; mu++){
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}

View File

@@ -73,20 +73,23 @@ public:
// extend Ta to include Lorentz indexes
RealD factor = 0.5 * beta / RealD(Nc);
GridBase *grid = U.Grid();
GaugeLinkField Umu(U.Grid());
GaugeLinkField dSdU_mu(U.Grid());
GaugeLinkField dSdU_mu(grid);
std::vector<GaugeLinkField> Umu(Nd, grid);
for (int mu = 0; mu < Nd; mu++) {
Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
}
Umu = PeekIndex<LorentzIndex>(U, mu);
for (int mu = 0; mu < Nd; mu++) {
// Staple in direction mu
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}
}
private:
RealD beta;
};

View File

@@ -111,8 +111,8 @@ public:
};
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@@ -75,7 +75,7 @@ public:
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng, smr;
this->build_filenames(traj, Params, config, rng);
this->build_filenames(traj, Params, config, smr, rng);
GridBase *grid = SmartConfig.get_U(false).Grid();
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
@@ -102,7 +102,7 @@ public:
if ( Params.saveSmeared ) {
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(smr);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
_IldgWriter.close();
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
@@ -118,8 +118,8 @@ public:
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@@ -107,8 +107,8 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@@ -62,15 +62,15 @@ accelerator_inline int stencilIndex(int mu, int nu) {
/*! @brief structure holding the link treatment */
struct SmearingParameters{
SmearingParameters(){}
struct HISQSmearingParameters{
HISQSmearingParameters(){}
Real c_1; // 1 link
Real c_naik; // Naik term
Real c_3; // 3 link
Real c_5; // 5 link
Real c_7; // 7 link
Real c_lp; // 5 link Lepage
SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: c_1(c1),
c_naik(cnaik),
c_3(c3),
@@ -86,7 +86,7 @@ class Smear_HISQ : public Gimpl {
private:
GridCartesian* const _grid;
SmearingParameters _linkTreatment;
HISQSmearingParameters _linkTreatment;
public:
@@ -117,7 +117,7 @@ public:
// IN--u_thin
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
SmearingParameters lt = this->_linkTreatment;
HISQSmearingParameters lt = this->_linkTreatment;
auto grid = this->_grid;
// Create a padded cell of extra padding depth=1 and fill the padding.

View File

@@ -158,8 +158,8 @@ RealD WilsonFlowBase<Gimpl>::energyDensityCloverleaf(const RealD t, const GaugeF
LatticeComplexD R(U.Grid());
R = Zero();
for(int mu=0;mu<3;mu++){
for(int nu=mu+1;nu<4;nu++){
for(int mu=0;mu<Nd-1;mu++){
for(int nu=mu+1;nu<Nd;nu++){
WilsonLoops<Gimpl>::FieldStrength(F, U, mu, nu);
R = R + trace(F*F);
}
@@ -207,11 +207,14 @@ std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(con
}
template <class Gimpl>
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int meas_interval){
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
});
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (cloverleaf) : " << step << " " << t << " " << energyDensityCloverleaf(t,U) << std::endl;
});
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
});
}
@@ -249,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
out = in;
RealD taus = 0.;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(0,taus,out);
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
auto start = std::chrono::high_resolution_clock::now();
evolve_step(out, taus);
@@ -333,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
RealD taus = 0.;
RealD eps = init_epsilon;
unsigned int step = 0;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(step,taus,out);
do{
int step_success = evolve_step_adaptive(out, taus, eps);
step += step_success; //step will not be incremented if the integration step fails

220
Grid/qcd/spin/Pauli.h Normal file
View File

@@ -0,0 +1,220 @@
#ifndef GRID_QCD_PAULI_H
#define GRID_QCD_PAULI_H
#include <array>
NAMESPACE_BEGIN(Grid);
//
/*
* Pauli basis
*   sx       sy         sz        ident
* (0 1) , (0 -i) , ( 1  0 ) , (1 0)
* (1 0)   (i  0)   ( 0 -1 )   (0 1)
*
* These are hermitian.
*
* Also supply wilson "projectors" (1+/-sx), (1+/-sy), (1+/-sz)
*
* spPauliProjXm
* spPauliProjYm etc...
*/
// Tag object naming one element of the signed Pauli algebra
// {+/-sx, +/-sy, +/-sz, +/-1}. It carries only the enum value `g`; the
// operator* overloads further down this header switch on it.
class Pauli {
public:
// Serializable enum of the eight signed elements; `undef` is the name passed
// to the macro for the unset value.
GRID_SERIALIZABLE_ENUM(Algebra, undef,
SigmaX , 0,
MinusSigmaX , 1,
SigmaY , 2,
MinusSigmaY , 3,
SigmaZ , 4,
MinusSigmaZ , 5,
Identity , 6,
MinusIdentity , 7);
// Number of enum values above.
static constexpr unsigned int nPauli = 8;
// Static lookup tables, declared here and defined elsewhere (not visible in
// this header). Presumably: printable names, multiplication table and
// adjoint table indexed by Algebra value -- confirm against the definitions.
static const std::array<const char *, nPauli> name;
static const std::array<std::array<Algebra, nPauli>, nPauli> mul;
static const std::array<Algebra, nPauli> adj;
// NOTE(review): sizes 4 and 16 look inherited from a four-dimensional gamma
// algebra; with three sigma matrices and nPauli = 8 confirm they are intended.
static const std::array<const Pauli, 4> gmu;
static const std::array<const Pauli, 16> gall;
// The algebra element this object represents.
Algebra g;
public:
// Construct from an enum value.
accelerator Pauli(Algebra initg): g(initg) {}
};
// CopyImplementation(iTemplate, multPauli, multFlavour) defines a function
// template `multPauli(ret, rhs)` on iTemplate<vtype, Nhs> that simply forwards
// to the existing `multFlavour(ret, rhs)`. The invocations below generate the
// vector (mult*), left-matrix (lmult*) and right-matrix (rmult*) variants for
// each of the eight signed Pauli elements.
// NOTE(review): the macro is never #undef'd, so it remains visible to every
// file that includes this header.
#define CopyImplementation(iTemplate,multPauli,multFlavour) \
template<class vtype> \
accelerator_inline void multPauli(iTemplate<vtype, Nhs> &ret, const iTemplate<vtype, Nhs> &rhs) { \
multFlavour(ret,rhs); \
}
// +/- sigma_x
CopyImplementation(iVector,multPauliSigmaX,multFlavourSigmaX);
CopyImplementation(iMatrix,lmultPauliSigmaX,lmultFlavourSigmaX);
CopyImplementation(iMatrix,rmultPauliSigmaX,rmultFlavourSigmaX);
CopyImplementation(iVector,multPauliMinusSigmaX ,multFlavourMinusSigmaX);
CopyImplementation(iMatrix,lmultPauliMinusSigmaX,lmultFlavourMinusSigmaX);
CopyImplementation(iMatrix,rmultPauliMinusSigmaX,rmultFlavourMinusSigmaX);
// +/- sigma_y
CopyImplementation(iVector,multPauliSigmaY,multFlavourSigmaY);
CopyImplementation(iMatrix,lmultPauliSigmaY,lmultFlavourSigmaY);
CopyImplementation(iMatrix,rmultPauliSigmaY,rmultFlavourSigmaY);
CopyImplementation(iVector,multPauliMinusSigmaY ,multFlavourMinusSigmaY);
CopyImplementation(iMatrix,lmultPauliMinusSigmaY,lmultFlavourMinusSigmaY);
CopyImplementation(iMatrix,rmultPauliMinusSigmaY,rmultFlavourMinusSigmaY);
// +/- sigma_z
CopyImplementation(iVector,multPauliSigmaZ,multFlavourSigmaZ);
CopyImplementation(iMatrix,lmultPauliSigmaZ,lmultFlavourSigmaZ);
CopyImplementation(iMatrix,rmultPauliSigmaZ,rmultFlavourSigmaZ);
CopyImplementation(iVector,multPauliMinusSigmaZ ,multFlavourMinusSigmaZ);
CopyImplementation(iMatrix,lmultPauliMinusSigmaZ,lmultFlavourMinusSigmaZ);
CopyImplementation(iMatrix,rmultPauliMinusSigmaZ,rmultFlavourMinusSigmaZ);
// +/- identity
CopyImplementation(iVector,multPauliIdentity,multFlavourIdentity);
CopyImplementation(iMatrix,lmultPauliIdentity,lmultFlavourIdentity);
CopyImplementation(iMatrix,rmultPauliIdentity,rmultFlavourIdentity);
CopyImplementation(iVector,multPauliMinusIdentity ,multFlavourMinusIdentity);
CopyImplementation(iMatrix,lmultPauliMinusIdentity,lmultFlavourMinusIdentity);
CopyImplementation(iMatrix,rmultPauliMinusIdentity,rmultFlavourMinusIdentity);
/*
* sx sy sz ident
* (0 1) , (0 -i) , ( 1 0 )
* (1 0) (i 0) ( 0 -1)
*/
// Apply (1 + sx) to the two-component input: both output components receive
// fspin(0)+fspin(1). Input and output both have Nhs components.
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)+fspin(1);
hspin(1)=fspin(1)+fspin(0);
}
// Apply (1 - sx) to the two-component input:
// hspin = (fspin(0)-fspin(1), fspin(1)-fspin(0)).
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)-fspin(1);
hspin(1)=fspin(1)-fspin(0);
}
// Apply (1 + sy) to the two-component input, with sy = ((0,-i),(i,0)):
// hspin = (fspin(0) - i*fspin(1), fspin(1) + i*fspin(0)).
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)-timesI(fspin(1));
hspin(1)=fspin(1)+timesI(fspin(0));
}
// Apply (1 - sy) to the two-component input, with sy = ((0,-i),(i,0)):
// hspin = (fspin(0) + i*fspin(1), fspin(1) - i*fspin(0)).
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjYm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)+timesI(fspin(1));
hspin(1)=fspin(1)-timesI(fspin(0));
}
// Apply (1 + sz) to the two-component input: the upper component is doubled
// and the lower component is set to zero.
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=fspin(0)+fspin(0);
hspin(1)=Zero();
}
// Apply (1 - sz) to the two-component input: the upper component is set to
// zero and the lower component is doubled.
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Nhs> &fspin)
{
hspin(0)=Zero();
hspin(1)=fspin(1)+fspin(1);
}
// Overwrite fspin with hspin (plain copy; both have Nhs components).
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAssign(iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
{
fspin = hspin;
}
// Accumulate hspin into fspin (fspin = fspin + hspin).
template<class vtype,IfSpinor<iVector<vtype,Nhs> > = 0> accelerator_inline void pauliAdd (iVector<vtype,Nhs> &fspin,const iVector<vtype,Nhs> &hspin)
{
fspin = fspin + hspin;
}
// Pauli element times a two-component vector. Dispatches on G.g to the
// matching multPauli* routine; an unrecognised value trips assert(0).
// Only participates in overload resolution when the vector's tensor index
// matches PauliIndex.
template<class vtype>
accelerator_inline auto operator*(const Pauli &G, const iVector<vtype, Nhs> &arg)
  ->typename std::enable_if<matchGridTensorIndex<iVector<vtype, Nhs>, PauliIndex>::value, iVector<vtype, Nhs>>::type
{
  typedef Pauli::Algebra Element;
  iVector<vtype, Nhs> result;
  switch (G.g) {
  case Element::SigmaX:        multPauliSigmaX(result, arg);        break;
  case Element::MinusSigmaX:   multPauliMinusSigmaX(result, arg);   break;
  case Element::SigmaY:        multPauliSigmaY(result, arg);        break;
  case Element::MinusSigmaY:   multPauliMinusSigmaY(result, arg);   break;
  case Element::SigmaZ:        multPauliSigmaZ(result, arg);        break;
  case Element::MinusSigmaZ:   multPauliMinusSigmaZ(result, arg);   break;
  case Element::Identity:      multPauliIdentity(result, arg);      break;
  case Element::MinusIdentity: multPauliMinusIdentity(result, arg); break;
  default:
    assert(0);
  }
  return result;
}
// Pauli element times a 2x2 matrix (left multiplication). Dispatches on G.g
// to the matching lmultPauli* routine; an unrecognised value trips assert(0).
// Only participates in overload resolution when the matrix's tensor index
// matches PauliIndex.
template<class vtype>
accelerator_inline auto operator*(const Pauli &G, const iMatrix<vtype, Nhs> &arg)
  ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
{
  typedef Pauli::Algebra Element;
  iMatrix<vtype, Nhs> result;
  switch (G.g) {
  case Element::SigmaX:        lmultPauliSigmaX(result, arg);        break;
  case Element::MinusSigmaX:   lmultPauliMinusSigmaX(result, arg);   break;
  case Element::SigmaY:        lmultPauliSigmaY(result, arg);        break;
  case Element::MinusSigmaY:   lmultPauliMinusSigmaY(result, arg);   break;
  case Element::SigmaZ:        lmultPauliSigmaZ(result, arg);        break;
  case Element::MinusSigmaZ:   lmultPauliMinusSigmaZ(result, arg);   break;
  case Element::Identity:      lmultPauliIdentity(result, arg);      break;
  case Element::MinusIdentity: lmultPauliMinusIdentity(result, arg); break;
  default:
    assert(0);
  }
  return result;
}
// 2x2 matrix times a Pauli element (right multiplication). Dispatches on G.g
// to the matching rmultPauli* routine; an unrecognised value trips assert(0).
// Only participates in overload resolution when the matrix's tensor index
// matches PauliIndex.
template<class vtype>
accelerator_inline auto operator*(const iMatrix<vtype, Nhs> &arg, const Pauli &G)
  ->typename std::enable_if<matchGridTensorIndex<iMatrix<vtype, Nhs>, PauliIndex>::value, iMatrix<vtype, Nhs>>::type
{
  typedef Pauli::Algebra Element;
  iMatrix<vtype, Nhs> result;
  switch (G.g) {
  case Element::SigmaX:        rmultPauliSigmaX(result, arg);        break;
  case Element::MinusSigmaX:   rmultPauliMinusSigmaX(result, arg);   break;
  case Element::SigmaY:        rmultPauliSigmaY(result, arg);        break;
  case Element::MinusSigmaY:   rmultPauliMinusSigmaY(result, arg);   break;
  case Element::SigmaZ:        rmultPauliSigmaZ(result, arg);        break;
  case Element::MinusSigmaZ:   rmultPauliMinusSigmaZ(result, arg);   break;
  case Element::Identity:      rmultPauliIdentity(result, arg);      break;
  case Element::MinusIdentity: rmultPauliMinusIdentity(result, arg); break;
  default:
    assert(0);
  }
  return result;
}
NAMESPACE_END(Grid);
#endif // GRID_QCD_PAULI_H

View File

@@ -179,20 +179,17 @@ public:
//////////////////////////////////////////////////
// average over all x,y,z the temporal loop
//////////////////////////////////////////////////
static ComplexD avgPolyakovLoop(const GaugeField &Umu) { //assume Nd=4
static ComplexD avgPolyakovLoop(const GaugeField &Umu) {
GaugeMat Ut(Umu.Grid()), P(Umu.Grid());
ComplexD out;
int T = Umu.Grid()->GlobalDimensions()[3];
int X = Umu.Grid()->GlobalDimensions()[0];
int Y = Umu.Grid()->GlobalDimensions()[1];
int Z = Umu.Grid()->GlobalDimensions()[2];
Ut = peekLorentz(Umu,3); //Select temporal direction
uint64_t vol = Umu.Grid()->gSites();
int T = Umu.Grid()->GlobalDimensions()[Nd-1];
Ut = peekLorentz(Umu,Nd-1); //Select temporal direction
P = Ut;
for (int t=1;t<T;t++){
P = Gimpl::CovShiftForward(Ut,3,P);
P = Gimpl::CovShiftForward(Ut,Nd-1,P);
}
RealD norm = 1.0/(Nc*X*Y*Z*T);
RealD norm = 1.0/(Nc*vol);
out = sum(trace(P))*norm;
return out;
}
@@ -215,7 +212,7 @@ public:
double vol = Umu.Grid()->gSites();
return p.real() / vol / (4.0 * Nc ) ;
return p.real() / vol / (Nd * Nc ) ;
};
//////////////////////////////////////////////////
@@ -292,19 +289,21 @@ public:
//////////////////////////////////////////////////
// the sum over all nu-oriented staples for nu != mu on each site
//////////////////////////////////////////////////
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {
GridBase *grid = Umu.Grid();
std::vector<GaugeMat> U(Nd, grid);
std::vector<GaugeMat> Umu(Nd, U.Grid());
for (int d = 0; d < Nd; d++) {
U[d] = PeekIndex<LorentzIndex>(Umu, d);
Umu[d] = PeekIndex<LorentzIndex>(U, d);
}
Staple(staple, U, mu);
Staple(staple, Umu, mu);
}
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
staple = Zero();
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {
autoView(staple_v, staple, AcceleratorWrite);
accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
staple_v[i] = Zero();
});
for (int nu = 0; nu < Nd; nu++) {
@@ -318,12 +317,12 @@ public:
// |
// __|
//
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[nu], nu,
Umu[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
mu);
// __
@@ -333,8 +332,8 @@ public:
//
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu], nu,
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
Gimpl::CovShiftBackward(Umu[nu], nu,
Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
}
}
}
@@ -738,6 +737,7 @@ public:
//cf https://arxiv.org/pdf/hep-lat/9701012.pdf Eq 6
//output is the charge by timeslice: sum over timeslices to obtain the total
static std::vector<Real> TimesliceTopologicalChargeMxN(const GaugeLorentz &U, int M, int N){
// Audit: 4D epsilon is hard coded
assert(Nd == 4);
std::vector<std::vector<GaugeMat*> > F(Nd,std::vector<GaugeMat*>(Nd,nullptr));
//Note F_numu = - F_munu
@@ -827,6 +827,25 @@ public:
return out;
}
//Compute the 5Li topological charge density
// Combines the five contributions returned by
// TimesliceTopologicalCharge5LiContributions(U) with the fixed weights c1..c5
// below and returns one value per entry of loops[0].
// NOTE(review): the result is indexed like the per-timeslice contributions, so
// despite the name this looks like a per-timeslice quantity rather than a
// per-site density -- confirm against callers.
static std::vector<Real> TopologicalChargeDensity5Li(const GaugeLorentz &U){
  // Contribution order is presumably the loop extents
  // (1x1), (2x2), (1x2), (1x3), (3x3); this was previously recorded in an
  // unused local table `exts`, which has been removed.
  std::vector<std::vector<Real> > loops = TimesliceTopologicalCharge5LiContributions(U);

  // Combination weights; c1..c4 are fixed functions of c5.
  double c5=1./20.;
  double c4=1./5.-2.*c5;
  double c3=(-64.+640.*c5)/45.;
  double c2=(1-64.*c5)/9.;
  double c1=(19.-55.*c5)/9.;

  int Lt = loops[0].size();
  std::vector<Real> out(Lt,0.);
  for(int t=0;t<Lt;t++){
    out[t] += c1*loops[0][t] + c2*loops[1][t] + c3*loops[2][t] + c4*loops[3][t] + c5*loops[4][t];
  }
  return out;
}
static Real TopologicalCharge5Li(const GaugeLorentz &U){
std::vector<Real> Qt = TimesliceTopologicalCharge5Li(U);
Real Q = 0.;
@@ -1453,7 +1472,7 @@ public:
//////////////////////////////////////////////////
static Real sumWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
std::vector<GaugeMat> U(4, Umu.Grid());
std::vector<GaugeMat> U(Nd, Umu.Grid());
for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1472,7 +1491,7 @@ public:
//////////////////////////////////////////////////
static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
std::vector<GaugeMat> U(4, Umu.Grid());
std::vector<GaugeMat> U(Nd, Umu.Grid());
for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
@@ -1490,8 +1509,8 @@ public:
// sum over all x,y,z,t and over all planes of spatial Wilson loop
//////////////////////////////////////////////////
static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
const int R1, const int R2) {
std::vector<GaugeMat> U(4, Umu.Grid());
const int R1, const int R2) {
std::vector<GaugeMat> U(Nd, Umu.Grid());
for (int mu = 0; mu < Umu.Grid()->_ndimension; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);

View File

@@ -363,12 +363,16 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@@ -376,8 +380,15 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
// std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
// std::cout << "Communicate CopySynch "<<std::endl;
// _grid->Barrier();
acceleratorCopySynchronise();
// Starts intranode
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl;
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@@ -385,6 +396,7 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
FlightRecorder::StepLog("Communicate begin has finished");
// Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){
@@ -395,7 +407,14 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Complete "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Start communicate complete");
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
// std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
@@ -428,6 +447,7 @@ public:
Communicate();
CommsMergeSHM(compress);
CommsMerge(compress);
accelerator_barrier();
}
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@@ -483,6 +503,9 @@ public:
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
// accelerator_barrier();
//////////////////////////////////
// I will overwrite my send buffers
//////////////////////////////////
_grid->StencilBarrier();// Synch shared memory on a single nodes
assert(source.Grid()==_grid);
@@ -496,7 +519,11 @@ public:
HaloGatherDir(source,compress,point,face_idx);
}
accelerator_barrier(); // All my local gathers are complete
// _grid->StencilBarrier();// Synch shared memory on a single nodes
#ifdef NVLINK_GET
_grid->StencilBarrier(); // He can now get my local gather, I can get his
// Synch shared memory on a single node; could use an asynchronous barrier here and defer check
// Or issue barrier AFTER the DMA is running
#endif
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
}
@@ -535,6 +562,7 @@ public:
coalescedWrite(to[j] ,coalescedRead(from [j]));
});
acceleratorFenceComputeStream();
// Also fenced in WilsonKernels
}
}
@@ -663,7 +691,7 @@ public:
}
}
}
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
@@ -683,6 +711,7 @@ public:
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@@ -774,8 +803,8 @@ public:
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
for(int ii=0;ii<npoints;ii++){

View File

@@ -242,19 +242,33 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr;
};
typedef int acceleratorEvent_t;
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
return 0;
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
//auto discard=cudaStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
inline int acceleratorIsCommunicable(void *ptr)
@@ -323,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
cgh.parallel_for( \
sycl::nd_range<3>(global,local), \
[=] (sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \
[[sycl::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
@@ -343,11 +357,28 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
///////
// Asynch event interface
///////
typedef sycl::event acceleratorEvent_t;
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
ev.wait();
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
{
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
@@ -358,8 +389,10 @@ inline int acceleratorIsCommunicable(void *ptr)
else return 0;
#endif
return 1;
}
#endif
//////////////////////////////////////////////
@@ -459,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipMallocHost((void **)&ptr,bytes);
auto err = hipHostMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@@ -492,23 +525,35 @@ inline void *acceleratorAllocDevice(size_t bytes)
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
typedef int acceleratorEvent_t;
// Queue a device-to-device copy of `bytes` bytes from `from` to `to` on copyStream
// using hipMemcpyDtoDAsync. The HIP status is stored in `discard` and never checked.
// Always returns 0: acceleratorEvent_t is a plain int here and carries no completion
// information.
// NOTE(review): presumably callers must call acceleratorCopySynchronise() before
// relying on the destination contents -- confirm against call sites.
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
return 0;
}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
// Wait for the copy identified by `ev`.
// Currently a no-op: the only statement in the body is commented out, so nothing is
// synchronised here.
// NOTE(review): acceleratorCopyDeviceToDeviceAsynch queues work on copyStream and
// returns 0; this function does not wait for that stream -- confirm that callers use
// acceleratorCopySynchronise() for that case.
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
// auto discard=hipStreamSynchronize(ev);
}
// Calls acceleratorEventWait(ev) and then unconditionally reports completion (returns 1).
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
#endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
@@ -545,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
#undef GRID_SIMT
typedef int acceleratorEvent_t;
inline void acceleratorMem(void)
{
/*
@@ -565,8 +612,13 @@ inline void acceleratorMem(void)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
inline void acceleratorEventWait(acceleratorEvent_t ev){}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@@ -655,9 +707,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
acceleratorCopySynchronise();
}
template<class T> void acceleratorPut(T& dev,T&host)
template<class T> void acceleratorPut(T& dev,const T&host)
{
acceleratorCopyToDevice(&host,&dev,sizeof(T));
acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
}
template<class T> T acceleratorGet(T& dev)
{

View File

@@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define thread_critical DO_PRAGMA(omp critical)
#ifdef GRID_OMP
inline void thread_bcopy(void *from, void *to,size_t bytes)
inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
uint64_t *ufrom = (uint64_t *)from;
const uint64_t *ufrom = (const uint64_t *)from;
uint64_t *uto = (uint64_t *)to;
assert(bytes%8==0);
uint64_t words=bytes/8;
@@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
});
}
#else
inline void thread_bcopy(void *from, void *to,size_t bytes)
inline void thread_bcopy(const void *from, void *to,size_t bytes)
{
bcopy(from,to,bytes);
}

View File

@@ -187,9 +187,10 @@ void GridParseLayout(char **argv,int argc,
Coordinate &latt_c,
Coordinate &mpi_c)
{
auto mpi =std::vector<int>({1,1,1,1});
auto latt=std::vector<int>({8,8,8,8});
auto mpi =std::vector<int>(Nd,1);
auto latt=std::vector<int>(Nd,8);
std::cout << "Default mpi "<<mpi<<std::endl;
std::cout << "Default latt"<<latt<<std::endl;
GridThread::SetMaxThreads();
std::string arg;
@@ -228,6 +229,9 @@ void GridParseLayout(char **argv,int argc,
}
// Copy back into coordinate format
int nd = mpi.size();
std::cout << "mpi.size() "<<nd<<std::endl;
std::cout << "latt.size() "<<latt.size()<<std::endl;
std::cout << "Nd "<<Nd<<std::endl;
assert(latt.size()==nd);
latt_c.resize(nd);
mpi_c.resize(nd);
@@ -509,7 +513,14 @@ void Grid_init(int *argc,char ***argv)
Grid_default_latt,
Grid_default_mpi);
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
FlightRecorder::PrintEntireLog = 1;
FlightRecorder::ChecksumComms = 1;
FlightRecorder::ChecksumCommsSend=1;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
@@ -631,12 +642,11 @@ void Grid_debug_handler_init(void)
sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGUSR2,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
// feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
// sigaction(SIGFPE,&sa,NULL);
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
@@ -651,3 +661,4 @@ void Grid_debug_handler_init(void)
}
NAMESPACE_END(Grid);

View File

@@ -50,7 +50,7 @@ namespace Grid{
int64_t index64;
IndexFromCoorReversed(coor,index64,dims);
if ( index64>=2*1024*1024*1024LL ){
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
// std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
}
assert(index64<2*1024*1024*1024LL);
index = (int) index64;

View File

@@ -66,6 +66,7 @@ namespace Grid{
};
}
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.writeScidacFieldRecord(in,record,0); // Lexico
WR.close();
#endif
// What is the appropriate way to throw error?
@@ -107,8 +108,18 @@ int main(int argc, char **argv) {
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
#if 0
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
#else
// Don't require Grid format RNGs
FieldMetaData header;
std::string file, filesmr;
file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);
NerscIO::readConfiguration(Umu,header,file);
#endif
std::cout << std::setprecision(15);
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
@@ -116,6 +127,7 @@ int main(int argc, char **argv) {
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@@ -165,33 +177,48 @@ int main(int argc, char **argv) {
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
//Plq = coeff * Plq;
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(qfield,tfile);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(qfield,tfile);
std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
{
// PeriodicGimplR::GaugeField Ucopy = U;
// NerscIO::writeConfiguration(Ucopy,ufile);
}
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
RealD T = real( sum(qfield) );
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
RealD E0 = real(peekSite(R,scoor));
RealD T0 = real(peekSite(qfield,scoor));
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
});
int t=WFPar.maxTau;
WF.smear(Uflow, Umu);
// NerscIO::writeConfiguration(Uflow,filesmr);
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;
std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold

View File

@@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@@ -220,7 +227,6 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main

View File

@@ -24,14 +24,22 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@@ -220,6 +228,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main

View File

@@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning HMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@@ -220,6 +227,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main

5
TODO
View File

@@ -1,3 +1,8 @@
* Clean up the extract merge and replace with insertLane/extractLane
-----
i) Refine subspace with HDCG & recompute
ii) Block Lanczos in coarse space
iii) Batched block project in the operator computation

View File

@@ -52,7 +52,7 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
int Ls=8;
int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;

View File

@@ -175,8 +175,8 @@ public:
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double bidibytes = dbytes;
double xbytes = dbytes;
double bidibytes = dbytes*2.0;
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
<< bytes << " \t "
@@ -492,17 +492,18 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t no = 50;
uint64_t ni = 100;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
t0=usecond();
Dw.DhopEO(src_o,r_e,DaggerNo);
for(uint64_t j=0;j<ni;j++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond();
t_time[i] = t1-t0;
}
@@ -520,11 +521,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@@ -535,6 +536,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@@ -654,17 +656,19 @@ public:
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t no = 50;
uint64_t ni = 100;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
t0=usecond();
Ds.DhopEO(src_o,r_e,DaggerNo);
for(uint64_t j=0;j<ni;j++){
Ds.DhopEO(src_o,r_e,DaggerNo);
}
t1=usecond();
t_time[i] = t1-t0;
}
@@ -675,11 +679,11 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
@@ -689,6 +693,7 @@ public:
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
}
@@ -792,19 +797,18 @@ public:
Dc.M(src,r);
}
FGrid->Barrier();
double t1=usecond();
uint64_t ncall = 500;
FGrid->Broadcast(0,&ncall,sizeof(ncall));
uint64_t ni = 100;
uint64_t no = 50;
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
time_statistics timestat;
std::vector<double> t_time(ncall);
for(uint64_t i=0;i<ncall;i++){
t0=usecond();
Dc.M(src,r);
t1=usecond();
std::vector<double> t_time(no);
for(uint64_t i=0;i<no;i++){
double t0=usecond();
for(uint64_t j=0;j<ni;j++){
Dc.M(src,r);
}
double t1=usecond();
t_time[i] = t1-t0;
}
FGrid->Barrier();
@@ -814,20 +818,21 @@ public:
double mf_hi, mf_lo, mf_err;
timestat.statistics(t_time);
mf_hi = flops/timestat.min;
mf_lo = flops/timestat.max;
mf_hi = flops/timestat.min*ni;
mf_lo = flops/timestat.max*ni;
mf_err= flops/timestat.min * timestat.err/timestat.mean;
mflops = flops/timestat.mean;
mflops = flops/timestat.mean*ni;
mflops_all.push_back(mflops);
if ( mflops_best == 0 ) mflops_best = mflops;
if ( mflops_worst== 0 ) mflops_worst= mflops;
if ( mflops>mflops_best ) mflops_best = mflops;
if ( mflops<mflops_worst) mflops_worst= mflops;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
}
@@ -872,7 +877,7 @@ int main (int argc, char ** argv)
int do_dslash=1;
int sel=4;
std::vector<int> L_list({8,12,16,24});
std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1;
std::vector<double> clover;

View File

@@ -151,7 +151,7 @@ AC_ARG_ENABLE([tracing],
case ${ac_TRACING} in
nvtx)
AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
LIBS="${LIBS} -lnvToolsExt64_1"
LIBS="${LIBS} -lnvToolsExt"
;;
roctx)
AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])
@@ -198,6 +198,8 @@ AC_ARG_ENABLE([Nc],
[ac_Nc=${enable_Nc}], [ac_Nc=3])
case ${ac_Nc} in
1)
AC_DEFINE([Config_Nc],[1],[Gauge group Nc]);;
2)
AC_DEFINE([Config_Nc],[2],[Gauge group Nc]);;
3)
@@ -211,6 +213,21 @@ case ${ac_Nc} in
*)
AC_MSG_ERROR(["Unsupport gauge group choice Nc = ${ac_Nc}"]);;
esac
############### Nd
# --enable-Nd selects the gauge field dimension: 2, 3 or 4, defaulting to 4 when the
# option is not given. The chosen value is published as the Config_Nd define.
# Any other value stops configure with an error.
AC_ARG_ENABLE([Nd],
[AS_HELP_STRING([--enable-Nd=2|3|4],[enable default LGT dimension])],
[ac_Nd=${enable_Nd}], [ac_Nd=4])
case ${ac_Nd} in
2)
AC_DEFINE([Config_Nd],[2],[Gauge field dimension Nd]);;
3)
AC_DEFINE([Config_Nd],[3],[Gauge field dimension Nd]);;
4)
AC_DEFINE([Config_Nd],[4],[Gauge field dimension Nd]);;
*)
AC_MSG_ERROR(["Unsupport dimension Nd = ${ac_Nd}"]);;
esac
############### Symplectic group
AC_ARG_ENABLE([Sp],
@@ -818,6 +835,7 @@ os (target) : $target_os
compiler vendor : ${ax_cv_cxx_compiler_vendor}
compiler version : ${ax_cv_gxx_version}
----- BUILD OPTIONS -----------------------------------
Nd : ${ac_Nd}
Nc : ${ac_Nc}
SIMD : ${ac_SIMD}${SIMD_GEN_WIDTH_MSG}
Threading : ${ac_openmp}

View File

@@ -93,10 +93,13 @@ int main(int argc, char ** argv)
Real coeff = (width*width) / Real(4*Iterations);
chi=kronecker;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(chi,psi);
chi = chi - coeff*psi;
RealD n2 = norm2(chi);
chi = chi * (1.0/std::sqrt(n2));
}
std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;

View File

@@ -0,0 +1,74 @@
#!/bin/bash
##PBS -q LatticeQCD_aesp_CNDA
#PBS -q debug-scaling
##PBS -q prod
#PBS -l select=16
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA

# PBS job script: runs Benchmark_dwf_fp32 through ./gpu_tile.sh under mpiexec.
# The MPI decomposition and global lattice volume are derived from the per-rank
# local volume (L*), the node grid (N*) and the per-node rank grid (G*) below.

# Everything below uses paths relative to the submission directory, so abort if we
# cannot get there instead of running from the wrong place.
cd "${PBS_O_WORKDIR:?PBS_O_WORKDIR is not set (submit this script through PBS)}" || exit 1
source ../sourceme.sh

# Keep a copy of the node list next to the job output.
cp "${PBS_NODEFILE}" nodefile

export OMP_NUM_THREADS=4
export MPICH_OFI_NIC_POLICY=GPU
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16

#
# Local vol 16.16.16.32
#
LX=16
LY=16
LZ=16
LT=32

# Node grid
NX=2
NY=2
NZ=4
NT=1

# Rank grid within a node
GX=2
GY=2
GZ=1
GT=3

# Total MPI decomposition per direction
PX=$((NX * GX))
PY=$((NY * GY))
PZ=$((NZ * GZ))
PT=$((NT * GT))

# Global lattice extent per direction
VX=$((PX * LX))
VY=$((PY * LY))
VZ=$((PZ * LZ))
VT=$((PT * LT))

# Total rank count; with the values above this is 192 = 16 nodes (select=16) x 12 (-ppn 12).
NP=$((PX * PY * PZ * PT))
VOL=${VX}.${VY}.${VZ}.${VT}
AT=8
MPI=${PX}.${PY}.${PZ}.${PT}

# Build the command as an array so each argument stays a single word when executed.
cmd=(
  mpiexec -np "$NP" -ppn 12 -envall
  ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi "$MPI" --grid "$VOL"
  --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads "$AT" --comms-overlap
)

echo "VOL $VOL"
echo "MPI $MPI"
echo "NPROC $NP"
echo "${cmd[@]}"
"${cmd[@]}"

View File

@@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
@@ -30,8 +30,8 @@ echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_A
if [ $PALS_RANKID = "0" ]
then
numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
# numactl -p $NUMAP -N $NUMAP "$@"
# numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
numactl -p $NUMAP -N $NUMAP "$@"
else
numactl -p $NUMAP -N $NUMAP "$@"
fi

View File

@@ -1,18 +1,19 @@
#Ahead of time compile for PVC
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/"
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../../configure \
../configure \
--enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \

View File

@@ -0,0 +1,22 @@
# Configure Grid for a HIP (AMD GPU) build with MPI comms.
# Reads from the environment: OLCF_GMP_ROOT, FFTW_DIR, ROCM_PATH, MPICH_DIR,
# CRAY_MPICH_ROOTDIR. Requires `spack` on PATH to locate c-lime.

# Install prefix of c-lime 2.3.9: take the spack listing line for c-lime and drop
# its first 14 characters, keeping what follows.
CLIME=$(spack find --paths c-lime@2-3-9 | grep c-lime | cut -c 15-)

../../configure --enable-comms=mpi-auto \
  --with-lime="${CLIME}" \
  --enable-unified=no \
  --enable-shm=nvlink \
  --enable-tracing=none \
  --enable-accelerator=hip \
  --enable-gen-simd-width=64 \
  --disable-gparity \
  --disable-fermion-reps \
  --enable-simd=GPU \
  --with-gmp="${OLCF_GMP_ROOT}" \
  --with-fftw="${FFTW_DIR}/.." \
  --with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
  CXX=hipcc MPICXX=mpicxx \
  CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
  LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@@ -0,0 +1,16 @@
# Environment setup: enables spack shell integration, loads the ROCm, FFTW and
# accelerator-target modules, and extends LD_LIBRARY_PATH.
# NOTE(review): presumably meant to be sourced rather than executed, since its
# effects are changes to the current shell's environment -- confirm.
echo spack
# Pull spack's shell integration into the current shell.
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
#module load cce/15.0.1
module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
# Put the MPFR 3.1.4 library directory first on the runtime library search path.
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
# The lines below are disabled workarounds kept for reference.
#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .

View File

@@ -30,14 +30,10 @@ source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
for vol in 32.32.32.64
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 64.64.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
done

View File

@@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"

View File

@@ -1,12 +1,25 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
module load emacs
module load PrgEnv-gnu
module load rocm
module load cray-mpich
module load gmp
module load cce/15.0.1
module load rocm/5.3.0
module load cray-fftw
module load craype-accel-amd-gfx90a
#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
#echo spack load c-lime
#spack load c-lime
#module load emacs
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH

Some files were not shown because too many files have changed in this diff Show More