mirror of https://github.com/paboyle/Grid.git synced 2025-11-04 22:09:31 +00:00

Compare commits


100 Commits

Author SHA1 Message Date
73af020f98 improved 2025-06-27 06:08:54 +00:00
bffb83c46e std::cout<<GridLogMessage<<"Debug:"<<std::endl;
    std::cout<<GridLogMessage<<"  --dylib-map     : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
    std::cout<<GridLogMessage<<"  --heartbeat     : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
    std::cout<<GridLogMessage<<"  --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-heartbeat : periodically report backtrace "<<std::endl;

--dylib-map       : Grid prints its dylib regions
--heartbeat       : itimer-based / SIGALRM wake-up, which seems to make Aurora more stable
--debug-heartbeat : periodically report to stderr where we are in the code

There is now a libunwind option (configure: --with-unwind=<prefix>) to give an
async-signal-safe backtrace. The glibc backtrace is avoided because it can call malloc.
2025-06-27 06:08:54 +00:00
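For orientation, here is a minimal, self-contained sketch of the two mechanisms this commit message describes: a SIGALRM heartbeat timer plus a libunwind stack walk. It is illustrative only and is NOT Grid's implementation; the names install_heartbeat and unwind_backtrace are hypothetical, and a production handler would use write(2) rather than fprintf (which is not strictly async-signal-safe).

#define UNW_LOCAL_ONLY
#include <libunwind.h>
#include <csignal>
#include <cstdio>
#include <sys/time.h>

// Walk the current stack with libunwind. Unlike glibc backtrace()/backtrace_symbols(),
// this path does not allocate, which is why it is preferred inside signal handlers.
static void unwind_backtrace(void)
{
  unw_context_t ctx;
  unw_cursor_t  cursor;
  unw_getcontext(&ctx);
  unw_init_local(&cursor, &ctx);
  while (unw_step(&cursor) > 0) {
    unw_word_t ip = 0, off = 0;
    char name[256];
    unw_get_reg(&cursor, UNW_REG_IP, &ip);
    if (unw_get_proc_name(&cursor, name, sizeof(name), &off) == 0)
      fprintf(stderr, "  %s + 0x%lx [ip=0x%lx]\n", name, (unsigned long)off, (unsigned long)ip);
    else
      fprintf(stderr, "  ?? [ip=0x%lx]\n", (unsigned long)ip);
  }
}

// Periodic "where are we" report: SIGALRM fires every `seconds` seconds and, as a side
// effect, interrupts stuck system calls (they return EINTR and can be retried).
static void heartbeat_handler(int) { unwind_backtrace(); }

static void install_heartbeat(int seconds)
{
  struct sigaction sa = {};
  sa.sa_handler = heartbeat_handler;
  sigaction(SIGALRM, &sa, nullptr);
  struct itimerval it = {};
  it.it_value.tv_sec    = seconds;   // first expiry
  it.it_interval.tv_sec = seconds;   // then periodic
  setitimer(ITIMER_REAL, &it, nullptr);
}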
7031f37350 Use libunwind for backtrace as it is signal asynch safe 2025-06-27 06:08:54 +00:00
829dd74cb2 Verbose change 2025-06-27 06:08:54 +00:00
66e671985d P2P 2025-06-27 06:08:54 +00:00
5afcbcf0f3 Cshift uses flight recorder 2025-06-27 06:08:54 +00:00
9730579312 Simplify and verbose 2025-06-27 06:08:51 +00:00
bfae14d035 More flight logging 2025-06-27 06:07:34 +00:00
b78fc73d19 Better signal handler 2025-06-27 06:07:34 +00:00
Peter Boyle
709f8ae76c Update README 2025-06-26 23:06:11 -04:00
Peter Boyle
7aa06329d0 Update for new stencil compression options 2025-06-17 18:06:19 +02:00
Peter Boyle
9d6a38c44c Compressed comms options as Sloppy 2025-06-17 16:43:53 +02:00
Peter Boyle
6ec5cee368 Preparing for compressed comms 2025-06-17 16:38:10 +02:00
Peter Boyle
f2e9a68825 Simplify 2025-06-13 17:32:05 +02:00
Peter Boyle
d88750e6b6 Sloppy + non-sloppy 2025-06-13 16:42:01 +02:00
Peter Boyle
821358eda7 Remove partial dirichlet. Favour intro reduced prec comms options 2025-06-13 05:08:45 +02:00
Peter Boyle
fce6e1f135 Kill core files for quota reasons 2025-06-13 05:08:15 +02:00
Peter Boyle
8f0bb3e676 remove partial dirichlet 2025-06-13 05:07:56 +02:00
Peter Boyle
262c70d967 Use sloppy comms options 2025-06-13 05:07:23 +02:00
Peter Boyle
da43ef7c2d Remove partial dirichlet option. It's going nowhere 2025-06-13 05:05:15 +02:00
Peter Boyle
7b60ab5df1 Warning suppress 2025-06-13 05:04:55 +02:00
Peter Boyle
f6b961a64e Warning suppress 2025-06-13 05:04:47 +02:00
Peter Boyle
f1ed988aa3 Interface to reduced precision comms 2025-06-13 05:04:12 +02:00
Peter Boyle
eea51bb604 Suppress annoying warns 2025-06-13 05:03:36 +02:00
Peter Boyle
9203126aa5 Scripts 2025-06-11 15:30:16 +02:00
Peter Boyle
f90ba4712a Update for Jupiter 2025-06-11 15:24:34 +02:00
Peter Boyle
3737a24096 Updated python output 2025-06-03 14:09:29 -04:00
d418f78352 Making running on Aurora more debuggable 2025-05-23 20:58:16 +00:00
25163998a0 Makes SYCL compiler happy 2025-05-23 20:57:11 +00:00
Peter Boyle
dc546aaa4b Updated config options for BNL cluster 2025-05-13 18:44:47 -04:00
Peter Boyle
5364d580c9 Output chirality, eigenvector density files and python source lego plot 2025-05-13 18:44:47 -04:00
Peter Boyle
2a9a6347e3 Do not require Grid format RNGs and also to the 5Li reporting 2025-05-13 18:44:47 -04:00
Peter Boyle
cfdb56f314 Run measurements at t=0 too 2025-05-13 18:44:46 -04:00
Peter Boyle
b517e88db3 Update README 2025-05-13 16:49:21 -04:00
bb317aba8d Lattice = for sycl 2025-05-13 12:50:58 +00:00
644cc6647e JSON update 2025-05-13 12:50:58 +00:00
72397ce23b SYCL interface change 2025-05-13 12:50:58 +00:00
Peter Boyle
d60a80c098 Fixes and visualisation 2025-04-29 18:04:23 -04:00
Peter Boyle
bb8b6d9d73 Fix 2025-04-29 18:04:04 -04:00
Peter Boyle
677b4cc5b0 Make all tests compile 2025-04-24 20:33:26 -04:00
Peter Boyle
be565ffab6 update mac config command 2025-04-24 14:50:06 -04:00
Peter Boyle
df6120e5f6 CPU compile oops fix 2025-04-24 14:50:06 -04:00
Peter Boyle
21de6f7da8 Merge pull request #477 from lehner/feature/wilson-clover-5d
Feature/wilson clover 5d
2025-04-24 14:44:48 -04:00
Peter Boyle
dbe39f9ce0 Merge pull request #471 from edbennett/fix-wflow
Shave off rough edges in Wilson flow test
2025-04-24 14:40:31 -04:00
Peter Boyle
ab3de50d5e Merge pull request #473 from UCL-ARC/gauge_action_deriv
WilsonGaugeAction deriv
2025-04-24 14:39:10 -04:00
Peter Boyle
c545bd2139 Merge pull request #465 from edbennett/allow-nonsu3-compilation
guard against trying to compile SU3-specific code when Nc ≠ 3
2025-04-24 14:35:51 -04:00
Peter Boyle
6a1c64fbdd Merge pull request #470 from paboyle/specflow
Spectral flow, DWF/Mobius kernel measurement
2025-04-24 14:34:33 -04:00
Peter Boyle
b75809ed61 Update README 2025-04-24 14:27:22 -04:00
Peter Boyle
ecaf228e5c Update README 2025-04-24 14:25:32 -04:00
Peter Boyle
6d015ae8fc Visualisation tools 2025-04-24 13:47:34 -04:00
Peter Boyle
233150d93f Bug fix for no accelerator aware MPI, thanks Shuhei for finding it. 2025-04-24 11:40:46 -04:00
Peter Boyle
7af8c77a52 Normalise 2025-04-24 11:37:39 -04:00
Chulwoo Jung
a957e7bfa1 Adding DWF evec Chirality measurement 2025-04-22 22:17:51 +00:00
Chulwoo Jung
cee4c8ce8c Merge branch 'develop' of https://github.com/paboyle/Grid into specflow 2025-04-18 19:55:36 +00:00
Christoph Lehner
96bf814d8c Add checkerboarding to 5D compact clover 2025-04-10 23:05:39 +02:00
Christoph Lehner
7ddc422788 CompactWilsonClover5D 2025-04-10 23:05:29 +02:00
Peter Boyle
e652fc2825 Shared Memory test reenabled on every Grid object creation.
Const improvements in Accelerator.h
2025-04-07 11:51:40 -04:00
Peter Boyle
a49fa3f8d0 ROCM 6.3.1 appears to work 2025-04-07 11:50:59 -04:00
Peter Boyle
cd452a2f91 Slurm update 2025-04-04 18:40:20 -04:00
Peter Boyle
4f89f603ae Changes to add back shared memory test on GPU 2025-04-04 18:40:15 -04:00
Peter Boyle
11dc2c5e1d PVdagM initialise 2025-04-04 18:35:06 -04:00
Peter Boyle
6fec3c15ca Cleaner printing 2025-04-04 18:35:06 -04:00
Peter Boyle
938c47480f Updated compile on frontier.
Unsatisfactory hacks
2025-04-04 18:35:06 -04:00
Peter Boyle
3811d19298 Fence 2025-04-04 18:35:06 -04:00
Peter Boyle
83a3ab6b6f Barrier -- not sure 100% this was needed 2025-04-04 18:35:05 -04:00
Peter Boyle
d66a9af6a3 No compile fix 2025-04-04 18:35:05 -04:00
Peter Boyle
adc90d3a86 NVLINK GET/PUT on cuda aware mpi 2025-04-04 18:35:05 -04:00
Peter Boyle
ebbd015c5c Deprecate shared memory copy as direction matters on nvidia GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
4ab73b36b2 Deprecate shared memory copy as direction matters on GPU 2025-04-04 18:35:05 -04:00
Peter Boyle
130e07a422 Non hermitian support 2025-04-04 18:35:05 -04:00
Peter Boyle
8f47bb367e Shifted non herm 2025-04-04 18:35:05 -04:00
Peter Boyle
0c3cb60135 Script update 2025-04-04 18:35:05 -04:00
Peter Boyle
9eae8fca5d Size output 2025-04-04 18:35:05 +00:00
Mashy Green
e465fce201 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-03-24 10:12:42 +00:00
Mashy Green
d41542c64b reverted sp2n test wilsonfundfermiongauge to original 2025-03-24 08:29:15 +00:00
Mashy Green
785bc7a14f Adding staple zeroing fix 2025-03-10 12:29:04 +00:00
Mashy Green
1a1fe85428 Merge remote-tracking branch 'upstream' into gauge_action_deriv 2025-03-10 08:37:36 +00:00
Mashy Green
0000d2e558 Merge branch 'develop' into gauge_action_deriv 2025-03-10 08:35:57 +00:00
Muhammad Asif
b1ba209696 Latest upstream with np-su3 patch and modified Sp_WilsonFunfFermionGauge test to be small (#22)
Co-authored-by: Mashy Green <mashy@me.com>

merging no-su3 patch
2025-02-24 11:38:42 +00:00
Muhammad Asif
cb3e529b1e Merge branch 'paboyle:develop' into develop 2025-02-24 11:29:09 +00:00
Mashy Green
717f647418 added the WilsonFlow patch from upstream PR #471 2025-02-24 08:41:31 +00:00
Mashy Green
98e7418187 Merge remote-tracking branch 'upstream/develop' into gauge_action_deriv 2025-02-24 08:33:05 +00:00
Mashy Green
fe05bf48b1 Improvements to WilsonGaugeAction deriv function (#16)
* patched version + modifications to deriv -> staple in qcd/gauge

* Cleaning up and aligning variable naming between action deriv versions

* Removing the regression test files that were also in this branch for a clean PR

* Reverting whitespace changes

* Fixing after reverting too much!

---------

Co-authored-by: Mashy Green <mashy@me.com>
2025-02-17 18:52:04 +00:00
Mashy Green
d2dd8f54e2 Fixing after reverting too much! 2025-02-17 17:32:27 +00:00
Mashy Green
7726ee4b16 Reverting whitespace changes 2025-02-17 17:16:28 +00:00
8729c46169 add clover energy density measurement to default WilsonFlow measurements 2025-02-03 14:27:55 +00:00
09f81fe7c3 don't force energy density measurement to be every wilson flow iteration 2025-02-03 14:27:45 +00:00
1876e5b7c0 correct tests/smearing/WilsonFlow to use non-adaptive flow and use correct interface 2025-02-03 14:27:29 +00:00
Mashy Green
355ec76257 Merge pull request #18 from UCL-ARC/bugfix/nvtx
Bugfix/nvtx
2025-02-03 11:05:42 +00:00
Mashy Green
4f17c8d081 Merge branch 'paboyle:develop' into bugfix/nvtx 2025-01-29 13:10:12 +00:00
Mashy Green
aaab753982 Reverting to older version of nvtx for Tursa support 2025-01-29 12:57:38 +00:00
Chulwoo Jung
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
Chulwoo Jung
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
Chulwoo Jung
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
Chulwoo Jung
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
Mashy Green
d4868991af Fixed wrong lib for NVTX in configure.ac and updated to nvtx3 2025-01-10 14:53:19 +00:00
Mashy Green
e99d42404e Removing the regression test files that were also in this branch for a clean PR 2024-12-16 16:31:22 +00:00
Mashy Green
3ba019c747 Cleaning up and aligning variable naming between action deriv versions 2024-12-03 15:23:00 +00:00
Mashy Green
47429218bb patched version + modifications to deriv -> staple in qcd/gauge 2024-11-27 16:29:22 +00:00
8d305df0db guard against trying to compile SU3-specific code when Nc ≠ 3 2024-05-24 14:00:56 +01:00
108 changed files with 4662 additions and 1344 deletions

View File

@@ -51,11 +51,13 @@ directory
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress declared_but_not_referenced
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
#pragma diag_suppress declared_but_not_referenced
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon

View File

@@ -277,6 +277,38 @@ public:
assert(0);
}
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
assert(0);
}
void HermOp(const Field &in, Field &out){
assert(0);
}
};
//////////////////////////////////////////////////////////
// Even Odd Schur decomp operators; there are several
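A hedged usage sketch of the new wrapper, assuming only what the class above declares; the surrounding function apply_shifted and the placeholder type MyMatrix are hypothetical, not from the repo.

#include <Grid/Grid.h>
using namespace Grid;

template<class MyMatrix, class Field>
void apply_shifted(MyMatrix &M, RealD shift, const Field &src, Field &res)
{
  // Wrap M so that Op applies (M + shift) without assuming Hermiticity.
  ShiftedNonHermitianLinearOperator<MyMatrix, Field> ShiftedOp(M, shift);
  ShiftedOp.Op(src, res);        // res = M src + shift * src
  // ShiftedOp.AdjOp(src, res);  // res = Mdag src + shift * src
  // HermOp()/HermOpAndNorm() deliberately assert(0): the operator is non-Hermitian by design.
}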

View File

@@ -269,7 +269,9 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;
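For reference (standard Chebyshev bookkeeping, not text from the repo): with the scale factors above, the operator actually iterated is the spectral map of the Hermitian operator,

$$ \hat M \;=\; \frac{2M-(\mathrm{hi}+\mathrm{lo})}{\mathrm{hi}-\mathrm{lo}}, \qquad T_0=\mathrm{in}, \quad T_1=\hat M\,T_0, \quad T_{n+1}=2\hat M\,T_n-T_{n-1}, $$

so the added Barrier() calls fence exactly the HermOp + axpby pair that builds $T_1$; the spectrum of $M$ on $[\mathrm{lo},\mathrm{hi}]$ is mapped onto $[-1,1]$, where the recurrence is stable.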

View File

@@ -245,9 +245,10 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@@ -255,6 +256,7 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
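Reading of the changed estimator (standard linear algebra, our gloss rather than repo text): with $x=\texttt{src\_n}$ and $M$ the Hermitian operator,

$$ n_a^{\text{old}}=\frac{\langle x,\,Mx\rangle}{\langle x,\,x\rangle}, \qquad n_a^{\text{new}}=\sqrt{\frac{\langle Mx,\,Mx\rangle}{\langle x,\,x\rangle}}=\frac{\lVert Mx\rVert}{\lVert x\rVert}, $$

and by Cauchy-Schwarz $n_a^{\text{old}}\le n_a^{\text{new}}$. Both converge to the largest eigenvalue magnitude of $M$ under the power iteration $x\leftarrow Mx$ performed by this loop; the norm-based form is the square root of the Rayleigh quotient of $M^2$, which is what the "HermOp^2" comment refers to.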

View File

@@ -97,7 +97,7 @@ public:
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
ConjugateGradient<FineField> CG(1.0e-3,400,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
@@ -110,7 +110,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
for(int i=0;i<4;i++){
CG(hermop,noise,subspace[b]);
@@ -146,7 +146,7 @@ public:
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<3;i++){
for(int i=0;i<2;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;

View File

@@ -441,8 +441,20 @@ public:
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
#else
//////////////////////////////////////////////////////////////////////
// Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
CoarsenOperator(linop,Subspace,Subspace);
}
//////////////////////////////////////////////////////////////////////
// Petrov - Galerkin projection of matrix
//////////////////////////////////////////////////////////////////////
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & U,
Aggregation<Fobj,CComplex,nbasis> & V)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
@@ -458,11 +470,9 @@ public:
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
blockOrthogonalise(InnerProd,V.subspace);
blockOrthogonalise(InnerProd,U.subspace);
// for(int s=0;s<Subspace.subspace.size();s++){
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
// }
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -542,7 +552,7 @@ public:
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
tphaseBZ-=usecond();
phaV = phaF[p]*Subspace.subspace[i];
phaV = phaF[p]*V.subspace[i];
tphaseBZ+=usecond();
/////////////////////////////////////////////////////////////////////
@@ -555,7 +565,7 @@ public:
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
blockProject(coarseInner,MphaV,U.subspace);
coarseInner = conjugate(pha[p]) * coarseInner;
ComputeProj[p] = coarseInner;
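Schematically (our reading of the hunk above, not repo documentation): writing $P_U$ and $P_V$ for the block prolongators built from the two aggregated subspaces, the two overloads compute

$$ A_c^{\text{Galerkin}} = P_U^{\dagger}\, A\, P_U, \qquad A_c^{\text{Petrov--Galerkin}} = P_U^{\dagger}\, A\, P_V, $$

i.e. the phased vectors phaV that the fine operator acts on come from V.subspace, while blockProject restricts the result back onto U.subspace; the original single-subspace interface is kept as the special case $U=V$.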

View File

@@ -183,6 +183,7 @@ public:
int recv_from_rank,
int bytes);
int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
@@ -201,9 +202,9 @@ public:
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int xmit_to_rank,int do_xmit,
void *recv,
void *recv,void *recv_comp,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);

View File

@@ -260,32 +260,39 @@ CartesianCommunicator::~CartesianCommunicator()
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce float");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce double");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce uint32_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce uint64_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
@@ -294,26 +301,31 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
FlightRecorder::StepLog("GlobalXOR");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(float &f)
{
FlightRecorder::StepLog("GlobalMax");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
FlightRecorder::StepLog("GlobalMax");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
FlightRecorder::StepLog("GlobalSumVector(float *)");
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
FlightRecorder::StepLog("GlobalSumVector(double *)");
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
@@ -388,11 +400,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
int CartesianCommunicator::IsOffNode(int rank)
{
int grank = ShmRanks[rank];
if ( grank == MPI_UNDEFINED ) return true;
else return false;
}
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@@ -407,9 +424,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@@ -433,24 +450,35 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
}
// This is a NVLINK PUT
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
return off_node_bytes;
@@ -459,7 +487,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise();
if (nreq==0) return;
@@ -660,9 +688,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@@ -785,6 +813,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void)
{
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -792,11 +821,13 @@ void CartesianCommunicator::StencilBarrier(void)
//}
void CartesianCommunicator::Barrier(void)
{
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -810,11 +841,13 @@ int CartesianCommunicator::RankWorld(void){
return r;
}
void CartesianCommunicator::BarrierWorld(void){
FlightRecorder::StepLog("BarrierWorld");
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -837,6 +870,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
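A hypothetical caller-side sketch of the new IsOffNode query; route_transfer is not a Grid function, only IsOffNode and ShmBufferTranslate used in the comments are real members shown in this diff.

#include <Grid/Grid.h>

// Illustrative only: decide whether a peer rank has to be reached through MPI or can
// be reached through the intra-node shared-memory window.
void route_transfer(Grid::CartesianCommunicator &comm, int peer, void *local_p)
{
  if (comm.IsOffNode(peer)) {
    // ShmRanks[peer] == MPI_UNDEFINED: the peer lives on another node, so the stencil
    // must post MPI_Isend/MPI_Irecv for this direction (the off_node_bytes path above).
  } else {
    // Same node: the peer's comms window is directly addressable, e.g.
    //   void *remote = comm.ShmBufferTranslate(peer, local_p);   // NVLINK GET/PUT path
  }
}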

View File

@@ -124,6 +124,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0;
}
int CartesianCommunicator::IsOffNode(int rank) { return false; }
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox,
void *recv,

View File

@@ -137,7 +137,7 @@ public:
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes);
};

View File

@@ -543,49 +543,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
std::cout << Mheader " acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
if ( WorldRank == 0 ){
std::cout<< Mheader "Setting up IPC"<<std::endl;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -616,8 +588,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
@@ -676,12 +646,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
// std::cout<<"mapping seeking remote pid/fd "
// <<handle.pid<<"/"
// <<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
@@ -691,7 +661,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
// std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
@@ -700,9 +670,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
}
assert(thisBuf!=nullptr);
}
@@ -783,6 +750,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
@@ -916,14 +884,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes);
#endif
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
acceleratorCopyToDevice(src,dest,bytes);
#else
bcopy(src,dest,bytes);
#endif
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes);
//#else
// bcopy(src,dest,bytes);
//#endif
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
@@ -959,6 +927,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
}
ShmBufferFreeAll();
@@ -989,7 +958,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
//SharedMemoryTest();
// SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@@ -1011,19 +980,18 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode;
check[1]=r;
check[2]=magic;
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
}
}
ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){
ShmBarrier();
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
}
void *SharedMemory::ShmBuffer(int rank)
@@ -1039,11 +1007,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
return (void *) remote;
}
}

View File

@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
// acceleratorCopyToDevice(src,dest,bytes);
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@@ -143,9 +143,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
FlightRecorder::StepLog("Cshift_Copy_plane");
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
FlightRecorder::StepLog("Cshift_Copy_plane_complete");
} else {
int words = buffer_size;
@@ -153,9 +155,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
FlightRecorder::StepLog("Cshift_Gather_plane");
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
FlightRecorder::StepLog("Cshift_Gather_plane_complete");
// int rank = grid->_processor;
int recv_from_rank;
@@ -166,6 +170,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tcomms-=usecond();
grid->Barrier();
FlightRecorder::StepLog("Cshift_SendRecv");
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
@@ -182,10 +187,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
FlightRecorder::StepLog("Cshift_SendRecv_complete");
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
FlightRecorder::StepLog("Cshift_barrier_complete");
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
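A hedged sketch of the staging pattern the #ifdef above selects (generic names, not a verbatim copy of Cshift): without accelerator-aware MPI the payload is bounced through host buffers around the blocking exchange.

#include <Grid/Grid.h>
using namespace Grid;

void exchange_plane(GridBase *grid,
                    void *send_d, void *recv_d,      // device-resident comms buffers
                    void *send_h, void *recv_h,      // host staging buffers
                    int to, int from, int bytes)
{
#ifdef ACCELERATOR_AWARE_MPI
  grid->SendToRecvFrom(send_d, to, recv_d, from, bytes);   // MPI sees device pointers
#else
  acceleratorCopyFromDevice(send_d, send_h, bytes);        // device -> host
  grid->SendToRecvFrom(send_h, to, recv_h, from, bytes);   // MPI sees host pointers
  acceleratorCopyToDevice(recv_h, recv_d, bytes);          // host -> device
#endif
}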

View File

@@ -236,7 +236,7 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 0
#if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
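A hedged usage note: flipping this #if means scalar assignment to a Lattice is now staged through a one-element deviceVector and acceleratorPut rather than a host-side fill, which is the variant the SYCL compiler accepts. For example (assuming an already constructed GridCartesian *grid):

LatticeComplexD c(grid);
c = ComplexD(3.0, 0.0);   // broadcast the scalar to every site via the acceleratorPut path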

View File

@@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm);
grid->GlobalSumP2P(nrm);
// grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm));

View File

@@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
//copy offsets to device
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
exit(EXIT_FAILURE);
}
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
//sync after copy
accelerator_barrier();

View File

@@ -510,7 +510,6 @@ public:
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}
@@ -531,7 +530,6 @@ public:
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}
@@ -555,8 +553,13 @@ public:
t=usecond();
grid->CommsComplete(fwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
@@ -565,6 +568,11 @@ public:
t=usecond();
grid->CommsComplete(bwd_req);
#ifndef ACCELERATOR_AWARE_MPI
for ( int d=0;d < depth ; d ++ ) {
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
}
#endif
t_comms+= usecond() - t;
t=usecond();

View File

@@ -0,0 +1,196 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Copyright (C) 2020 - 2025
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
template<class Impl, class CloverHelpers>
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion5D<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
int n_rhs;
bool fixedBoundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);
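A hedged construction sketch for the new 5D compact clover class. The constructor arguments follow the declaration above; the grid/RNG/gauge helpers (SpaceTimeGrid, GridParallelRNG, SU<Nc>::HotConfiguration) are standard Grid idioms assumed here, not part of this diff.

#include <Grid/Grid.h>
using namespace Grid;

int main(int argc, char **argv)
{
  Grid_init(&argc, &argv);
  const int Ls = 8;

  GridCartesian         *UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
                                       GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridRedBlackCartesian *UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         *FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  GridParallelRNG RNG4(UGrid);  RNG4.SeedFixedIntegers(std::vector<int>{1, 2, 3, 4});
  GridParallelRNG RNG5(FGrid);  RNG5.SeedFixedIntegers(std::vector<int>{5, 6, 7, 8});

  LatticeGaugeField Umu(UGrid);
  SU<Nc>::HotConfiguration(RNG4, Umu);

  RealD mass = 0.1, csw_r = 1.0, csw_t = 1.0, cF = 1.0;
  CompactWilsonCloverFermion5DD Dwc(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid,
                                    mass, csw_r, csw_t, cF);   // typedef added in Fermion.h below

  LatticeFermion src(FGrid), res(FGrid);
  gaussian(RNG5, src);
  Dwc.M(src, res);   // Dhop + Mooee (+ boundary mask when boundary_phases[Nd-1] == 0)

  Grid_finalize();
  return 0;
}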

View File

@@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);
@@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
// Compact Clover fermions
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;

View File

@@ -154,6 +154,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
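A hedged sketch of how the new hook might be used; the wrapper function and the Action/Solver placeholders are hypothetical, only SloppyComms(int) itself comes from this diff.

// Toggle reduced-precision ("sloppy") halo exchange around an inner solve.
template<class Action, class Solver, class Field>
void sloppy_inner_solve(Action &D, Solver &solve, const Field &src, Field &sol)
{
  D.SloppyComms(1);     // Stencil, StencilEven, StencilOdd all switch to compressed comms
  solve(D, src, sol);   // inner solve that tolerates reduced-precision faces
  D.SloppyComms(0);     // restore full-precision comms for the outer correction
}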

View File

@@ -179,6 +179,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@@ -146,6 +146,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
//DWF only hack: If a direction that is OFF node we use Partial Dirichlet
// Shrinks local and remote comms buffers
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
int vol=table.size()/Ls;
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
Integer i=idx/Ls;
Integer s=idx%Ls;
Integer sc=depth+s-(Ls-depth);
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
});
rhs_v.ViewClose();
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
auto Ls = dd.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
// Just pass in the Grid
auto kp = dd.kernel_p;
auto mp = dd.mpi_p;
int size= dd.buffer_size;
int vol= size/Ls;
accelerator_forNB(o,size,1,{
int idx=o/Ls;
int s=o%Ls;
if ( s < depth ) {
int oo=s*vol+idx;
kp[o]=mp[oo];
} else if ( s >= Ls-depth ) {
int sc = depth + s - (Ls-depth);
int oo=sc*vol+idx;
kp[o]=mp[oo];
} else {
kp[o] = Zero();//fill rest with zero if partial dirichlet
}
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// Need to gather *interior portions* for ALL s-slices in simd directions
// Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
// insertion of zeroes...
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
int nnum=num/Ls;
accelerator_forNB(j, num, vobj::Nsimd(), {
// Reorders both local and remote comms buffers
//
int s = j % Ls;
int sp1 = (s+depth)%Ls; // peri incremented s slice
int hxyz= j/Ls;
int xyz0= hxyz*2; // xyzt part of coor
int xyz1= hxyz*2+1;
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
compress.CompressExchange(p0[jj],p1[jj],
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
rhs_v[so+tp[kk1 ].second],
type);
});
rhs_v.ViewClose();
}
// Merge routine is for SIMD faces
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
auto Ls = mm.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
auto mp = &mm.mpointer[0];
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
auto vp1= &mm.vpointers[1][0];
auto type= mm.type;
int nnum = num/Ls;
accelerator_forNB(o,num,Merger::Nsimd,{
int s=o%Ls;
int hxyz=o/Ls; // xyzt related component
int xyz0=hxyz*2;
int xyz1=hxyz*2+1;
int sp = (s+depth)%Ls;
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int oo0= s+xyz0*Ls;
int oo1= s+xyz1*Ls;
// same ss0, ss1 pair goes to new layout
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
});
}
};
class FaceGatherDWFMixedBCs
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) {return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
}
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
int partial = mm.partial;
// std::cout << " merge DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
else FaceGatherSimple::MergeFace(decompress,mm);
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
int partial = dd.partial;
// std::cout << " decompress DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
else FaceGatherSimple::DecompressFace(decompress,dd);
}
};
/////////////////////////////////////////////////////////////////////////////////////////////
// optimised versions supporting half precision too??? Deprecate
/////////////////////////////////////////////////////////////////////////////////////////////
@@ -242,8 +39,7 @@ public:
//Could make FaceGather a template param, but then behaviour is runtime not compile time
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
// : public FaceGatherSimple
class WilsonCompressorTemplate : public FaceGatherSimple
{
public:

View File

@@ -165,6 +165,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;

View File

@@ -91,13 +91,13 @@ public:
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
// half checkerboard operations; leave unimplemented as abstract for now
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
virtual void Meooe (const FermionField &in, FermionField &out);
virtual void Mooee (const FermionField &in, FermionField &out);
virtual void MooeeInv (const FermionField &in, FermionField &out);
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
virtual void MeooeDag (const FermionField &in, FermionField &out);
virtual void MooeeDag (const FermionField &in, FermionField &out);
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
@@ -204,7 +204,14 @@ public:
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@@ -0,0 +1,376 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
NAMESPACE_BEGIN(Grid);
template<class Impl, class CloverHelpers>
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const ImplParams& impl_p)
: WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&FourDimGrid), Triangle(&FourDimGrid)
, DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid)
, DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid)
, DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid)
, DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
, DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid)
, Tmp(&FiveDimGrid)
, BoundaryMask(&FiveDimGrid)
, BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
{
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
csw_r *= 0.5;
csw_t *= 0.5;
//if (clover_anisotropy.isAnisotropic)
// csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (fixedBoundaries) {
this->BoundaryMaskEven.Checkerboard() = Even;
this->BoundaryMaskOdd.Checkerboard() = Odd;
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->fixedBoundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
} else {
MooeeInternal(in, out, DiagonalEven, TriangleEven);
}
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
} else {
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
}
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(fixedBoundaries) ApplyBoundaryMask(out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!fixedBoundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
out.Checkerboard() = in.Checkerboard();
conformable(in, out);
CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
}
template<class Impl, class CloverHelpers>
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
double t0 = usecond();
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
// Initialize temporary variables
double t1 = usecond();
conformable(_Umu.Grid(), this->GaugeGrid());
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
CloverField TmpInverse(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
double t3 = usecond();
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
// Instantiate the clover term
// - In case of the standard clover the mass term is added
// - In case of the exponential clover the clover term is exponentiated
double t4 = usecond();
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
// Convert the data layout of the clover term
double t5 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Modify the clover term at the temporal boundaries in case of open boundary conditions
double t6 = usecond();
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
// Invert the Clover term
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
// in TmpInverse can be used. In all other cases the clover term has to be explicitly inverted.
// TODO: For now this inversion is explicitly done on the CPU
double t7 = usecond();
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
// Fill the remaining clover fields
double t8 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
pickCheckerboard(Odd, TriangleOdd, Triangle);
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t9 = usecond();
std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
}
NAMESPACE_END(Grid);
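For orientation (schematic, not taken from the source): the even-even and odd-odd blocks applied by Mooee above are the compact clover blocks
\[
  M_{ee/oo} \;=\; (4 + M_5)\,\mathbf{1} \;+\; \tfrac{1}{2}\sum_{\mu<\nu} c_{sw}^{(\mu\nu)}\,\sigma_{\mu\nu} F_{\mu\nu},
  \qquad
  c_{sw}^{(\mu\nu)} =
  \begin{cases} c_{sw,r} & \text{spatial planes} \\ c_{sw,t} & \text{temporal planes} \end{cases}
\]
stored in the Diagonal/Triangle layout. The factor 1/2 corresponds to the csw_r *= 0.5 and csw_t *= 0.5 in the constructor, the diagonal mass 4 + M5 is the argument passed to InstantiateClover and ModifyBoundaries, and MooeeInv applies the explicitly inverted blocks (DiagonalInv/TriangleInv) computed once in ImportGauge.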

View File

@@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -484,6 +485,54 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
Dhop(in,out,dag); // -0.5 is included
axpy(out,4.0-M5,in,out);
}
template <class Impl>
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
{
if (in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
typename FermionField::scalar_type scal(4.0 + M5);
out = scal * in;
}
template <class Impl>
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
Mooee(in, out);
}
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
out = (1.0/(4.0 + M5))*in;
}
template<class Impl>
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
out.Checkerboard() = in.Checkerboard();
MooeeInv(in,out);
}
template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)
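For reference, the even-odd structure these new methods fill in is the standard one for the unpreconditioned 5D Wilson operator, whose site-diagonal part is just the constant 4 + M5:
\[
  D \;=\;
  \begin{pmatrix} (4+M_5)\,\mathbf{1} & D_{eo} \\ D_{oe} & (4+M_5)\,\mathbf{1} \end{pmatrix},
  \qquad
  M_{ee}^{-1} = M_{oo}^{-1} = \frac{1}{4+M_5}\,\mathbf{1}.
\]
Hence Meooe/MeooeDag dispatch to DhopOE or DhopEO according to the checkerboard of the input, and Mooee/MooeeInv are trivial scalings by (4 + M5) and its reciprocal.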

View File

@@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
} else { \
chi = coalescedRead(buf[SE->_offset],lane); \
} \
acceleratorSynchronise(); \
acceleratorSynchronise(); \
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
Recon(result, Uchi);
@@ -504,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) {
// acceleratorFenceComputeStream();
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#ifndef GRID_CUDA

View File

@@ -0,0 +1,45 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master
Copyright (C) 2017 - 2025
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Mattia Bruno <mattia.bruno@cern.ch>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
#include <Grid/qcd/action/fermion/CloverHelpers.h>
NAMESPACE_BEGIN(Grid);
#include "impl.h"
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
NAMESPACE_END(Grid);

View File

@@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master

View File

@@ -0,0 +1 @@
../CompactWilsonCloverFermion5DInstantiation.cc.master

View File

@@ -62,7 +62,7 @@ do
done
done
CC_LIST="CompactWilsonCloverFermionInstantiation"
CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"
for impl in $COMPACT_WILSON_IMPL_LIST
do

View File

@@ -76,27 +76,27 @@ public:
return action;
};
virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
//extend Ta to include Lorentz indexes
RealD factor_p = c_plaq/RealD(Nc)*0.5;
RealD factor_r = c_rect/RealD(Nc)*0.5;
GridBase *grid = Umu.Grid();
GridBase *grid = U.Grid();
std::vector<GaugeLinkField> U (Nd,grid);
std::vector<GaugeLinkField> Umu (Nd,grid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
}
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);
GaugeLinkField dSdU_mu(grid);
GaugeLinkField staple(grid);
for (int mu=0; mu < Nd; mu++){
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}

View File

@@ -73,20 +73,23 @@ public:
// extend Ta to include Lorentz indexes
RealD factor = 0.5 * beta / RealD(Nc);
GridBase *grid = U.Grid();
GaugeLinkField Umu(U.Grid());
GaugeLinkField dSdU_mu(U.Grid());
GaugeLinkField dSdU_mu(grid);
std::vector<GaugeLinkField> Umu(Nd, grid);
for (int mu = 0; mu < Nd; mu++) {
Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
}
Umu = PeekIndex<LorentzIndex>(U, mu);
for (int mu = 0; mu < Nd; mu++) {
// Staple in direction mu
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}
}
private:
RealD beta;
};
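After the change each direction uses its own peeked link, and the quantity assembled is (schematically) the usual plaquette-action force
\[
  \frac{\partial S}{\partial U_\mu(x)} \;\propto\; \frac{\beta}{2N_c}\,\mathrm{Ta}\!\left[\,U_\mu(x)\,S_\mu(x)\,\right],
\]
with S_mu(x) the sum of staples returned by WilsonLoops::Staple and Ta the traceless anti-hermitian projection; the prefactor is the factor = 0.5 * beta / RealD(Nc) above.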

View File

@@ -111,8 +111,8 @@ public:
};
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@@ -75,7 +75,7 @@ public:
GridParallelRNG &pRNG) {
if ((traj % Params.saveInterval) == 0) {
std::string config, rng, smr;
this->build_filenames(traj, Params, config, rng);
this->build_filenames(traj, Params, config, smr, rng);
GridBase *grid = SmartConfig.get_U(false).Grid();
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
@@ -102,7 +102,7 @@ public:
if ( Params.saveSmeared ) {
IldgWriter _IldgWriter(grid->IsBoss());
_IldgWriter.open(smr);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
_IldgWriter.close();
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
@@ -118,8 +118,8 @@ public:
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@@ -107,8 +107,8 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
std::string config, rng;
this->build_filenames(traj, Params, config, rng);
std::string config, rng, smr;
this->build_filenames(traj, Params, config, smr, rng);
this->check_filename(rng);
this->check_filename(config);

View File

@@ -62,15 +62,15 @@ accelerator_inline int stencilIndex(int mu, int nu) {
/*! @brief structure holding the link treatment */
struct SmearingParameters{
SmearingParameters(){}
struct HISQSmearingParameters{
HISQSmearingParameters(){}
Real c_1; // 1 link
Real c_naik; // Naik term
Real c_3; // 3 link
Real c_5; // 5 link
Real c_7; // 7 link
Real c_lp; // 5 link Lepage
SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
: c_1(c1),
c_naik(cnaik),
c_3(c3),
@@ -86,7 +86,7 @@ class Smear_HISQ : public Gimpl {
private:
GridCartesian* const _grid;
SmearingParameters _linkTreatment;
HISQSmearingParameters _linkTreatment;
public:
@@ -117,7 +117,7 @@ public:
// IN--u_thin
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
SmearingParameters lt = this->_linkTreatment;
HISQSmearingParameters lt = this->_linkTreatment;
auto grid = this->_grid;
// Create a padded cell of extra padding depth=1 and fill the padding.

View File

@@ -207,11 +207,14 @@ std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(con
}
template <class Gimpl>
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int meas_interval){
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
});
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Energy density (cloverleaf) : " << step << " " << t << " " << energyDensityCloverleaf(t,U) << std::endl;
});
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
});
}
@@ -249,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
out = in;
RealD taus = 0.;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(0,taus,out);
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
auto start = std::chrono::high_resolution_clock::now();
evolve_step(out, taus);
@@ -333,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
RealD taus = 0.;
RealD eps = init_epsilon;
unsigned int step = 0;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(step,taus,out);
do{
int step_success = evolve_step_adaptive(out, taus, eps);
step += step_success; //step will not be incremented if the integration step fails

View File

@@ -292,19 +292,21 @@ public:
//////////////////////////////////////////////////
// the sum over all nu-oriented staples for nu != mu on each site
//////////////////////////////////////////////////
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {
GridBase *grid = Umu.Grid();
std::vector<GaugeMat> U(Nd, grid);
std::vector<GaugeMat> Umu(Nd, U.Grid());
for (int d = 0; d < Nd; d++) {
U[d] = PeekIndex<LorentzIndex>(Umu, d);
Umu[d] = PeekIndex<LorentzIndex>(U, d);
}
Staple(staple, U, mu);
Staple(staple, Umu, mu);
}
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
staple = Zero();
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {
autoView(staple_v, staple, AcceleratorWrite);
accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
staple_v[i] = Zero();
});
for (int nu = 0; nu < Nd; nu++) {
@@ -318,12 +320,12 @@ public:
// |
// __|
//
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftForward(
U[nu], nu,
Umu[nu], nu,
Gimpl::CovShiftBackward(
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
mu);
// __
@@ -333,8 +335,8 @@ public:
//
staple += Gimpl::ShiftStaple(
Gimpl::CovShiftBackward(U[nu], nu,
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
Gimpl::CovShiftBackward(Umu[nu], nu,
Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
}
}
}

View File

@@ -30,25 +30,26 @@
NAMESPACE_BEGIN(Grid);
uint64_t DslashFullCount;
uint64_t DslashPartialCount;
//uint64_t DslashPartialCount;
uint64_t DslashDirichletCount;
void DslashResetCounts(void)
{
DslashFullCount=0;
DslashPartialCount=0;
// DslashPartialCount=0;
DslashDirichletCount=0;
}
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
{
dirichlet = DslashDirichletCount;
partial = DslashPartialCount;
partial = 0;
full = DslashFullCount;
}
void DslashLogFull(void) { DslashFullCount++;}
void DslashLogPartial(void) { DslashPartialCount++;}
//void DslashLogPartial(void) { DslashPartialCount++;}
void DslashLogDirichlet(void){ DslashDirichletCount++;}
deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table)

View File

@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct DefaultImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
// int partialDirichlet;
DefaultImplParams() {
dirichlet.resize(0);
partialDirichlet=0;
// partialDirichlet=0;
};
};
@@ -69,6 +69,12 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table);
class StencilBuffer
{
public:
static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
};
void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void);
@@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
///////////////////////////////////////////////////
// If true, this is partially communicated per face
///////////////////////////////////////////////////
StencilVector _comms_partial_send;
StencilVector _comms_partial_recv;
// StencilVector _comms_partial_send;
// StencilVector _comms_partial_recv;
//
StencilVector _comm_buf_size;
StencilVector _permute_type;
@@ -205,16 +211,16 @@ public:
struct Packet {
void * send_buf;
void * recv_buf;
#ifndef ACCELERATOR_AWARE_MPI
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
#endif
void * compressed_send_buf;
void * compressed_recv_buf;
Integer to_rank;
Integer from_rank;
Integer do_send;
Integer do_recv;
Integer xbytes;
Integer rbytes;
Integer xbytes_compressed;
Integer rbytes_compressed;
};
struct Merge {
static constexpr int Nsimd = vobj::Nsimd();
@@ -223,7 +229,7 @@ public:
std::vector<cobj *> vpointers;
Integer buffer_size;
Integer type;
Integer partial; // partial dirichlet BCs
// Integer partial; // partial dirichlet BCs
Coordinate dims;
};
struct Decompress {
@@ -231,7 +237,7 @@ public:
cobj * kernel_p;
cobj * mpi_p;
Integer buffer_size;
Integer partial; // partial dirichlet BCs
// Integer partial; // partial dirichlet BCs
Coordinate dims;
};
struct CopyReceiveBuffer {
@@ -252,9 +258,45 @@ public:
protected:
GridBase * _grid;
///////////////////////////////////////////////////
// Sloppy comms will make a second buffer upon comms
///////////////////////////////////////////////////
size_t device_heap_top; //
size_t device_heap_bytes;//
size_t device_heap_size; //
void *DeviceBufferMalloc(size_t bytes)
{
void *ptr = (void *)device_heap_top;
device_heap_top += bytes;
device_heap_bytes+= bytes;
if ( device_heap_bytes > device_heap_size ) {
std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
assert (device_heap_bytes <= device_heap_size);
}
return ptr;
}
void DeviceBufferFreeAll(void)
{
device_heap_size = _unified_buffer_size*sizeof(cobj);
// Resize up if necessary, never down
if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
StencilBuffer::DeviceCommBuf.resize(device_heap_size);
}
device_heap_top =(size_t) &StencilBuffer::DeviceCommBuf[0];
device_heap_size = StencilBuffer::DeviceCommBuf.size();
device_heap_bytes=0;
}
public:
GridBase *Grid(void) const { return _grid; }
/////////////////////////////////////////////////////////
// Control reduced precision comms
/////////////////////////////////////////////////////////
int SloppyComms;
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
////////////////////////////////////////////////////////////////////////
// Needed to conveniently communicate gparity parameters into GPU memory
// without adding parameters. Perhaps a template parameter to StencilView is
@@ -268,7 +310,7 @@ public:
}
int face_table_computed;
int partialDirichlet;
// int partialDirichlet;
int fullDirichlet;
std::vector<deviceVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list;
@@ -361,24 +403,145 @@ public:
////////////////////////////////////////////////////////////////////////
// Non blocking send and receive. Necessarily parallel.
////////////////////////////////////////////////////////////////////////
void DecompressPacket(Packet &packet)
{
if ( !SloppyComms ) return;
if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.rbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if(sizeof(word)==8) {
// Can either choose to represent as float vs double and prec change
// OR
// truncate the mantissa bfp16 style
double *dbuf =(double *) packet.recv_buf;
float *fbuf =(float *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
});
} else if ( sizeof(word)==4){
// Can either choose to represent as half vs float and prec change
// OR
// truncate the mantissa bfp16 style
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
});
} else {
assert(0 && "unknown floating point precision");
}
}
}
void CompressPacket(Packet &packet)
{
packet.xbytes_compressed = packet.xbytes;
packet.compressed_send_buf = packet.send_buf;
packet.rbytes_compressed = packet.rbytes;
packet.compressed_recv_buf = packet.recv_buf;
if ( !SloppyComms ) {
return;
}
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.xbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
packet.rbytes_compressed = packet.rbytes/2;
packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
// std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
}
//else {
// std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
// }
if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
packet.xbytes_compressed = packet.xbytes/2;
packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
// std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
if(sizeof(word)==8) {
double *dbuf =(double *) packet.send_buf;
float *fbuf =(float *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
});
} else if ( sizeof(word)==4){
uint32_t *fbuf =(uint32_t *) packet.send_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
});
} else {
assert(0 && "unknown floating point precision");
}
}
// else {
// std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
// }
return;
}
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
///////////////////////////////////////////////
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
// accelerator_barrier(); All kernels should ALREADY be complete
//Everyone is here, so no one is running slow and still using the receive buffer
_grid->StencilBarrier();
// But the HaloGather had a barrier too.
///////////////////////////////////////////////
if (SloppyComms) {
DeviceBufferFreeAll();
}
for(int i=0;i<Packets.size();i++){
this->CompressPacket(Packets[i]);
}
if (SloppyComms) {
accelerator_barrier();
#ifdef NVLINK_GET
_grid->StencilBarrier();
#endif
}
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].compressed_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].compressed_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
}
// std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier();
@@ -389,18 +552,22 @@ public:
// Starts intranode
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].send_buf,Packets[i].compressed_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].recv_buf,Packets[i].compressed_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
// std::cout << "Communicate Begin started "<<i<<std::endl;
// _grid->Barrier();
}
FlightRecorder::StepLog("Communicate begin has finished");
// Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_send )
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
}
}
@@ -415,14 +582,15 @@ public:
// std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
// if ( this->partialDirichlet ) DslashLogPartial();
if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull();
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// accelerator_barrier();
for(int i=0;i<Packets.size();i++){
this->DecompressPacket(Packets[i]);
if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
}
FlightRecorder::StepLog("Finish communicate complete");
}
@@ -446,6 +614,7 @@ public:
Communicate();
CommsMergeSHM(compress);
CommsMerge(compress);
accelerator_barrier();
}
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
@@ -616,7 +785,7 @@ public:
}
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
Decompress d;
d.partial = this->partialDirichlet;
// d.partial = this->partialDirichlet;
d.dims = _grid->_fdimensions;
d.kernel_p = k_p;
d.mpi_p = m_p;
@@ -625,7 +794,7 @@ public:
}
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m;
m.partial = this->partialDirichlet;
// m.partial = this->partialDirichlet;
m.dims = _grid->_fdimensions;
m.type = type;
m.mpointer = merge_p;
@@ -689,6 +858,7 @@ public:
}
}
}
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
@@ -708,7 +878,7 @@ public:
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@@ -729,8 +899,8 @@ public:
int block = dirichlet_block[dimension];
this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim;
this->_comms_partial_send[ii] = 0;
this->_comms_partial_recv[ii] = 0;
// this->_comms_partial_send[ii] = 0;
// this->_comms_partial_recv[ii] = 0;
if ( block && comm_dim ) {
assert(abs(displacement) < ld );
// Quiesce communication across block boundaries
@@ -751,10 +921,10 @@ public:
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
}
if ( partialDirichlet ) {
this->_comms_partial_send[ii] = !this->_comms_send[ii];
this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
}
// if ( partialDirichlet ) {
// this->_comms_partial_send[ii] = !this->_comms_send[ii];
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
// }
}
}
}
@@ -766,6 +936,7 @@ public:
Parameters p=Parameters(),
bool preserve_shm=false)
{
SloppyComms = 0;
face_table_computed=0;
_grid = grid;
this->parameters=p;
@@ -783,7 +954,7 @@ public:
this->same_node.resize(npoints);
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
partialDirichlet = p.partialDirichlet;
// partialDirichlet = p.partialDirichlet;
DirichletBlock(p.dirichlet); // comms send/recv set up
fullDirichlet=0;
for(int d=0;d<p.dirichlet.size();d++){
@@ -800,8 +971,8 @@ public:
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
for(int ii=0;ii<npoints;ii++){
@@ -864,7 +1035,7 @@ public:
/////////////////////////////////////////////////////////////////////////////////
const int Nsimd = grid->Nsimd();
// Allow for multiple stencils to exist simultaneously
// Allow for multiple stencils to be communicated simultaneously
if (!preserve_shm)
_grid->ShmBufferFreeAll();
@@ -932,7 +1103,8 @@ public:
GridBase *grid=_grid;
const int Nsimd = grid->Nsimd();
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
int comms_recv = this->_comms_recv[point];
int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@@ -1121,8 +1293,8 @@ public:
int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ;
// int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ;
assert(rhs.Grid()==_grid);
// conformable(_grid,rhs.Grid());
@@ -1157,11 +1329,11 @@ public:
int rbytes;
if ( comms_send ) xbytes = bytes; // Full send
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; // full dirichlet
if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0;
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1188,7 +1360,8 @@ public:
}
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
if ( compress.DecompressionStep()&&comms_recv) {
recv_buf=u_simd_recv_buf[0];
} else {
recv_buf=this->u_recv_buf_p;
@@ -1222,7 +1395,8 @@ public:
#endif
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
if ( !duplicate ) { // Force comms for now
@@ -1231,8 +1405,8 @@ public:
// Build a list of things to do after we synchronise GPUs
// Start comms now???
///////////////////////////////////////////////////////////
int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
int do_send = (comms_send) && (!shm_send );
int do_recv = (comms_send) && (!shm_recv );
AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off],
xmit_to_rank, do_send,
@@ -1240,7 +1414,7 @@ public:
xbytes,rbytes);
}
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
if ( (compress.DecompressionStep() && comms_recv) ) {
AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[comm_off],
words,Decompressions);
@@ -1262,8 +1436,8 @@ public:
int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point];
int comms_partial_send = this->_comms_partial_send[point] ;
int comms_partial_recv = this->_comms_partial_recv[point] ;
// int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ;
int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension];
@@ -1338,18 +1512,20 @@ public:
if ( comms_send ) xbytes = bytes;
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0;
if ( comms_recv ) rbytes = bytes;
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0;
// Gathers SIMD lanes for send and merge
// Different faces can be full comms or partial comms with multiple ranks per node
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
if ( comms_send || comms_recv ) {
int partial = partialDirichlet;
// int partial = partialDirichlet;
int partial = 0;
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
spointers,dimension,sx,cbmask,
compress,permute_type,partial );
@@ -1415,7 +1591,8 @@ public:
if ( (bytes != rbytes) && (rbytes!=0) ){
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
}
int do_send = (comms_send|comms_partial_send) && (!shm_send );
// int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_send = (comms_send) && (!shm_send );
AddPacket((void *)sp,(void *)rp,
xmit_to_rank,do_send,
recv_from_rank,do_send,
@@ -1429,7 +1606,8 @@ public:
}
}
// rpointer may be doing a remote read in the gather over SHM
if ( comms_recv|comms_partial_recv ) {
// if ( comms_recv|comms_partial_recv ) {
if ( comms_recv ) {
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
}
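The heart of the SloppyComms path above is halving the wire traffic: CompressPacket narrows off-node send buffers (fp64 to fp32, or fp32 to a bfloat16-style 16-bit truncation) into scratch space handed out by DeviceBufferMalloc, and DecompressPacket widens the received data back after MPI completes. A standalone, host-side sketch of the two conversions (illustrative only; the real code does this with accelerator_forNB over SIMD lanes):

#include <cstdint>
#include <cstring>
#include <cstdio>

// Keep the top 16 bits of an IEEE fp32 word: sign, 8-bit exponent, 7 mantissa bits.
static uint16_t compress_fp32(float f)    { uint32_t u; std::memcpy(&u, &f, 4); return (uint16_t)(u >> 16); }
// Pad the discarded mantissa bits with zeroes on receipt.
static float    decompress_fp32(uint16_t h) { uint32_t u = ((uint32_t)h) << 16; float f; std::memcpy(&f, &u, 4); return f; }

int main() {
  float x = 3.14159265f;
  float y = decompress_fp32(compress_fp32(x));   // bfloat16-style round trip, ~2^-8 relative error
  std::printf("fp32 %.8f  truncated round-trip %.8f\n", x, y);

  double d = 1.0/3.0;                            // fp64 payloads are simply narrowed to fp32 ...
  double r = (double)(float)d;                   // ... and widened back on the receiving side
  std::printf("fp64 %.17g  via fp32 %.17g\n", d, r);
  return 0;
}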

View File

@@ -67,7 +67,7 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP_FMT(totalGlobalMem,"%zu");
GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize);
@@ -240,7 +240,7 @@ void acceleratorInit(void)
char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1);
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname);
auto devices = sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){

View File

@@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
@@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
@@ -237,24 +237,38 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
typedef int acceleratorEvent_t;
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
return 0;
}
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
//auto discard=cudaStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
inline int acceleratorIsCommunicable(void *ptr)
@@ -323,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
cgh.parallel_for( \
sycl::nd_range<3>(global,local), \
[=] (sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \
[[sycl::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \
@@ -359,9 +373,9 @@ inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(const void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
@@ -478,7 +492,7 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipMallocHost((void **)&ptr,bytes);
auto err = hipHostMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
@@ -516,18 +530,30 @@ inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ a
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) // Asynch
typedef int acceleratorEvent_t;
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
return 0;
}
inline void acceleratorCopyToDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopyFromDeviceAsync(const void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
acceleratorCopyFromDevice(from,to,bytes);
return 0;
}
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
// auto discard=hipStreamSynchronize(ev);
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
#endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
@@ -564,6 +590,8 @@ inline void acceleratorPin(void *ptr,unsigned long bytes)
#undef GRID_SIMT
typedef int acceleratorEvent_t;
inline void acceleratorMem(void)
{
/*
@@ -583,9 +611,14 @@ inline void acceleratorMem(void)
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(const void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
inline void acceleratorEventWait(acceleratorEvent_t ev){}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
@@ -668,7 +701,7 @@ accelerator_inline void acceleratorFence(void)
return;
}
inline void acceleratorCopyDeviceToDevice(const void *from,void *to,size_t bytes)
inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
{
acceleratorCopyDeviceToDeviceAsynch(from,to,bytes);
acceleratorCopySynchronise();
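A hedged usage sketch of the acceleratorEvent_t interface introduced above (fragment, not from the diff; buffer setup and the overlapped work are assumed). Note that on the CUDA and HIP paths shown here the "Asynch" host-device copies are currently performed synchronously and the event is a dummy int, while the SYCL path returns a real event that acceleratorEventIsComplete can poll:

acceleratorEvent_t ev = acceleratorCopyToDeviceAsynch(host_buf, device_buf, bytes);
do_independent_work();                       // assumed caller-provided work to overlap
if (!acceleratorEventIsComplete(ev)) {
  acceleratorEventWait(ev);                  // ensure the staging copy has landed
}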

View File

@@ -46,10 +46,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <cstdlib>
#include <memory>
#include <Grid/Grid.h>
#include <Grid/util/CompilerCompatible.h>
#ifdef HAVE_UNWIND
#include <libunwind.h>
#endif
#include <fenv.h>
#ifdef __APPLE__
@@ -295,6 +299,20 @@ void GridBanner(void)
std::cout << std::setprecision(9);
}
//Some file local variables
static int fileno_stdout;
static int fileno_stderr;
static int signal_delay;
class dlRegion {
public:
uint64_t start;
uint64_t end;
uint64_t size;
uint64_t offset;
std::string name;
};
std::vector<dlRegion> dlMap;
void Grid_init(int *argc,char ***argv)
{
@@ -347,6 +365,19 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init();
}
// Sleep n-seconds at end of handler
if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
GridCmdOptionInt(arg,signal_delay);
}
// periodic wakeup with stack trace printed
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
Grid_debug_heartbeat();
}
// periodic wakeup with empty handler (interrupts some system calls)
if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
Grid_heartbeat();
}
#if defined(A64FX)
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
@@ -396,15 +427,25 @@ void Grid_init(int *argc,char ***argv)
fp=freopen(ename.str().c_str(),"w",stderr);
assert(fp!=(FILE *)NULL);
}
fileno_stdout = fileno(stdout);
fileno_stderr = fileno(stderr) ;
////////////////////////////////////////////////////
// OK to use GridLogMessage etc from here on
////////////////////////////////////////////////////
std::cout << GridLogMessage << "================================================ "<<std::endl;
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
gethostname(hostname, HOST_NAME_MAX+1);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
{
gethostname(hostname, HOST_NAME_MAX+1);
time_t mytime;
struct tm *info;
char buffer[80];
time(&mytime);
info = localtime(&mytime);
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
}
/////////////////////////////////////////////////////////
// Reporting
@@ -421,6 +462,47 @@ void Grid_init(int *argc,char ***argv)
MemoryProfiler::stats = &dbgMemStats;
}
/////////////////////////////////////////////////////////
// LD.so space
/////////////////////////////////////////////////////////
#ifndef __APPLE__
{
// Provides mapping of .so files
FILE *f = fopen("/proc/self/maps", "r");
if (f) {
char line[256];
while (fgets(line, sizeof(line), f)) {
if (strstr(line, "r-xp")) {
dlRegion region;
uint32_t major, minor, inode;
uint64_t start,end,offset;
char path[PATH_MAX];
sscanf(line,"%lx-%lx r-xp %lx %x:%x %d %s",
&start,&end,&offset,
&major,&minor,&inode,path);
region.start=start;
region.end =end;
region.offset=offset;
region.name = std::string(path);
region.size = region.end-region.start;
dlMap.push_back(region);
// std::cout << GridLogMessage<< line;
}
}
fclose(f);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
std::cout << GridLogMessage << "================================================ "<<std::endl;
std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
for(int r=0;r<dlMap.size();r++){
auto region = dlMap[r];
std::cout << GridLogMessage<<" "<<region.name<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
}
std::cout << GridLogMessage << "================================================ "<<std::endl;
}
}
#endif
////////////////////////////////////
// Logging
////////////////////////////////////
@@ -453,14 +535,19 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
std::cout<<GridLogMessage<<"Verbose:"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<"Debug:"<<std::endl;
std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
@@ -555,17 +642,56 @@ void GridLogLayout() {
}
void * Grid_backtrace_buffer[_NBACKTRACE];
#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
void sig_print_dig(uint32_t dig)
{
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"FlightRecorder step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// x86 64bit
const char *digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
if ( dig>=0 && dig< 16){
SIGLOG(digits[dig]);
}
}
void sig_print_uint(uint32_t A)
{
int dig;
int nz=0;
#define DIGIT(DIV) dig = (A/DIV)%10 ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
DIGIT(1000000000); // Catches 4BN = 2^32
DIGIT(100000000);
DIGIT(10000000);
DIGIT(1000000);
DIGIT(100000);
DIGIT(10000);
DIGIT(1000);
DIGIT(100);
DIGIT(10);
DIGIT(1);
if (nz==0) SIGLOG("0");
}
void sig_print_hex(uint64_t A)
{
int nz=0;
int dig;
#define NIBBLE(A) dig = A ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
SIGLOG("0x");
NIBBLE((A>>(15*4))&0xF);
NIBBLE((A>>(14*4))&0xF);
NIBBLE((A>>(13*4))&0xF);
NIBBLE((A>>(12*4))&0xF);
NIBBLE((A>>(11*4))&0xF);
NIBBLE((A>>(10*4))&0xF);
NIBBLE((A>>(9*4))&0xF);
NIBBLE((A>>(8*4))&0xF);
NIBBLE((A>>(7*4))&0xF);
NIBBLE((A>>(6*4))&0xF);
NIBBLE((A>>(5*4))&0xF);
NIBBLE((A>>(4*4))&0xF);
NIBBLE((A>>(3*4))&0xF);
NIBBLE((A>>(2*4))&0xF);
NIBBLE((A>>4)&0xF);
sig_print_dig(A&0xF);
}
/*
#ifdef __linux__
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
@@ -573,81 +699,158 @@ void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
*/
void Grid_generic_handler(int sig,siginfo_t *si,void * ptr)
{
SIGLOG("Signal handler on host ");
SIGLOG(hostname);
SIGLOG(" process id ");
sig_print_uint((uint32_t)getpid());
SIGLOG("\n");
SIGLOG("FlightRecorder step ");
sig_print_uint(FlightRecorder::StepLoggingCounter);
SIGLOG(" stage ");
SIGLOG(FlightRecorder::StepName);
SIGLOG("\n");
SIGLOG("Caught signal ");
sig_print_uint(si->si_signo);
SIGLOG("\n");
SIGLOG(" mem address ");
sig_print_hex((uint64_t)si->si_addr);
SIGLOG("\n");
SIGLOG(" code ");
sig_print_uint(si->si_code);
SIGLOG("\n");
ucontext_t *uc= (ucontext_t *)ptr;
SIGLOG("Backtrace:\n");
#ifdef HAVE_UNWIND
// Debug cross check on offsets
// int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
// backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
unw_cursor_t cursor;
unw_word_t ip, off;
if (!unw_init_local(&cursor, uc) ) {
SIGLOG(" frame IP function\n");
int level = 0;
int ret = 0;
while(1) {
char name[128];
if (level >= _NBACKTRACE) return;
unw_get_reg(&cursor, UNW_REG_IP, &ip);
sig_print_uint(level); SIGLOG(" ");
sig_print_hex(ip); SIGLOG(" ");
for(int r=0;r<dlMap.size();r++){
if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
SIGLOG(dlMap[r].name.c_str());
SIGLOG("+");
sig_print_hex((ip-dlMap[r].start));
break;
}
}
SIGLOG("\n");
Grid_backtrace_buffer[level]=(void *)ip;
level++;
ret = unw_step(&cursor);
if (ret <= 0) {
return;
}
}
}
#else
// Known Asynch-Signal unsafe
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
#endif
}
void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr)
{
Grid_generic_handler(sig,si,ptr);
SIGLOG("\n");
}
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{
Grid_generic_handler(sig,si,ptr);
if (signal_delay) {
SIGLOG("Adding extra signal delay ");
sig_print_uint(signal_delay);
SIGLOG(" s\n");
usleep( (uint64_t) signal_delay*1000LL*1000LL);
}
SIGLOG("\n");
return;
}
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr)
{
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// Linux/Posix
#ifdef __linux__
// And x86 64bit
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
REG(rbx);
REG(rdx);
REG(rax);
REG(rcx);
REG(rsp);
REG(rip);
REG(r8);
REG(r9);
REG(r10);
REG(r11);
REG(r12);
REG(r13);
REG(r14);
REG(r15);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
Grid_generic_handler(sig,si,ptr);
SIGLOG("\n");
exit(0);
return;
};
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr)
{
// SIGLOG("heartbeat signal handled\n");
return;
}
void Grid_debug_heartbeat(void)
{
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGALRM,&sa_ping,NULL);
// repeating 10s heartbeat
struct itimerval it_val;
it_val.it_value.tv_sec = 10;
it_val.it_value.tv_usec = 0;
it_val.it_interval = it_val.it_value;
setitimer(ITIMER_REAL, &it_val, NULL);
}
void Grid_heartbeat(void)
{
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_empty_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGALRM,&sa_ping,NULL);
// repeating 10s heartbeat
struct itimerval it_val;
it_val.it_value.tv_sec = 10;
it_val.it_value.tv_usec = 1000;
it_val.it_interval = it_val.it_value;
setitimer(ITIMER_REAL, &it_val, NULL);
}
void Grid_exit_handler(void)
{
// BACKTRACEFP(stdout);
// fflush(stdout);
BACKTRACEFP(stdout);
fflush(stdout);
}
void Grid_debug_handler_init(void)
{
struct sigaction sa;
sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_sa_signal_handler;
sa.sa_sigaction= Grid_fatal_signal_handler;
sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGUSR2,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
#ifndef GRID_SYCL
sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
sigaction(SIGBUS,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
#endif
// Non terminating SIGUSR1/2 handler
// Non terminating SIGHUP handler
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler;

View File
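The new signal handlers above deliberately avoid fprintf, malloc and the glibc backtrace inside the signal context; everything is emitted through write(2) on a pre-recorded file descriptor. A minimal, self-contained sketch of that async-signal-safe printing idea (illustrative only, not Grid code; the names here are made up):

// Sketch: async-signal-safe hex printing in the spirit of SIGLOG/sig_print_hex.
// Only write(2) and a stack buffer are used; no stdio, no heap, so it is safe
// to call from inside a signal handler.
#include <unistd.h>
#include <cstdint>

static void sig_puts(const char *s)
{
  size_t n = 0;
  while (s[n]) n++;                      // hand-rolled strlen to stay clearly signal-safe
  ::write(STDERR_FILENO, s, n);
}

static void sig_put_hex(uint64_t v)
{
  static const char digits[] = "0123456789abcdef";
  char buf[19] = "0x";                   // "0x" + 16 nibbles + terminator
  for (int i = 0; i < 16; i++)
    buf[2 + i] = digits[(v >> (60 - 4 * i)) & 0xF];
  buf[18] = '\0';
  sig_puts(buf);
}

int main() { sig_put_hex(0xdeadbeefULL); sig_puts("\n"); }

Grid's handler goes further and resolves each return address against the dlMap regions read from /proc/self/maps, so the printed library+offset pairs can be translated offline with a tool such as addr2line.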

@@ -38,7 +38,11 @@ char * GridHostname(void);
// internal, controlled with --handle
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void);
void Grid_debug_heartbeat(void);
void Grid_heartbeat(void);
void Grid_quiesce_nodes(void);
void Grid_unquiesce_nodes(void);

View File

@@ -66,6 +66,7 @@ namespace Grid{
};
}
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.writeScidacFieldRecord(in,record,0); // Lexico
WR.close();
#endif
// What is the appropriate way to throw error?
@@ -107,8 +108,18 @@ int main(int argc, char **argv) {
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
#if 0
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
#else
// Don't require Grid format RNGs
FieldMetaData header;
std::string file, filesmr;
file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);
NerscIO::readConfiguration(Umu,header,file);
#endif
std::cout << std::setprecision(15);
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
@@ -116,6 +127,7 @@ int main(int argc, char **argv) {
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@@ -165,33 +177,48 @@ int main(int argc, char **argv) {
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
//Plq = coeff * Plq;
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(qfield,tfile);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(qfield,tfile);
std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
{
// PeriodicGimplR::GaugeField Ucopy = U;
// NerscIO::writeConfiguration(Ucopy,ufile);
}
RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
RealD T = real( sum(qfield) );
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
RealD E0 = real(peekSite(R,scoor));
RealD T0 = real(peekSite(qfield,scoor));
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;
});
int t=WFPar.maxTau;
WF.smear(Uflow, Umu);
// NerscIO::writeConfiguration(Uflow,filesmr);
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;
std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold

View File
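For orientation only (standard Wilson-flow conventions, not something introduced in this change): the quantities printed as "T0" and "TC0" above come from energyDensityPlaquette and energyDensityCloverleaf at the final flow time, and the usual reference scale t_0 is fixed by

    t^2 \langle E(t) \rangle \big|_{t = t_0} = 0.3 ,

with E(t) the flowed energy density; the plaquette and cloverleaf discretisations of E differ only by lattice artefacts.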

@@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@@ -220,7 +227,6 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main

View File

@@ -24,14 +24,22 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning FTHMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@@ -220,6 +228,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main

View File

@@ -25,13 +25,20 @@ directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#if Nc == 3
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
#include <Grid/qcd/smearing/JacobianAction.h>
#endif
using namespace Grid;
int main(int argc, char **argv)
{
#if Nc != 3
#warning HMC2p1f_3GeV will not work for Nc != 3
std::cout << "This program will currently only work for Nc == 3." << std::endl;
#else
std::cout << std::setprecision(12);
Grid_init(&argc, &argv);
@@ -220,6 +227,7 @@ int main(int argc, char **argv)
TheHMC.Run(SmearingPolicy); // for smearing
Grid_finalize();
#endif
} // main

View File

@@ -201,8 +201,7 @@ int main(int argc, char **argv) {
Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=0;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// ParamsDir.partialDirichlet=0;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@@ -298,11 +297,11 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0;
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));

View File

@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
ParamsF.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDirF.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=1;
ParamsDirF.partialDirichlet=1;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// ParamsDir.partialDirichlet=1;
// ParamsDirF.partialDirichlet=1;
// std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;
if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0;
if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet;
ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
// ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
ParamsNumF.dirichlet = ParamsNum.dirichlet;
ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
// ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h]));

View File

@@ -166,18 +166,18 @@ int main (int argc, char ** argv)
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
@@ -193,101 +193,6 @@ int main (int argc, char ** argv)
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
}
int ncomm;
double dbytes;
for(int i=0;i<Nloop;i++){
double start=usecond();
dbytes=0;
ncomm=0;
std::vector<CommsRequest_t> requests;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,1,
bytes,bytes,mu);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
}
}
Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
// double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
}
int ncomm;
double dbytes;
for(int i=0;i<Nloop;i++){
@@ -296,45 +201,34 @@ int main (int argc, char ** argv)
std::vector<CommsRequest_t> requests;
dbytes=0;
ncomm=0;
for(int mu=0;mu<4;mu++){
for(int dir=0;dir<8;dir++) {
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,1,
bytes,bytes,mu);
Grid.StencilSendToRecvFromComplete(requests,mu);
requests.resize(0);
if ( dir == mu ) {
int comm_proc=1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
} else {
int comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
}
int tid = omp_get_thread_num();
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
dbytes+=tbytes;
}
}
}
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);

View File
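The bandwidth columns in this benchmark are simply the bytes moved per iteration divided by the per-iteration time in microseconds, so the printed xbytes/timestat.mean values are already in MB/s (decimal). A tiny self-contained sketch of that arithmetic with hypothetical sizes (not Grid code):

// Sketch: bytes and microseconds -> MB/s, matching the xbytes/timestat.mean convention.
#include <cstdio>
#include <cstdint>

double rate_MBps(double bytes, double microseconds)
{
  return bytes / microseconds;           // 1 byte per microsecond == 1 MB/s (decimal)
}

int main()
{
  // Hypothetical halo face: lat=8, Ls=8, 24 doubles per HalfSpinColourVectorD.
  uint64_t bytes = 8ull*8*8*8 * 24 * sizeof(double);   // 786432 bytes
  double t_us = 150.0;                                 // hypothetical mean exchange time
  std::printf("%.1f MB/s\n", rate_MBps((double)bytes, t_us));
  return 0;
}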

@@ -32,18 +32,18 @@
using namespace std;
using namespace Grid;
template<class d>
struct scal {
d internal;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
int main (int argc, char ** argv)
{
@@ -52,39 +52,108 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
Coordinate latt4 = GridDefaultLatt();
int Ls=8;
for(int i=0;i<argc;i++)
int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Benchmark(Ls,Dirichlet,true);
*/
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#undef SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#else
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermion src (FGrid); random(RNG5,src);
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
@@ -100,46 +169,39 @@ int main (int argc, char ** argv)
src = src*N2;
#endif
LatticeFermion result(FGrid); result=Zero();
LatticeFermion ref(FGrid); ref=Zero();
LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
LatticeGaugeField Umu(UGrid);
GaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
// SU<Nc>::ColdConfiguration(Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrix ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
// replicate across fifth dimension
LatticeGaugeField Umu5d(FGrid);
std::vector<LatticeColourMatrix> U(4,FGrid);
{
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
std::vector<ColourMatrixField> U(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
@@ -147,10 +209,28 @@ int main (int argc, char ** argv)
ref = Zero();
for(int mu=0;mu<Nd;mu++){
tmp = U[mu]*Cshift(src,mu+1,1);
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
tmp =adj(U[mu])*src;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
@@ -167,11 +247,9 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -181,9 +259,15 @@ int main (int argc, char ** argv)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
int ncall =1000;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
@@ -198,8 +282,8 @@ int main (int argc, char ** argv)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(vComplex);
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
@@ -208,28 +292,27 @@ int main (int argc, char ** argv)
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
//exit(0);
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
std::cout<<GridLogMessage << "RESULT" << std::endl;
// std::cout << result<<std::endl;
std::cout << norm2(result)<<std::endl;
std::cout<<GridLogMessage << "REF" << std::endl;
std::cout << norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "ERR" << std::endl;
std::cout << norm2(err)<<std::endl;
FGrid->Barrier();
exit(-1);
}
assert (norm2(err)< 1.0e-4 );
assert (n2e< 1.0e-4 );
}
if (1)
@@ -238,16 +321,30 @@ int main (int argc, char ** argv)
for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = U[mu]*Cshift(src,mu+1,1);
tmp = Cshift(src,mu+1,1);
{
autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead);
for(int i=0;i<ref_v.size();i++){
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
}
}
tmp =adj(U[mu])*src;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1);
{
autoView( ref_v, ref, CpuWrite);
@@ -259,27 +356,27 @@ int main (int argc, char ** argv)
}
ref = -0.5*ref;
}
// dump=1;
Dw.Dhop(src,result,1);
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
@@ -291,10 +388,8 @@ int main (int argc, char ** argv)
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -308,13 +403,7 @@ int main (int argc, char ** argv)
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
}
double t1=usecond();
FGrid->Barrier();
@@ -338,14 +427,9 @@ int main (int argc, char ** argv)
setCheckerboard(r_eo,r_e);
err = r_eo-result;
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
if((norm2(err)>1.0e-4)){
/*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
@@ -354,6 +438,4 @@ int main (int argc, char ** argv)
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
}

View File
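The (here commented-out) domain-decomposed variants build the Dirichlet block from the local lattice extent times the intra-node shared-memory tiling, Dirichlet[d+1] = CommDim[d] * latt[d]/mpi[d] * shm[d], so that only directions with genuine inter-node traffic are blocked. A worked sketch with hypothetical sizes (illustrative only, not Grid code):

// Sketch: Dirichlet block sizes for a hypothetical 32^4 lattice, mpi={2,2,2,2},
// and an intra-node shm tiling of {2,2,1,1} (i.e. 4 ranks per node).
#include <cstdio>

int main()
{
  int latt[4] = {32,32,32,32};
  int mpi [4] = { 2, 2, 2, 2};
  int shm [4] = { 2, 2, 1, 1};
  int Dirichlet[5] = {0,0,0,0,0};        // element 0 is the fifth (s) direction: never blocked

  for (int d = 0; d < 4; d++) {
    int CommDim = (mpi[d] / shm[d]) > 1 ? 1 : 0;   // block only inter-node directions
    Dirichlet[d+1] = CommDim * (latt[d] / mpi[d]) * shm[d];
  }
  for (int d = 0; d < 5; d++) std::printf("Dirichlet[%d] = %d\n", d, Dirichlet[d]);
  // With these numbers mpi/shm = {1,1,2,2}, so Dirichlet = {0, 0, 0, 16, 16}:
  // x and y keep full communication, while z and t are cut at the node boundary.
  return 0;
}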

@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet);
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
int main (int argc, char ** argv)
{
@@ -69,11 +69,19 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet);
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
@@ -81,42 +89,35 @@ int main (int argc, char ** argv)
GlobalSharedMemory::GetShmDims(mpi,shm);
//////////////////////
// Node level
//////////////////////
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet);
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
Benchmark(Ls,Dirichlet);
Benchmark(Ls,Dirichlet,true);
*/
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet)
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
@@ -132,21 +133,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
#else
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -269,6 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;

View File

@@ -1,465 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
int Ls=8;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
for(auto partial : {0}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
//Coordinate shm({2,1,1,1});
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
//////////////////////
// Node level
//////////////////////
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid);
GaugeField UmuFull(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
UmuFull=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
if(!partial) Filter.applyFilter(UmuCopy);
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid);
std::vector<ColourMatrixField> Ucopy(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
p.partialDirichlet=partial;
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
int ncall =1;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
DumpSliceNorm("s-slice ref ",ref,1);
DumpSliceNorm("s-slice res ",result,1);
DumpSliceNorm("s-slice error ",err,1);
exit(-1);
}
assert (n2e< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
}
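A note on how the throughput figures reported in the logs further down follow from this timing loop. The value of single_site_flops is set earlier in the benchmark and is not visible in this excerpt; 1320 flops per 5-d site is the conventional Wilson dslash count and is only assumed here. Since t1-t0 is measured in microseconds, dividing the raw flop count by it yields Mflop/s directly:

Mflop/s      = single_site_flops * Ls * (Lx*Ly*Lz*Lt) * ncall / (t1 - t0 [us])
Deo Mflop/s  = Mflop/s / 2        (only one checkerboard of sites is updated)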


@@ -86,6 +86,7 @@ AC_ARG_WITH([gmp],
[try this for a non-standard install prefix of the GMP library])],
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
AC_ARG_WITH([mpfr],
[AS_HELP_STRING([--with-mpfr=prefix],
[try this for a non-standard install prefix of the MPFR library])],
@@ -106,6 +107,13 @@ AC_ARG_WITH([lime],
[AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
############### LIBUNWIND
AC_ARG_WITH([unwind],
[AS_HELP_STRING([--with-unwind=prefix],
[try this for a non-standard install prefix of the libunwind library])],
[AM_CXXFLAGS="-I$with_unwind/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_unwind/lib $AM_LDFLAGS"])
############### OpenSSL
AC_ARG_WITH([openssl],
[AS_HELP_STRING([--with-openssl=prefix],
@@ -151,7 +159,7 @@ AC_ARG_ENABLE([tracing],
case ${ac_TRACING} in
nvtx)
AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
LIBS="${LIBS} -lnvToolsExt64_1"
LIBS="${LIBS} -lnvToolsExt"
;;
roctx)
AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])
@@ -373,6 +381,16 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
[have_lime=true],
[AC_MSG_WARN(LIME library was not found in your system.)])
AC_SEARCH_LIBS([unw_backtrace], [unwind],
[AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if you have the `libunwind' library])]
[have_unwind=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])
AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
[AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
[have_unwind_x86_64=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])
AC_SEARCH_LIBS([SHA256_Init], [crypto],
[AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
[have_crypto=true],
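The two new AC_SEARCH_LIBS probes above only establish that libunwind (unw_backtrace) and, on x86-64, its per-architecture stepping library can be linked, defining HAVE_UNWIND / HAVE_UNWIND_X86_64; nothing in this hunk shows how they are consumed. Below is a minimal sketch of an async-signal-safe backtrace using the stock libunwind cursor API; only the HAVE_UNWIND guard mirrors the define added above, and the function name and formatting are illustrative rather than taken from the tree.

// Sketch only: walk the current call stack with libunwind, avoiding malloc,
// so that it can run inside a signal handler. Everything below other than
// the HAVE_UNWIND guard is illustrative.
#ifdef HAVE_UNWIND
#define UNW_LOCAL_ONLY
#include <libunwind.h>
#include <unistd.h>
#include <cstdio>

static void unwind_backtrace_to_stderr(void)
{
  unw_context_t ctx;
  unw_cursor_t  cursor;
  unw_getcontext(&ctx);            // snapshot the current register state
  unw_init_local(&cursor, &ctx);   // in-process ("local") unwinding

  while (unw_step(&cursor) > 0) {  // step outwards through the caller frames
    unw_word_t ip = 0, off = 0;
    char fn[256] = "??";
    unw_get_reg(&cursor, UNW_REG_IP, &ip);
    unw_get_proc_name(&cursor, fn, sizeof(fn), &off);

    // snprintf is used for brevity; strictly signal-safe code would format
    // by hand.  write(2) itself is async-signal-safe.
    char line[320];
    int n = snprintf(line, sizeof(line), "  ip=%#lx %s+0x%lx\n",
                     (unsigned long)ip, fn, (unsigned long)off);
    if (n > 0) write(STDERR_FILENO, line, (size_t)n);
  }
}
#endif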


@@ -93,10 +93,13 @@ int main(int argc, char ** argv)
Real coeff = (width*width) / Real(4*Iterations);
chi=kronecker;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(chi,psi);
chi = chi - coeff*psi;
RealD n2 = norm2(chi);
chi = chi * (1.0/std::sqrt(n2));
}
std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;


@@ -0,0 +1,22 @@
CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"


@@ -0,0 +1,16 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
#module load cce/15.0.1
module load rocm/6.3.1
module load cray-fftw
module load craype-accel-amd-gfx90a
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .


@@ -30,14 +30,10 @@ source ${root}/sourceme.sh
export OMP_NUM_THREADS=7
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
for vol in 32.32.32.64
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
#64.64.32.96
for vol in 64.64.32.64
do
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
done


@@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=timer \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--enable-accelerator-cshift \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
--disable-fermion-reps \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"


@@ -1,12 +1,25 @@
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
spack load c-lime
module load emacs
module load PrgEnv-gnu
module load rocm/6.0.0
module load cray-mpich
module load gmp
module load cce/15.0.1
module load rocm/5.3.0
module load cray-fftw
module load craype-accel-amd-gfx90a
#Ugly hacks to get down level software working on current system
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
#echo spack load c-lime
#spack load c-lime
#module load emacs
##module load PrgEnv-gnu
##module load cray-mpich
##module load cray-fftw
##module load craype-accel-amd-gfx90a
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
#Hack for lib
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH


@@ -0,0 +1,273 @@
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 4
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.309000 s : Testing with full communication
Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.313000 s : Grid Layout
Grid : Message : 0.313000 s : Global lattice size : 32 32 64 64
Grid : Message : 0.319000 s : OpenMP threads : 4
Grid : Message : 0.320000 s : MPI tasks : 1 1 2 2
Grid : Message : 0.129590 s : Initialising 4d RNG
Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.942440 s : Initialising 5d RNG
Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 1 device 0 bus id: 0019:01:00.0
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
Grid : Message : 43.893114 s : Drawing gauge field
Grid : Message : 54.574150 s : Random gauge initialised
Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 54.580032 s : Setting up Cshift based reference
Grid : Message : 60.407451 s : *****************************************************************
Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 60.407470 s : *****************************************************************
Grid : Message : 60.407471 s : *****************************************************************
Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 60.407473 s : * Vectorising space-time by 8
Grid : Message : 60.407475 s : * VComplex size is 64 B
Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 60.407480 s : *****************************************************************
Grid : Message : 61.102178 s : Called warmup
Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
Grid : Message : 62.177198 s : mflop/s = 24721998.6
Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 71.328862 s : ----------------------------------------------------------------
Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 71.328885 s : ----------------------------------------------------------------
Grid : Message : 71.328886 s : Called DwDag
Grid : Message : 71.328887 s : norm dag result 4.12810493
Grid : Message : 71.329493 s : norm dag ref 4.12810493
Grid : Message : 71.331967 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 71.803650 s : src_e0.500003185
Grid : Message : 71.819727 s : src_o0.499996882
Grid : Message : 71.821991 s : *********************************************************
Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 71.821995 s : * Vectorising space-time by 8
Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.822003 s : *********************************************************
Grid : Message : 72.377054 s : Deo mflop/s = 24065467
Grid : Message : 72.377071 s : Deo mflop/s per rank 6016366.75
Grid : Message : 72.377074 s : Deo mflop/s per node 24065467
Grid : Message : 72.624877 s : r_e2.06377678
Grid : Message : 72.625198 s : r_o2.06381058
Grid : Message : 72.625507 s : res4.12758736
Grid : Message : 73.759140 s : norm diff 0
Grid : Message : 73.868204 s : norm diff even 0
Grid : Message : 73.907201 s : norm diff odd 0
Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414582 s : Testing without internode communication
Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414586 s : Grid Layout
Grid : Message : 74.414586 s : Global lattice size : 32 32 64 64
Grid : Message : 74.414594 s : OpenMP threads : 4
Grid : Message : 74.414595 s : MPI tasks : 1 1 2 2
Grid : Message : 74.679364 s : Initialising 4d RNG
Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 74.759525 s : Initialising 5d RNG
Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 119.252016 s : Drawing gauge field
Grid : Message : 129.919846 s : Random gauge initialised
Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 129.923611 s : Setting up Cshift based reference
Grid : Message : 135.522878 s : *****************************************************************
Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 135.522901 s : * Vectorising space-time by 8
Grid : Message : 135.522903 s : * VComplex size is 64 B
Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 135.522908 s : *****************************************************************
Grid : Message : 136.151202 s : Called warmup
Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
Grid : Message : 137.224748 s : mflop/s = 24755806
Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
Grid : Message : 137.224753 s : mflop/s per node = 24755806
Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 146.451686 s : ----------------------------------------------------------------
Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 146.451710 s : ----------------------------------------------------------------
Grid : Message : 146.451712 s : Called DwDag
Grid : Message : 146.451714 s : norm dag result 4.12810493
Grid : Message : 146.452323 s : norm dag ref 4.12810493
Grid : Message : 146.454799 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 146.940894 s : src_e0.500003185
Grid : Message : 146.953676 s : src_o0.499996882
Grid : Message : 146.955927 s : *********************************************************
Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 146.955932 s : * Vectorising space-time by 8
Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 146.955941 s : *********************************************************
Grid : Message : 147.511975 s : Deo mflop/s = 24036256.5
Grid : Message : 147.511989 s : Deo mflop/s per rank 6009064.13
Grid : Message : 147.511991 s : Deo mflop/s per node 24036256.5
Grid : Message : 147.522100 s : r_e2.06377678
Grid : Message : 147.522433 s : r_o2.06381058
Grid : Message : 147.522745 s : res4.12758736
Grid : Message : 148.229848 s : norm diff 0
Grid : Message : 149.233474 s : norm diff even 0
Grid : Message : 149.235815 s : norm diff odd 0
Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960990 s : Testing without intranode communication
Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960995 s : Grid Layout
Grid : Message : 149.960995 s : Global lattice size : 32 32 64 64
Grid : Message : 149.961003 s : OpenMP threads : 4
Grid : Message : 149.961004 s : MPI tasks : 1 1 2 2
Grid : Message : 150.155810 s : Initialising 4d RNG
Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 150.973420 s : Initialising 5d RNG
Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 193.933765 s : Drawing gauge field
Grid : Message : 204.611551 s : Random gauge initialised
Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 204.615265 s : Setting up Cshift based reference
Grid : Message : 210.117788 s : *****************************************************************
Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 210.117809 s : *****************************************************************
Grid : Message : 210.117810 s : *****************************************************************
Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 210.117813 s : * Vectorising space-time by 8
Grid : Message : 210.117814 s : * VComplex size is 64 B
Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 210.117819 s : *****************************************************************
Grid : Message : 210.714641 s : Called warmup
Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
Grid : Message : 211.892252 s : mflop/s = 22568003.2
Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 220.751375 s : ----------------------------------------------------------------
Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 220.751409 s : ----------------------------------------------------------------
Grid : Message : 220.751411 s : Called DwDag
Grid : Message : 220.751412 s : norm dag result 4.12810493
Grid : Message : 220.753307 s : norm dag ref 4.12810493
Grid : Message : 220.755796 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 221.697800 s : src_e0.500003185
Grid : Message : 221.890920 s : src_o0.499996882
Grid : Message : 221.913430 s : *********************************************************
Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 221.913480 s : * Vectorising space-time by 8
Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 221.913550 s : *********************************************************
Grid : Message : 221.645213 s : Deo mflop/s = 24114032
Grid : Message : 221.645228 s : Deo mflop/s per rank 6028508.01
Grid : Message : 221.645231 s : Deo mflop/s per node 24114032
Grid : Message : 221.656021 s : r_e2.06377678
Grid : Message : 221.656389 s : r_o2.06381058
Grid : Message : 221.656698 s : res4.12758736
Grid : Message : 222.110075 s : norm diff 0
Grid : Message : 222.857692 s : norm diff even 0
Grid : Message : 222.875763 s : norm diff odd 0
Grid : Message : 223.598127 s : *******************************************
Grid : Message : 223.598145 s : ******* Grid Finalize ******
Grid : Message : 223.598146 s : *******************************************
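A quick consistency check on the single-node run above, assuming the conventional 1320 flops per 5-d site and Ls = 16 (neither value is printed in this log, so both are assumptions):

1320 * 16 * (32*32*64*64) * 300 ≈ 2.66e13 flop
2.66e13 flop / 1074958 us       ≈ 2.47e7 Mflop/s

which reproduces the reported 24721998.6 mflop/s, so the quoted rate is consistent with those assumptions.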


@@ -0,0 +1,286 @@
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.838000 s : Testing with full communication
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.840000 s : Grid Layout
Grid : Message : 0.840000 s : Global lattice size : 64 64 64 64
Grid : Message : 0.846000 s : OpenMP threads : 4
Grid : Message : 0.846000 s : MPI tasks : 2 2 2 2
Grid : Message : 0.165970 s : Initialising 4d RNG
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.960410 s : Initialising 5d RNG
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
local rank 1 device 0 bus id: 0019:01:00.0
Grid : Message : 44.657270 s : Drawing gauge field
Grid : Message : 55.247733 s : Random gauge initialised
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 55.253053 s : Setting up Cshift based reference
Grid : Message : 62.191747 s : *****************************************************************
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 62.191768 s : *****************************************************************
Grid : Message : 62.191769 s : *****************************************************************
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 62.191769 s : * Vectorising space-time by 8
Grid : Message : 62.191770 s : * VComplex size is 64 B
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 62.191772 s : *****************************************************************
Grid : Message : 62.857568 s : Called warmup
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
Grid : Message : 65.582120 s : mflop/s = 48306525
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 75.122153 s : ----------------------------------------------------------------
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 75.122167 s : ----------------------------------------------------------------
Grid : Message : 75.122167 s : Called DwDag
Grid : Message : 75.122167 s : norm dag result 4.12801829
Grid : Message : 75.123295 s : norm dag ref 4.12801829
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 75.605683 s : src_e0.500004005
Grid : Message : 75.617824 s : src_o0.499996067
Grid : Message : 75.620089 s : *********************************************************
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 75.620093 s : * Vectorising space-time by 8
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 75.620096 s : *********************************************************
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
Grid : Message : 76.749317 s : r_e2.06443136
Grid : Message : 76.749652 s : r_o2.06378451
Grid : Message : 76.749955 s : res4.12821587
Grid : Message : 77.198827 s : norm diff 0
Grid : Message : 77.981760 s : norm diff even 0
Grid : Message : 78.455900 s : norm diff odd 0
Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539337 s : Testing without internode communication
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539339 s : Grid Layout
Grid : Message : 78.539339 s : Global lattice size : 64 64 64 64
Grid : Message : 78.539347 s : OpenMP threads : 4
Grid : Message : 78.539348 s : MPI tasks : 2 2 2 2
Grid : Message : 78.798501 s : Initialising 4d RNG
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 78.879916 s : Initialising 5d RNG
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 124.586264 s : Drawing gauge field
Grid : Message : 135.338090 s : Random gauge initialised
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 135.341266 s : Setting up Cshift based reference
Grid : Message : 142.604280 s : *****************************************************************
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 142.604460 s : *****************************************************************
Grid : Message : 142.604470 s : *****************************************************************
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 142.604480 s : * Vectorising space-time by 8
Grid : Message : 142.604500 s : * VComplex size is 64 B
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 142.604520 s : *****************************************************************
Grid : Message : 142.686034 s : Called warmup
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
Grid : Message : 144.868559 s : mflop/s = 48706194.1
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 153.622978 s : ----------------------------------------------------------------
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 153.622995 s : ----------------------------------------------------------------
Grid : Message : 153.622995 s : Called DwDag
Grid : Message : 153.622996 s : norm dag result 4.12801829
Grid : Message : 153.623604 s : norm dag ref 4.12801829
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 154.148319 s : src_e0.500004005
Grid : Message : 154.151454 s : src_o0.499996067
Grid : Message : 154.153722 s : *********************************************************
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 154.153725 s : * Vectorising space-time by 8
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 154.153728 s : *********************************************************
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
Grid : Message : 155.217204 s : r_e2.06443136
Grid : Message : 155.217550 s : r_o2.06378451
Grid : Message : 155.217869 s : res4.12821587
Grid : Message : 155.673744 s : norm diff 0
Grid : Message : 156.463329 s : norm diff even 0
Grid : Message : 156.878866 s : norm diff odd 0
Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620764 s : Testing without intranode communication
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620766 s : Grid Layout
Grid : Message : 157.620766 s : Global lattice size : 64 64 64 64
Grid : Message : 157.620773 s : OpenMP threads : 4
Grid : Message : 157.620774 s : MPI tasks : 2 2 2 2
Grid : Message : 157.671479 s : Initialising 4d RNG
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 157.755651 s : Initialising 5d RNG
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 202.465158 s : Drawing gauge field
Grid : Message : 213.214546 s : Random gauge initialised
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 213.217711 s : Setting up Cshift based reference
Grid : Message : 219.662772 s : *****************************************************************
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 219.662787 s : *****************************************************************
Grid : Message : 219.662788 s : *****************************************************************
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 219.662789 s : * Vectorising space-time by 8
Grid : Message : 219.662790 s : * VComplex size is 64 B
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 219.662791 s : *****************************************************************
Grid : Message : 220.425592 s : Called warmup
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
Grid : Message : 222.536267 s : mflop/s = 50365105.5
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 232.135901 s : ----------------------------------------------------------------
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 232.135916 s : ----------------------------------------------------------------
Grid : Message : 232.135917 s : Called DwDag
Grid : Message : 232.135918 s : norm dag result 4.12801829
Grid : Message : 232.151938 s : norm dag ref 4.12801829
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 232.630529 s : src_e0.500004005
Grid : Message : 232.643197 s : src_o0.499996067
Grid : Message : 232.645527 s : *********************************************************
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 232.645532 s : * Vectorising space-time by 8
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 232.645535 s : *********************************************************
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
Grid : Message : 233.791552 s : r_e2.06443136
Grid : Message : 233.791899 s : r_o2.06378451
Grid : Message : 233.792204 s : res4.12821587
Grid : Message : 234.230783 s : norm diff 0
Grid : Message : 235.162780 s : norm diff even 0
Grid : Message : 235.291950 s : norm diff odd 0
Grid : Message : 235.765411 s : *******************************************
Grid : Message : 235.765424 s : ******* Grid Finalize ******
Grid : Message : 235.765425 s : *******************************************


@@ -0,0 +1,57 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > dwf.1node.perf
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.1node.perf


@@ -0,0 +1,57 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.4node.perf


@@ -0,0 +1,16 @@
export CXX=nvcc
export OPENMPI=/p/software/default/stages/2025/software/OpenMPI/5.0.5-NVHPC-24.9-CUDA-12/
export LDFLAGS="-cudart shared -L${OPENMPI}/lib"
export CXXFLAGS="-ccbin clang++ -gencode arch=compute_90,code=sm_90 -std=c++17 -cudart shared -lcublas -lmpi -I${OPENMPI}/include"
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$CLIME \
--disable-gparity \
--disable-fermion-reps \
--disable-unified


@@ -0,0 +1,10 @@
CLIME=$HOME/install/
module load Clang
module load CUDA
module load FFTW
module load OpenSSL
module load MPFR
module load NVHPC
module load UCX
module load OpenMPI
ulimit -c 0


@@ -1,2 +1,14 @@
CXXFLAGS=-I/opt/local/include LDFLAGS=-L/opt/local/lib/ CXX=c++-13 MPICXX=mpicxx ../../configure --enable-simd=GEN --enable-comms=mpi-auto --enable-unified=yes --prefix $HOME/QCD/GridInstall --with-lime=/Users/peterboyle/QCD/SciDAC/install/ --with-openssl=$BREW --disable-fermion-reps --disable-gparity --disable-debug
CXX=mpicxx ../../configure \
--enable-simd=GEN \
--enable-comms=mpi-auto \
--enable-Sp=yes \
--enable-unified=yes \
--prefix /Users/peterboyle/QCD/vtk/Grid/install \
--with-lime=$CLIME \
--with-openssl=$OPENSSL \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--disable-debug


@@ -1,3 +1,12 @@
spack load c-lime
spack load fftw
spack load hdf5+cxx
export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
../../configure \
--enable-comms=mpi-auto \
--enable-unified=yes \
@@ -5,12 +14,16 @@
--enable-shm-fast-path=shmopen \
--enable-accelerator=none \
--enable-simd=AVX512 \
--disable-accelerator-cshift \
--with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--disable-fermion-reps \
--disable-gparity \
CXX=clang++ \
MPICXX=mpicxx \
CXXFLAGS="-std=c++17"
LIBS=-llime \
LDFLAGS=-L$CLIME/lib/ \
CXXFLAGS="-std=c++17 -fPIE"


@@ -1,4 +1,5 @@
source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi
module load openmpi/4.1.8
spack load c-lime


@@ -62,7 +62,7 @@ int VerifyOnDevice(const FermionField &res, FermionField &ref)
if (((random()&0xF)==0)&&injection) {
uint64_t sF = random()%(NN);
int lane=0;
printf("Error injection site %ld on rank %d\n",sF,res.Grid()->ThisRank());
printf("Error injection site %ld on rank %d\n",(long)sF,res.Grid()->ThisRank());
auto vv = acceleratorGet(res_v[sF]);
double *dd = (double *)&vv;
*dd=M_PI;


@@ -195,8 +195,8 @@ int main (int argc, char ** argv)
int Nk=nrhs;
int Nm=Nk*3;
int Nk=36;
int Nm=144;
// int Nk=36;
// int Nm=144;
int Nstop=Nk;
int Nconv_test_interval=1;


@@ -47,20 +47,20 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl;
// std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
}
void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl;
// std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(in,tmp);
_Mat.Mdag(tmp,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
// _Mat.M(in,tmp);
// _PV.Mdag(tmp,out);
@@ -83,14 +83,14 @@ public:
void OpDir (const Field &in, Field &out,int dir,int disp) { assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); };
void Op (const Field &in, Field &out){
std::cout << "Op: PVdag M "<<std::endl;
// std::cout << "Op: PVdag M "<<std::endl;
Field tmp(in.Grid());
_Mat.M(in,tmp);
_PV.Mdag(tmp,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
std::cout << "AdjOp: Mdag PV "<<std::endl;
// std::cout << "AdjOp: Mdag PV "<<std::endl;
Field tmp(in.Grid());
_PV.M(tmp,out);
_Mat.Mdag(in,tmp);
@@ -98,7 +98,7 @@ public:
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){ assert(0); }
void HermOp(const Field &in, Field &out){
std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
// std::cout << "HermOp: Mdag PV PVdag M"<<std::endl;
Field tmp(in.Grid());
Op(in,tmp);
AdjOp(tmp,out);


@@ -54,6 +54,7 @@ const RealD M5 = 1.8;
int main(int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
@@ -106,6 +107,6 @@ int main(int argc, char** argv)
Meofa.refresh(Umu,sRNG, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
#endif
return 0;
}


@@ -56,6 +56,7 @@ const RealD M5 = 1.8;
int main(int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
int threads = GridThread::GetThreads();
@@ -106,6 +107,6 @@ int main(int argc, char** argv)
Meofa.refresh(Umu, sRNG, RNG5);
printf("<Phi|Meofa|Phi> = %1.15e\n", Meofa.S(Umu));
}
#endif
return 0;
}


@@ -33,6 +33,7 @@ using namespace std;
using namespace Grid;
// This is to optimize the SIMD
/*
template<class vobj> void gpermute(vobj & inout,int perm){
vobj tmp=inout;
if (perm & 0x1 ) { permute(inout,tmp,0); tmp=inout;}
@@ -40,7 +41,7 @@ template<class vobj> void gpermute(vobj & inout,int perm){
if (perm & 0x4 ) { permute(inout,tmp,2); tmp=inout;}
if (perm & 0x8 ) { permute(inout,tmp,3); tmp=inout;}
}
*/
int main (int argc, char ** argv)
{


@@ -153,7 +153,7 @@ public:
t=usecond();
{
autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View();
auto gStencil_v = gStencil.View(AcceleratorRead);
autoView( Ug_mu_v , Ug_mu, AcceleratorRead);
autoView( Ug_nu_v , Ug_nu, AcceleratorRead);
@@ -389,7 +389,7 @@ public:
GeneralLocalStencil gStencil(ggrid,shifts);
{
autoView( gStaple_v , gStaple, AcceleratorWrite);
auto gStencil_v = gStencil.View();
auto gStencil_v = gStencil.View(AcceleratorRead);
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
size_t vsize = Nd*sizeof(GaugeViewType);


@@ -83,6 +83,7 @@ std::vector<RealD> jack_stats(const std::vector<RealD>& data)
int main(int argc, char **argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
// Initialize spacetime grid
@@ -206,4 +207,5 @@ int main(int argc, char **argv)
std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl;
Grid_finalize();
#endif
}


@@ -85,6 +85,7 @@ std::vector<RealD> jack_stats(const std::vector<RealD>& data)
int main(int argc, char **argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
// Initialize spacetime grid
@@ -215,4 +216,5 @@ int main(int argc, char **argv)
std::cout << std::endl << "EOFA: rw = " << eofa_result[0] << " +/- " << eofa_result[1] << std::endl;
Grid_finalize();
#endif
}


@@ -35,6 +35,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
@@ -244,4 +245,5 @@ int main (int argc, char ** argv)
std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize();
#endif
}


@@ -38,6 +38,7 @@ typedef typename FermionAction::FermionField FermionField;
int main (int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
Coordinate latt_size = GridDefaultLatt();
@@ -173,4 +174,5 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize();
#endif
}


@@ -35,6 +35,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
@@ -204,4 +205,5 @@ int main (int argc, char ** argv)
assert( fabs(real(Sprime-S-dSpred)) < 1.0 ) ;
std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize();
#endif
}


@@ -32,6 +32,7 @@ using namespace std;
using namespace Grid;
//Here we test the G-parity action and force between the 1f (doubled-lattice) and 2f approaches
#ifdef ENABLE_GPARITY
void copyConjGauge(LatticeGaugeFieldD &Umu_1f, const LatticeGaugeFieldD &Umu_2f, const int nu){
@@ -444,3 +445,7 @@ int main (int argc, char ** argv)
assert(0);
}
}
#else
int main (int argc, char ** argv){};
#endif


@@ -32,6 +32,7 @@ using namespace Grid;
int main (int argc, char ** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc,&argv);
Coordinate latt_size = GridDefaultLatt();
@@ -155,4 +156,5 @@ int main (int argc, char ** argv)
std::cout<< GridLogMessage << "Done" <<std::endl;
Grid_finalize();
#endif
}


@@ -30,9 +30,10 @@ See the full license in the file "LICENSE" in the top level distribution directo
#include <Grid/Grid.h>
#ifdef ENABLE_GPARITY
using namespace std;
using namespace Grid;
;
typedef GparityWilsonImplD FermionImplPolicyD;
typedef GparityMobiusEOFAFermionD FermionActionD;
@@ -231,3 +232,7 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize();
}
#else
int main(int argc,char ** argv) { return 0;};
#endif


@@ -31,14 +31,14 @@ See the full license in the file "LICENSE" in the top level distribution directo
using namespace std;
using namespace Grid;
;
typedef GparityWilsonImplR FermionImplPolicy;
typedef GparityMobiusEOFAFermionD FermionAction;
typedef typename FermionAction::FermionField FermionField;
int main (int argc, char** argv)
{
#ifdef ENABLE_GPARITY
Grid_init(&argc, &argv);
Coordinate latt_size = GridDefaultLatt();
@@ -171,4 +171,5 @@ int main (int argc, char** argv)
std::cout << GridLogMessage << "Done" << std::endl;
Grid_finalize();
#endif
}


@@ -30,7 +30,7 @@
using namespace Grid;
#ifdef ENABLE_GPARITY
template<typename FermionField2f, typename FermionField1f>
void copy2fTo1fFermionField(FermionField1f &out, const FermionField2f &in, int gpdir){
@@ -255,3 +255,6 @@ int main(int argc, char **argv) {
} // main
#else
int main(int argc, char **argv){};
#endif


@@ -30,6 +30,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
int main(int argc, char **argv) {
#ifdef ENABLE_GPARITY
using namespace Grid;
;
@@ -139,7 +140,7 @@ int main(int argc, char **argv) {
Grid_finalize();
#endif
} // main


@@ -55,13 +55,13 @@ namespace Grid{
};
struct SmearingParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(SmearingParameters,
struct HmcSmearingParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(HmcSmearingParameters,
double, rho,
Integer, Nsmear)
template <class ReaderClass >
SmearingParameters(Reader<ReaderClass>& Reader){
HmcSmearingParameters(Reader<ReaderClass>& Reader){
read(Reader, "StoutSmearing", *this);
}
@@ -213,7 +213,7 @@ int main(int argc, char **argv) {
// Reset performance counters
if (ApplySmearing){
SmearingParameters SmPar(Reader);
HmcSmearingParameters SmPar(Reader);
//double rho = 0.1; // smearing parameter
//int Nsmear = 3; // number of smearing levels
Smear_Stout<HMCWrapper::ImplPolicy> Stout(SmPar.rho);


@@ -0,0 +1,14 @@
<?xml version="1.0"?>
<grid>
<LanczosParameters>
<mass>0.00107</mass>
<M5>1.8</M5>
<Ls>48</Ls>
<Nstop>10</Nstop>
<Nk>15</Nk>
<Np>85</Np>
<ChebyLow>0.003</ChebyLow>
<ChebyHigh>60</ChebyHigh>
<ChebyOrder>201</ChebyOrder>
</LanczosParameters>
</grid>
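For orientation, a hedged reading of these parameters (this is the standard Chebyshev-filtered Lanczos setup, not something stated in the file itself): ChebyLow, ChebyHigh and ChebyOrder normally define a Chebyshev polynomial of the operator in which the interval [ChebyLow, ChebyHigh] is linearly mapped onto [-1, 1],

x -> (2x - (hi + lo)) / (hi - lo),   filter = T_201(mapped x),

so eigenvalues below lo = 0.003 are strongly amplified relative to the bulk up to hi = 60, and the restarted Lanczos (Nstop = 10, Nk = 15, Np = 85) converges on the lowest modes of the Ls = 48, mass = 0.00107 operator.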


@@ -35,6 +35,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/LocalCoherenceLanczos.h>
#ifdef ENABLE_GPARITY
using namespace std;
using namespace Grid;
@@ -378,7 +380,8 @@ void runTest(const Options &opt){
//Note: because we rely upon physical properties we must use a "real" gauge configuration
int main (int argc, char ** argv) {
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
GridLogIRL.TimingMode(1);
@@ -482,4 +485,8 @@ int main (int argc, char ** argv) {
Grid_finalize();
}
#else
int main(int argc, char **argv){};
#endif

View File

@@ -0,0 +1,428 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_G5R5.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
From Duo and Bob's Chirality study
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
//typedef WilsonFermionD FermionOp;
typedef DomainWallFermionD FermionOp;
typedef typename DomainWallFermionD::FermionField FermionField;
template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw an error?
}
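// Usage note: below, writeFile dumps the per-eigenvector densities and chiral densities
// to SciDAC/LIME files, along the lines of
//   auto evdensity = localInnerProduct(finalevec[i], finalevec[i]);
//   writeFile(evdensity, "./evec_density_" + std::to_string(i));
// When HAVE_LIME is not defined the body above is compiled out and nothing is written.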
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, M5 ,
Integer, Ls,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
int Ls=16;
RealD M5=1.8;
RealD mass = 0.01;
mass=LanParams.mass;
Ls=LanParams.Ls;
M5=LanParams.M5;
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
// GridCartesian* FGrid = UGrid;
// GridRedBlackCartesian* FrbGrid = UrbGrid;
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid); RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nk = 20;
int Nstop = Nk;
int Np = 80;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
int Nm = Nk + Np;
int MaxIt = 100;
RealD resid = 1.0e-4;
//while ( mass > - 5.0){
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
// Gamma5R5HermitianLinearOperator
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (G5R5Herm);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
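// Reading of this setup (a sketch of intent, not a specification): OpCheby applies the
// Chebyshev polynomial of MdagM over [ChebyLow, ChebyHigh], amplifying the low modes so
// the implicitly restarted Lanczos converges on the lowest Nstop of them; the plain Op
// is passed alongside so the true (unfiltered) eigenvalues can be evaluated.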
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
std::cout << " #evecs " << evec.size() << std::endl;
std::cout << " Nconv " << Nconv << std::endl;
std::cout << " Nm " << Nm << std::endl;
if ( Nconv > evec.size() ) Nconv = evec.size();
#if 0
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
// RealD eMe,eMMe;
for (int i = 0; i < Nstop ; i++) {
// tmp = g5*evec[i];
dot = innerProduct(evec[i],evec[i]);
// G5R5(tmp,evec[i]);
G5R5Herm.HermOpAndNorm(evec[i],tmp,eMe,eMMe);
std::cout <<"Norm "<<M5<<" "<< mass << " : " << i << " " << real(dot) << " " << imag(dot) << " "<< eMe << " " <<eMMe<< std::endl ;
for (int j = 0; j < Nstop ; j++) {
dot = innerProduct(tmp,evec[j]);
std::cout <<"G5R5 "<<M5<<" "<< mass << " : " << i << " " <<j<<" " << real(dot) << " " << imag(dot) << std::endl ;
}
}
// src = evec[0]+evec[1]+evec[2];
// mass += -0.1;
#endif
//**********************************************************************
//orthogonalization
//calculate the matrix
cout << "Start orthogonalization " << endl;
cout << "calculate the matrix element" << endl;
vector<LatticeFermion> G5R5Mevec(Nconv, FGrid);
vector<LatticeFermion> finalevec(Nconv, FGrid);
vector<RealD> eMe(Nconv), eMMe(Nconv);
for(int i = 0; i < Nconv; i++){
cout << "calculate the matrix element["<<i<<"]" << endl;
G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
cout << eMe << endl;
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl;
vector<vector<ComplexD>> VevecG5R5Mevec(Nconv);
Eigen::MatrixXcd evecG5R5Mevec = Eigen::MatrixXcd::Zero(Nconv, Nconv);
for(int i = 0; i < Nconv; i++){
VevecG5R5Mevec[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
VevecG5R5Mevec[i][j] = innerProduct(evec[i], G5R5Mevec[j]);
evecG5R5Mevec(i, j) = VevecG5R5Mevec[i][j];
}
}
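// evecG5R5Mevec(i,j) now holds <evec_i | Gamma5 R5 D | evec_j>, the projection of the
// G5R5-Hermitian operator onto the converged Lanczos subspace, ready for exact
// diagonalization with Eigen below.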
//calculate eigenvector
cout << "Eigen solver" << endl;
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXcd> eigensolver(evecG5R5Mevec);
vector<RealD> eigeneval(Nconv);
vector<vector<ComplexD>> eigenevec(Nconv);
for(int i = 0; i < Nconv; i++){
eigeneval[i] = eigensolver.eigenvalues()[i];
eigenevec[i].resize(Nconv);
for(int j = 0; j < Nconv; j++){
eigenevec[i][j] = eigensolver.eigenvectors()(i, j);
}
}
//rotation
cout << "Do rotation" << endl;
for(int i = 0; i < Nconv; i++){
// finalevec[i] = finalevec[i] - finalevec[i];
finalevec[i] = Zero();
for(int j = 0; j < Nconv; j++){
finalevec[i] = eigenevec[j][i]*evec[j] + finalevec[i];
}
}
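// The rotation recombines the Lanczos vectors with the eigenvectors of the small
// projected matrix, finalevec_i = sum_j eigenevec[j][i] * evec_j, giving approximate
// eigenvectors of Gamma5 R5 D within the computed subspace.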
//normalize again;
for(int i = 0; i < Nconv; i++){
RealD tmp_RealD = norm2(finalevec[i]);
tmp_RealD = 1./pow(tmp_RealD, 0.5);
finalevec[i] = finalevec[i]*tmp_RealD;
}
//check
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
//**********************************************************************
//sort the eigenvectors
vector<LatticeFermion> finalevec_copy(Nconv, FGrid);
for(int i = 0; i < Nconv; i++){
finalevec_copy[i] = finalevec[i];
}
vector<RealD> eMe_copy(eMe);
for(int i = 0; i < Nconv; i++){
eMe[i] = fabs(eMe[i]);
eMe_copy[i] = eMe[i];
}
sort(eMe_copy.begin(), eMe_copy.end());
for(int i = 0; i < Nconv; i++){
for(int j = 0; j < Nconv; j++){
if(eMe[j] == eMe_copy[i]){
finalevec[i] = finalevec_copy[j];
}
}
}
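// Sorting note: eMe has been overwritten with |<v, G5R5 D v>| and the vectors are
// reordered by ascending magnitude via exact floating-point matches against the sorted
// copy; degenerate values would make several slots pick the same source vector.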
for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
}
cout << "Re<evec, G5R5M(evec)>: " << endl;
cout << eMe << endl;
cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl;
// vector<LatticeFermion> finalevec(Nconv, FGrid);
// temporary, until doing rotation
// for(int i = 0; i < Nconv; i++)
// finalevec[i]=evec[i];
//**********************************************************************
//calculate chirality matrix
vector<LatticeFermion> G5evec(Nconv, FGrid);
vector<vector<ComplexD>> chiral_matrix(Nconv);
vector<vector<RealD>> chiral_matrix_real(Nconv);
for(int i = 0; i < Nconv; i++){
// G5evec[i] = G5evec[i] - G5evec[i];
G5evec[i] = Zero();
for(int j = 0; j < Ls/2; j++){
axpby_ssp(G5evec[i], 1., finalevec[i], 0., G5evec[i], j, j);
}
for(int j = Ls/2; j < Ls; j++){
axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
}
}
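// G5evec[j] is finalevec[j] with the sign flipped on the upper half of the fifth
// dimension (+1 for s < Ls/2, -1 for s >= Ls/2): the 5d chirality projection used below
// to form the chirality matrix <evec_i | G5 | evec_j> and the chiral density files.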
for(int i = 0; i < Nconv; i++){
Ddwf.M(finalevec[i], G5R5Mevec[i]);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|Ddwf|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){
RealD t1,t2;
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], t1, t2);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|G5R5 M|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){
chiral_matrix_real[i].resize(Nconv);
chiral_matrix[i].resize(Nconv);
std::string evfile("./evec_density");
evfile = evfile+"_"+std::to_string(i);
auto evdensity = localInnerProduct(finalevec[i],finalevec[i] );
writeFile(evdensity,evfile);
for(int j = 0; j < Nconv; j++){
chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
std::cout <<" chiral_matrix_real signed "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
if ( chiral_matrix_real[i][j] > 0.8 ) {
auto g5density = localInnerProduct(finalevec[i], G5evec[j]);
std::string chfile("./chiral_density_");
chfile = chfile +std::to_string(i)+"_"+std::to_string(j);
writeFile(g5density,chfile);
}
}
}
for(int i = 0; i < Nconv; i++){
if(chiral_matrix[i][i].real() < 0.){
chiral_matrix_real[i][i] = -1. * chiral_matrix_real[i][i];
}
}
FILE *fp = fopen("lego-plot.py","w"); assert(fp!=NULL);
#define PYTHON_LINE(A) fprintf(fp,A"\n");
PYTHON_LINE("import matplotlib.pyplot as plt");
PYTHON_LINE("import numpy as np");
PYTHON_LINE("");
PYTHON_LINE("fig = plt.figure()");
PYTHON_LINE("ax = fig.add_subplot(projection='3d')");
PYTHON_LINE("");
PYTHON_LINE("x, y = np.random.rand(2, 100) * 4");
fprintf(fp,"hist, xedges, yedges = np.histogram2d(x, y, bins=%d, range=[[0, %d], [0, %d]])\n",Nconv,Nconv-1,Nconv-1);
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays for the anchor positions of the 16 bars");
PYTHON_LINE("xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing=\"ij\")");
PYTHON_LINE("xpos = xpos.ravel()");
PYTHON_LINE("ypos = ypos.ravel()");
PYTHON_LINE("zpos = 0");
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays with the dimensions for the 16 bars.");
PYTHON_LINE("dx = dy = 0.5 * np.ones_like(zpos)");
PYTHON_LINE("dz = np.array([");
for(int i = 0; i < Nconv; i++){
fprintf(fp,"\t[ ");
for(int j = 0; j < Nconv; j++){
fprintf(fp,"%lf ",chiral_matrix_real[i][j]);
if(j<Nconv-1) fprintf(fp,",");
else fprintf(fp," ");
}
fprintf(fp,"]");
if(i<Nconv-1) fprintf(fp,",\n");
else fprintf(fp,"\n");
}
PYTHON_LINE("\t])");
PYTHON_LINE("dz = dz.ravel()");
PYTHON_LINE("ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')");
PYTHON_LINE("plt.show()");
fclose(fp);
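// The generated lego-plot.py is a small matplotlib script: the bar heights dz are the
// |chiral_matrix| entries printed above, one bar per (i,j) eigenvector pair, while the
// random histogram2d call only supplies the Nconv x Nconv grid of bar positions.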
Grid_finalize();
}

View File

@@ -29,11 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
using namespace std;
using namespace Grid;
;
template<typename Action>
struct Setup{};
#ifdef ENABLE_GPARITY
template<>
struct Setup<GparityMobiusFermionF>{
static GparityMobiusFermionF* getAction(LatticeGaugeFieldF &Umu,
@@ -47,16 +47,24 @@ struct Setup<GparityMobiusFermionF>{
return new GparityMobiusFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
}
};
#endif
template<>
struct Setup<DomainWallFermionF>{
static DomainWallFermionF* getAction(LatticeGaugeFieldF &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
RealD mass=0.00054;
RealD M5=1.8;
return new DomainWallFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
}
};
template<>
struct Setup<DomainWallFermionD>{
static DomainWallFermionD* getAction(LatticeGaugeField &Umu,
GridCartesian* FGrid, GridRedBlackCartesian* FrbGrid, GridCartesian* UGrid, GridRedBlackCartesian* UrbGrid){
RealD mass=0.00054;
RealD M5=1.8;
return new DomainWallFermionF(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
return new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
}
};
@@ -168,7 +176,9 @@ int main (int argc, char ** argv)
}
if(action == "GparityMobius"){
#ifdef ENABLE_GPARITY
run<GparityMobiusFermionF>();
#endif
}else if(action == "DWF"){
run<DomainWallFermionF>();
}else if(action == "Mobius"){

View File

@@ -555,6 +555,7 @@ int main (int argc, char ** argv) {
double c = (args.mobius_scale - bmc)/2.; // c = 1/2 [ (b+c) - (b-c) ]
if(is_gparity){
#ifdef ENABLE_GPARITY
GparityWilsonImplD::ImplParams Params = setupGparityParams(args.GparityDirs);
readConfiguration<ConjugateGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field
@@ -564,7 +565,10 @@ int main (int argc, char ** argv) {
}else if(action_s == "Mobius"){
GparityMobiusFermionD action(Umu, *FGrid, *FrbGrid, *UGrid, *UrbGrid, args.mass, args.M5, b, c, Params);
run(action, config, args);
}
}
#else
assert(0);
#endif
}else{
WilsonImplD::ImplParams Params = setupParams();
readConfiguration<PeriodicGimplD>(Umu, config, args.is_cps_cfg); //Read the gauge field

View File

@@ -0,0 +1,278 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
#if 0
template<typename Field>
class RationalHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
// OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
RealD _massDen, _massNum;
FunctionHermOp(LinearOperatorBase<Field>& linop, RealD massDen,RealD massNum)
: _Linop(linop) ,_massDen(massDen),_massNum(massNum) {};
void operator()(const Field& in, Field& out) {
// _poly(_Linop,in,out);
}
};
#endif
template<class Matrix,class Field>
class InvG5LinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _num;
RealD _Tol;
Integer _MaxIt;
Gamma g5;
public:
InvG5LinearOperator(Matrix &Mat,RealD num): _Mat(Mat),_num(num), _Tol(1e-12),_MaxIt(10000), g5(Gamma::Algebra::Gamma5) {};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0);
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
assert(0);
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
assert(0);
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
Field tmp(in.Grid());
MdagMLinearOperator<Matrix,Field> denom(_Mat);
ConjugateGradient<Field> CG(_Tol,_MaxIt);
_Mat.M(in,tmp);
tmp += _num*in;
_Mat.Mdag(tmp,out);
CG(denom,out,tmp);
out = g5*tmp;
}
};
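// Rough reading of HermOp above (hedged, assuming Grid's CG takes (src, solution) in
// that order): out = g5 * (MdagM)^{-1} Mdag (M + num) in = g5 * (1 + num * M^{-1}) in,
// i.e. a shifted inverse of the Wilson operator made gamma5-Hermitian by the final g5.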
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, resid,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 5;
int Nk = 10;
int Np = 90;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
resid=LanParams.resid;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
InvG5LinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator,-2.); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
// InvHermOp<FermionField> Op(WilsonOperator,HermOp);
PlainHermOp<FermionField> Op (HermOp);
// PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
}
Grid_finalize();
}

View File

@@ -0,0 +1,211 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 10;
int Nk = 20;
int Np = 80;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
// Chebyshev<FermionField> Cheby(0.5, 60., 31);
// RealD, ChebyLow,
// RealD, ChebyHigh,
// Integer, ChebyOrder)
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
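// Here the Chebyshev filter is built from MdagM (HermOp) while eigenvalues are
// evaluated against the gamma5-Hermitian operator HermOp2 = g5 D_W, so the printed
// eval are those of g5 D_W; this is a reading of the code, not a documented contract.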
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
}
Grid_finalize();
}

View File

@@ -33,8 +33,7 @@ namespace Grid{
GRID_SERIALIZABLE_CLASS_MEMBERS(WFParameters,
int, steps,
double, step_size,
int, meas_interval,
double, maxTau); // for the adaptive algorithm
int, meas_interval);
template <class ReaderClass >
@@ -86,7 +85,7 @@ int main(int argc, char **argv) {
WFParameters WFPar(Reader);
ConfParameters CPar(Reader);
CheckpointerParameters CPPar(CPar.conf_prefix, CPar.rng_prefix);
BinaryHmcCheckpointer<PeriodicGimplR> CPBin(CPPar);
NerscHmcCheckpointer<PeriodicGimplR> CPBin(CPPar);
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
@@ -96,19 +95,13 @@ int main(int argc, char **argv) {
std::cout << GridLogMessage << "Initial plaquette: "
<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
int t=WFPar.maxTau;
WilsonFlowAdaptive<PeriodicGimplR> WF(WFPar.step_size, WFPar.maxTau,
1.0e-4,
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size, WFPar.steps,
WFPar.meas_interval);
WF.smear(Uflow, Umu);
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold

View File

@@ -1,4 +1,4 @@
*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid

Some files were not shown because too many files have changed in this diff.