mirror of
https://github.com/paboyle/Grid.git
synced 2025-11-05 22:39:32 +00:00
Compare commits
161 Commits
9fa8bd6438
...
specflow
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7780d88d26 | ||
|
|
2bf9179d2c | ||
|
|
c606f5dca0 | ||
|
|
8419cc5c64 | ||
|
|
2cc6deb8e0 | ||
|
|
19d0590579 | ||
|
|
677b4cc5b0 | ||
|
|
be565ffab6 | ||
|
|
df6120e5f6 | ||
|
|
21de6f7da8 | ||
|
|
dbe39f9ce0 | ||
|
|
ab3de50d5e | ||
|
|
c545bd2139 | ||
|
|
6a1c64fbdd | ||
|
|
b75809ed61 | ||
|
|
ecaf228e5c | ||
|
|
6d015ae8fc | ||
|
|
233150d93f | ||
|
|
7af8c77a52 | ||
|
|
a957e7bfa1 | ||
|
|
cee4c8ce8c | ||
|
|
96bf814d8c | ||
|
|
7ddc422788 | ||
|
|
e652fc2825 | ||
|
|
a49fa3f8d0 | ||
|
|
cd452a2f91 | ||
|
|
4f89f603ae | ||
|
|
11dc2c5e1d | ||
|
|
6fec3c15ca | ||
|
|
938c47480f | ||
|
|
3811d19298 | ||
|
|
83a3ab6b6f | ||
|
|
d66a9af6a3 | ||
|
|
adc90d3a86 | ||
|
|
ebbd015c5c | ||
|
|
4ab73b36b2 | ||
|
|
130e07a422 | ||
|
|
8f47bb367e | ||
|
|
0c3cb60135 | ||
|
|
9eae8fca5d | ||
|
|
882a217074 | ||
|
|
e465fce201 | ||
|
|
d41542c64b | ||
|
|
199818bd6c | ||
|
|
fe66c7ca30 | ||
|
|
e9177e4af3 | ||
|
|
d15a6c5933 | ||
| 25ab9325e7 | |||
| 19f9378b98 | |||
|
|
785bc7a14f | ||
|
|
1a1fe85428 | ||
|
|
0000d2e558 | ||
|
|
9ffd1ed4ce | ||
|
|
3d014864e2 | ||
| 1d22841811 | |||
|
|
a1cdda833f | ||
|
|
ad6db92690 | ||
|
|
e8ff9d8e50 | ||
|
|
795769c636 | ||
|
|
267a39d943 | ||
|
|
3624bd3d22 | ||
|
|
bc12dbbb38 | ||
|
|
eb8a008a8f | ||
| c4d9aa1a21 | |||
| 6ae809ed40 | |||
|
|
311e2aab3f | ||
| 438dfbdb83 | |||
| b2ce760cf4 | |||
|
|
b1ba209696 | ||
|
|
cb3e529b1e | ||
|
|
717f647418 | ||
|
|
98e7418187 | ||
|
|
fe05bf48b1 | ||
|
|
d2dd8f54e2 | ||
|
|
7726ee4b16 | ||
| ba9bbe0221 | |||
| 4c3dd82d84 | |||
| 44e911b5b7 | |||
| a7a16df9d0 | |||
| 382e0abefd | |||
| 6fdefe5b90 | |||
| 4788dd8e2e | |||
| 1cc5f221f3 | |||
| 93251bfba0 | |||
| 18b79508b8 | |||
| 4de5ed1613 | |||
| 0baaddbe98 | |||
| 8729c46169 | |||
| 09f81fe7c3 | |||
| 1876e5b7c0 | |||
|
|
355ec76257 | ||
| b50fb34e71 | |||
| de84d730ff | |||
|
|
c74d11e3d7 | ||
|
|
84cab5e6e7 | ||
| c4fc972fec | |||
| 8cf809e231 | |||
| 94019a922e | |||
|
|
4f17c8d081 | ||
|
|
aaab753982 | ||
| d6b2727f86 | |||
| 74a4f43946 | |||
| 1caf8b0f86 | |||
|
|
570b72a47b | ||
|
|
a5798a89ed | ||
|
|
3f3661a86f | ||
|
|
f7e2f9a401 | ||
|
|
2848a9b558 | ||
|
|
d4868991af | ||
|
|
e99d42404e | ||
|
|
3ba019c747 | ||
|
|
47429218bb | ||
| 8fe429346f | |||
|
|
5a4f9bf2e3 | ||
|
|
b91fc1b6b4 | ||
|
|
eafc150034 | ||
|
|
2877f1a268 | ||
|
|
1e893af775 | ||
|
|
d9f430a575 | ||
|
|
63abe87f36 | ||
|
|
368d649c8a | ||
|
|
5603464f39 | ||
|
|
655c79f39e | ||
|
|
565b231c03 | ||
|
|
62a9f180fa | ||
|
|
5ae77876a8 | ||
|
|
4ed2c2c74f | ||
|
|
955da582b6 | ||
|
|
11b07b950d | ||
|
|
8f70cfeda9 | ||
|
|
ce64271048 | ||
| 5cc4f3241d | |||
|
|
6815e138b4 | ||
| a78a61d76f | |||
| 2eff3f34ed | |||
| 03687c1d62 | |||
| febfe4e77f | |||
| 4d1aa134b5 | |||
| 5ec879860a | |||
|
|
f617468e04 | ||
| b728af903c | |||
| 54f1999030 | |||
| fd58f0b669 | |||
| c5c67b706e | |||
| be7a543e2c | |||
| 68f112d576 | |||
| ec1395a304 | |||
| beb0e474ee | |||
| 2b5fdcbbc5 | |||
| 295127d456 | |||
| 7dcfb13694 | |||
|
|
ee4046fe92 | ||
|
|
2a9cfeb9ea | ||
|
|
1147b8ea40 | ||
|
|
3f9119b39d | ||
|
|
35e8225abd | ||
|
|
bdbfbb7a14 | ||
|
|
f7d4be8d96 | ||
| 8d305df0db | |||
|
|
e29b97b3ea | ||
|
|
ad2b699d2b |
@@ -73,6 +73,7 @@ NAMESPACE_CHECK(BiCGSTAB);
|
||||
#include <Grid/algorithms/iterative/FlexibleCommunicationAvoidingGeneralisedMinimalResidual.h>
|
||||
#include <Grid/algorithms/iterative/MixedPrecisionFlexibleGeneralisedMinimalResidual.h>
|
||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||
#include <Grid/algorithms/iterative/SimpleLanczos.h>
|
||||
#include <Grid/algorithms/iterative/PowerMethod.h>
|
||||
#include <Grid/algorithms/iterative/AdefGeneric.h>
|
||||
#include <Grid/algorithms/iterative/AdefMrhs.h>
|
||||
|
||||
@@ -191,7 +191,7 @@ public:
|
||||
|
||||
Lattice<sobj> pgbuf(&pencil_g);
|
||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||
std::cout << "CPU view" << std::endl;
|
||||
//std::cout << "CPU view" << std::endl;
|
||||
|
||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||
@@ -215,7 +215,7 @@ public:
|
||||
else if ( sign == forward ) div = 1.0;
|
||||
else assert(0);
|
||||
|
||||
std::cout << "Making FFTW plan" << std::endl;
|
||||
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
||||
FFTW_plan p;
|
||||
{
|
||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
||||
@@ -229,7 +229,7 @@ public:
|
||||
}
|
||||
|
||||
// Barrel shift and collect global pencil
|
||||
std::cout << "Making pencil" << std::endl;
|
||||
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
||||
Coordinate lcoor(Nd), gcoor(Nd);
|
||||
result = source;
|
||||
int pc = processor_coor[dim];
|
||||
@@ -251,7 +251,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Looping orthog" << std::endl;
|
||||
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
|
||||
// Loop over orthog coords
|
||||
int NN=pencil_g.lSites();
|
||||
GridStopWatch timer;
|
||||
@@ -274,7 +274,7 @@ public:
|
||||
usec += timer.useconds();
|
||||
flops+= flops_call*NN;
|
||||
|
||||
std::cout << "Writing back results " << std::endl;
|
||||
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
|
||||
// writing out result
|
||||
{
|
||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||
@@ -291,7 +291,7 @@ public:
|
||||
}
|
||||
result = result*div;
|
||||
|
||||
std::cout << "Destroying plan " << std::endl;
|
||||
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
|
||||
// destroying plan
|
||||
FFTW<scalar>::fftw_destroy_plan(p);
|
||||
#endif
|
||||
|
||||
@@ -277,6 +277,38 @@ public:
|
||||
assert(0);
|
||||
}
|
||||
};
|
||||
template<class Matrix,class Field>
|
||||
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
|
||||
Matrix &_Mat;
|
||||
RealD shift;
|
||||
public:
|
||||
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
|
||||
// Support for coarsening to a multigrid
|
||||
void OpDiag (const Field &in, Field &out) {
|
||||
_Mat.Mdiag(in,out);
|
||||
out = out + shift*in;
|
||||
}
|
||||
void OpDir (const Field &in, Field &out,int dir,int disp) {
|
||||
_Mat.Mdir(in,out,dir,disp);
|
||||
}
|
||||
void OpDirAll (const Field &in, std::vector<Field> &out){
|
||||
_Mat.MdirAll(in,out);
|
||||
};
|
||||
void Op (const Field &in, Field &out){
|
||||
_Mat.M(in,out);
|
||||
out = out + shift * in;
|
||||
}
|
||||
void AdjOp (const Field &in, Field &out){
|
||||
_Mat.Mdag(in,out);
|
||||
out = out + shift * in;
|
||||
}
|
||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||
assert(0);
|
||||
}
|
||||
void HermOp(const Field &in, Field &out){
|
||||
assert(0);
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Even Odd Schur decomp operators; there are several
|
||||
|
||||
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
|
||||
typedef cublasHandle_t gridblasHandle_t;
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
typedef cl::sycl::queue *gridblasHandle_t;
|
||||
typedef sycl::queue *gridblasHandle_t;
|
||||
#endif
|
||||
#ifdef GRID_ONE_MKL
|
||||
typedef cl::sycl::queue *gridblasHandle_t;
|
||||
typedef sycl::queue *gridblasHandle_t;
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
|
||||
typedef int32_t gridblasHandle_t;
|
||||
@@ -89,9 +89,9 @@ public:
|
||||
gridblasHandle = theGridAccelerator;
|
||||
#endif
|
||||
#ifdef GRID_ONE_MKL
|
||||
cl::sycl::gpu_selector selector;
|
||||
cl::sycl::device selectedDevice { selector };
|
||||
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
|
||||
sycl::gpu_selector selector;
|
||||
sycl::device selectedDevice { selector };
|
||||
sycl::property_list q_prop{sycl::property::queue::in_order()};
|
||||
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
|
||||
#endif
|
||||
gridblasInit=1;
|
||||
@@ -208,8 +208,8 @@ public:
|
||||
assert(Bkn.size()==batchCount);
|
||||
assert(Cmn.size()==batchCount);
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
//assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
@@ -367,28 +367,67 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
@@ -414,8 +453,8 @@ public:
|
||||
RealD t2=usecond();
|
||||
int32_t batchCount = Amk.size();
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
//assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
@@ -514,28 +553,70 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
@@ -661,29 +742,41 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
@@ -809,28 +902,40 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
|
||||
@@ -144,11 +144,11 @@ public:
|
||||
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
|
||||
}
|
||||
RealD t4 = usecond();
|
||||
std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
||||
}
|
||||
|
||||
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
|
||||
@@ -242,16 +242,16 @@ public:
|
||||
RealD flops = 8.0*M*N*K;
|
||||
flops = flops/(t4-t3)/1.e3;
|
||||
bytes = bytes/(t4-t3)/1.e3;
|
||||
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
#else
|
||||
int nrhs;
|
||||
GridBase *grid;
|
||||
@@ -358,17 +358,17 @@ public:
|
||||
flops = flops/(t4-t3)/1.e3;
|
||||
bytes = bytes/(t4-t3)/1.e3;
|
||||
xybytes = 4*xybytes/(t2-t1)/1.e3;
|
||||
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
|
||||
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
@@ -63,7 +63,12 @@ class TwoLevelCGmrhs
|
||||
GridStopWatch SmoothTimer;
|
||||
GridStopWatch InsertTimer;
|
||||
|
||||
|
||||
/*
|
||||
Field rrr;
|
||||
Field sss;
|
||||
Field qqq;
|
||||
Field zzz;
|
||||
*/
|
||||
// more most opertor functions
|
||||
TwoLevelCGmrhs(RealD tol,
|
||||
Integer maxit,
|
||||
@@ -74,6 +79,12 @@ class TwoLevelCGmrhs
|
||||
MaxIterations(maxit),
|
||||
_FineLinop(FineLinop),
|
||||
_Smoother(Smoother)
|
||||
/*
|
||||
rrr(fine),
|
||||
sss(fine),
|
||||
qqq(fine),
|
||||
zzz(fine)
|
||||
*/
|
||||
{
|
||||
grid = fine;
|
||||
};
|
||||
@@ -81,8 +92,8 @@ class TwoLevelCGmrhs
|
||||
// Vector case
|
||||
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
|
||||
{
|
||||
SolveSingleSystem(src,x);
|
||||
// SolvePrecBlockCG(src,x);
|
||||
// SolveSingleSystem(src,x);
|
||||
SolvePrecBlockCG(src,x);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -657,6 +668,8 @@ public:
|
||||
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
||||
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
||||
|
||||
// this->rrr=in[0];
|
||||
|
||||
#undef SMOOTHER_BLOCK_SOLVE
|
||||
#if SMOOTHER_BLOCK_SOLVE
|
||||
this->SmoothTimer.Start();
|
||||
@@ -669,6 +682,7 @@ public:
|
||||
this->SmoothTimer.Stop();
|
||||
}
|
||||
#endif
|
||||
// this->sss=Min[0];
|
||||
|
||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||
|
||||
@@ -705,9 +719,11 @@ public:
|
||||
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
|
||||
this->PromoteTimer.Stop();
|
||||
this->FineTimer.Start();
|
||||
// this->qqq=tmp[0];
|
||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
|
||||
}
|
||||
// this->zzz=out[0];
|
||||
this->FineTimer.Stop();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
|
||||
//Compute double precision rsd and also new RHS vector.
|
||||
Linop_d.HermOp(sol_d, tmp_d);
|
||||
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
|
||||
|
||||
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
|
||||
|
||||
if(norm < OuterLoopNormMult * stop){
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
|
||||
break;
|
||||
}
|
||||
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
||||
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
||||
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
|
||||
|
||||
@@ -245,9 +245,10 @@ until convergence
|
||||
_HermOp(src_n,tmp);
|
||||
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
|
||||
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
|
||||
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
||||
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
||||
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
|
||||
RealD vden = norm2(src_n);
|
||||
RealD na = vnum/vden;
|
||||
RealD na = std::sqrt(vnum/vden);
|
||||
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
|
||||
i=_MAX_ITER_IRL_MEVAPP_;
|
||||
evalMaxApprox = na;
|
||||
@@ -255,6 +256,7 @@ until convergence
|
||||
src_n = tmp;
|
||||
}
|
||||
}
|
||||
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
|
||||
|
||||
std::vector<RealD> lme(Nm);
|
||||
std::vector<RealD> lme2(Nm);
|
||||
|
||||
@@ -74,7 +74,7 @@ public:
|
||||
|
||||
void operator() (const Field &src, Field &psi){
|
||||
|
||||
psi=Zero();
|
||||
// psi=Zero();
|
||||
RealD cp, ssq,rsq;
|
||||
ssq=norm2(src);
|
||||
rsq=Tolerance*Tolerance*ssq;
|
||||
|
||||
931
Grid/algorithms/iterative/SimpleLanczos.h
Normal file
931
Grid/algorithms/iterative/SimpleLanczos.h
Normal file
@@ -0,0 +1,931 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./Grid/algorithms/iterative/SimpleLanczos.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Chulwoo Jung <chulwoo@bnl.gov>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#ifndef GRID_LANC_H
|
||||
#define GRID_LANC_H
|
||||
|
||||
#include <string.h> //memset
|
||||
|
||||
#ifdef USE_LAPACK
|
||||
#ifdef USE_MKL
|
||||
#include<mkl_lapack.h>
|
||||
#else
|
||||
void LAPACK_dstegr (char *jobz, char *range, int *n, double *d, double *e,
|
||||
double *vl, double *vu, int *il, int *iu, double *abstol,
|
||||
int *m, double *w, double *z, int *ldz, int *isuppz,
|
||||
double *work, int *lwork, int *iwork, int *liwork,
|
||||
int *info);
|
||||
//#include <lapacke/lapacke.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//#include <Grid/algorithms/densematrix/DenseMatrix.h>
|
||||
|
||||
// eliminate temporary vector in calc()
|
||||
#define MEM_SAVE
|
||||
|
||||
namespace Grid
|
||||
{
|
||||
|
||||
struct Bisection
|
||||
{
|
||||
|
||||
#if 0
|
||||
static void get_eig2 (int row_num, std::vector < RealD > &ALPHA,
|
||||
std::vector < RealD > &BETA,
|
||||
std::vector < RealD > &eig)
|
||||
{
|
||||
int i, j;
|
||||
std::vector < RealD > evec1 (row_num + 3);
|
||||
std::vector < RealD > evec2 (row_num + 3);
|
||||
RealD eps2;
|
||||
ALPHA[1] = 0.;
|
||||
BETHA[1] = 0.;
|
||||
for (i = 0; i < row_num - 1; i++)
|
||||
{
|
||||
ALPHA[i + 1] = A[i * (row_num + 1)].real ();
|
||||
BETHA[i + 2] = A[i * (row_num + 1) + 1].real ();
|
||||
}
|
||||
ALPHA[row_num] = A[(row_num - 1) * (row_num + 1)].real ();
|
||||
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-10, 1e-10, evec1, eps2);
|
||||
bisec (ALPHA, BETHA, row_num, 1, row_num, 1e-16, 1e-16, evec2, eps2);
|
||||
|
||||
// Do we really need to sort here?
|
||||
int begin = 1;
|
||||
int end = row_num;
|
||||
int swapped = 1;
|
||||
while (swapped)
|
||||
{
|
||||
swapped = 0;
|
||||
for (i = begin; i < end; i++)
|
||||
{
|
||||
if (mag (evec2[i]) > mag (evec2[i + 1]))
|
||||
{
|
||||
swap (evec2 + i, evec2 + i + 1);
|
||||
swapped = 1;
|
||||
}
|
||||
}
|
||||
end--;
|
||||
for (i = end - 1; i >= begin; i--)
|
||||
{
|
||||
if (mag (evec2[i]) > mag (evec2[i + 1]))
|
||||
{
|
||||
swap (evec2 + i, evec2 + i + 1);
|
||||
swapped = 1;
|
||||
}
|
||||
}
|
||||
begin++;
|
||||
}
|
||||
|
||||
for (i = 0; i < row_num; i++)
|
||||
{
|
||||
for (j = 0; j < row_num; j++)
|
||||
{
|
||||
if (i == j)
|
||||
H[i * row_num + j] = evec2[i + 1];
|
||||
else
|
||||
H[i * row_num + j] = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static void bisec (std::vector < RealD > &c,
|
||||
std::vector < RealD > &b,
|
||||
int n,
|
||||
int m1,
|
||||
int m2,
|
||||
RealD eps1,
|
||||
RealD relfeh, std::vector < RealD > &x, RealD & eps2)
|
||||
{
|
||||
std::vector < RealD > wu (n + 2);
|
||||
|
||||
RealD h, q, x1, xu, x0, xmin, xmax;
|
||||
int i, a, k;
|
||||
|
||||
b[1] = 0.0;
|
||||
xmin = c[n] - fabs (b[n]);
|
||||
xmax = c[n] + fabs (b[n]);
|
||||
for (i = 1; i < n; i++)
|
||||
{
|
||||
h = fabs (b[i]) + fabs (b[i + 1]);
|
||||
if (c[i] + h > xmax)
|
||||
xmax = c[i] + h;
|
||||
if (c[i] - h < xmin)
|
||||
xmin = c[i] - h;
|
||||
}
|
||||
xmax *= 2.;
|
||||
|
||||
eps2 = relfeh * ((xmin + xmax) > 0.0 ? xmax : -xmin);
|
||||
if (eps1 <= 0.0)
|
||||
eps1 = eps2;
|
||||
eps2 = 0.5 * eps1 + 7.0 * (eps2);
|
||||
x0 = xmax;
|
||||
for (i = m1; i <= m2; i++)
|
||||
{
|
||||
x[i] = xmax;
|
||||
wu[i] = xmin;
|
||||
}
|
||||
|
||||
for (k = m2; k >= m1; k--)
|
||||
{
|
||||
xu = xmin;
|
||||
i = k;
|
||||
do
|
||||
{
|
||||
if (xu < wu[i])
|
||||
{
|
||||
xu = wu[i];
|
||||
i = m1 - 1;
|
||||
}
|
||||
i--;
|
||||
}
|
||||
while (i >= m1);
|
||||
if (x0 > x[k])
|
||||
x0 = x[k];
|
||||
while ((x0 - xu) > 2 * relfeh * (fabs (xu) + fabs (x0)) + eps1)
|
||||
{
|
||||
x1 = (xu + x0) / 2;
|
||||
|
||||
a = 0;
|
||||
q = 1.0;
|
||||
for (i = 1; i <= n; i++)
|
||||
{
|
||||
q =
|
||||
c[i] - x1 -
|
||||
((q != 0.0) ? b[i] * b[i] / q : fabs (b[i]) / relfeh);
|
||||
if (q < 0)
|
||||
a++;
|
||||
}
|
||||
// printf("x1=%0.14e a=%d\n",x1,a);
|
||||
if (a < k)
|
||||
{
|
||||
if (a < m1)
|
||||
{
|
||||
xu = x1;
|
||||
wu[m1] = x1;
|
||||
}
|
||||
else
|
||||
{
|
||||
xu = x1;
|
||||
wu[a + 1] = x1;
|
||||
if (x[a] > x1)
|
||||
x[a] = x1;
|
||||
}
|
||||
}
|
||||
else
|
||||
x0 = x1;
|
||||
}
|
||||
printf ("x0=%0.14e xu=%0.14e k=%d\n", x0, xu, k);
|
||||
x[k] = (x0 + xu) / 2;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// Implicitly restarted lanczos
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
template < class Field > class SimpleLanczos
|
||||
{
|
||||
|
||||
const RealD small = 1.0e-16;
|
||||
public:
|
||||
int lock;
|
||||
int get;
|
||||
int Niter;
|
||||
int converged;
|
||||
|
||||
int Nstop; // Number of evecs checked for convergence
|
||||
int Nk; // Number of converged sought
|
||||
int Np; // Np -- Number of spare vecs in kryloc space
|
||||
int Nm; // Nm -- total number of vectors
|
||||
|
||||
|
||||
RealD OrthoTime;
|
||||
|
||||
RealD eresid;
|
||||
|
||||
// SortEigen < Field > _sort;
|
||||
|
||||
LinearFunction < Field > &_Linop;
|
||||
|
||||
// OperatorFunction < Field > &_poly;
|
||||
|
||||
/////////////////////////
|
||||
// Constructor
|
||||
/////////////////////////
|
||||
void init (void)
|
||||
{
|
||||
};
|
||||
// void Abort (int ff, std::vector < RealD > &evals, DenseVector < Denstd::vector < RealD > >&evecs);
|
||||
|
||||
// Builds a Lanczos driver around Linop.
//
//   Linop    operator applied to the current vector in step()
//   _Nstop   stored in Nstop (number of evecs checked for convergence)
//   _Nk      stored in Nk
//   _Nm      stored in Nm (number of Lanczos steps per pass of calc());
//            must be larger than _Nk
//   _eresid  stored in eresid (relative eigenvalue-change threshold in calc())
//   _Niter   stored in Niter
//
// Every data member is initialised, in declaration order, so that no member
// is ever read while indeterminate (orthogonalize() accumulates into
// OrthoTime, for instance).
SimpleLanczos (LinearFunction < Field > &Linop,       // op
               // OperatorFunction < Field > &poly,   // polynomial
               int _Nstop,
               int _Nk,
               int _Nm,
               RealD _eresid,
               int _Niter):
  lock (0),
  get (0),
  Niter (_Niter),
  converged (0),
  Nstop (_Nstop),
  Nk (_Nk),
  Np (_Nm - _Nk),
  Nm (_Nm),
  OrthoTime (0.),
  eresid (_eresid),
  _Linop (Linop)
  // _poly (poly),
{
  assert (Np > 0);
}
|
||||
|
||||
/////////////////////////
|
||||
// Sanity checked this routine (step) against Saad.
|
||||
/////////////////////////
|
||||
void RitzMatrix (std::vector < Field > &evec, int k)
|
||||
{
|
||||
|
||||
if (1)
|
||||
return;
|
||||
|
||||
GridBase *grid = evec[0].Grid();
|
||||
Field w (grid);
|
||||
std::cout << GridLogMessage << "RitzMatrix " << std::endl;
|
||||
for (int i = 0; i < k; i++)
|
||||
{
|
||||
_Linop(evec[i], w);
|
||||
// _poly(_Linop,evec[i],w);
|
||||
std::cout << GridLogMessage << "[" << i << "] ";
|
||||
for (int j = 0; j < k; j++)
|
||||
{
|
||||
ComplexD in = innerProduct (evec[j], w);
|
||||
if (fabs ((double) i - j) > 1)
|
||||
{
|
||||
if (abs (in) > 1.0e-9)
|
||||
{
|
||||
std::cout << GridLogMessage << "oops" << std::endl;
|
||||
abort ();
|
||||
}
|
||||
else
|
||||
std::cout << GridLogMessage << " 0 ";
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << GridLogMessage << " " << in << " ";
|
||||
}
|
||||
}
|
||||
std::cout << GridLogMessage << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void step (std::vector < RealD > &lmd,
|
||||
std::vector < RealD > &lme,
|
||||
Field & last, Field & current, Field & next, uint64_t k)
|
||||
{
|
||||
if (lmd.size () <= k)
|
||||
lmd.resize (k + Nm);
|
||||
if (lme.size () <= k)
|
||||
lme.resize (k + Nm);
|
||||
|
||||
|
||||
// _poly(_Linop,current,next ); // 3. wk:=Avk−βkv_{k−1}
|
||||
_Linop(current, next); // 3. wk:=Avk−βkv_{k−1}
|
||||
if (k > 0)
|
||||
{
|
||||
next -= lme[k - 1] * last;
|
||||
}
|
||||
// std::cout<<GridLogMessage << "<last|next>" << innerProduct(last,next) <<std::endl;
|
||||
|
||||
ComplexD zalph = innerProduct (current, next); // 4. αk:=(wk,vk)
|
||||
RealD alph = real (zalph);
|
||||
|
||||
next = next - alph * current; // 5. wk:=wk−αkvk
|
||||
// std::cout<<GridLogMessage << "<current|next>" << innerProduct(current,next) <<std::endl;
|
||||
|
||||
RealD beta = normalise (next); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
|
||||
// 7. vk+1 := wk/βk+1
|
||||
// norm=beta;
|
||||
|
||||
int interval = Nm / 100 + 1;
|
||||
if ((k % interval) == 0)
|
||||
std::
|
||||
cout << GridLogMessage << k << " : alpha = " << zalph << " beta " <<
|
||||
beta << std::endl;
|
||||
const RealD tiny = 1.0e-20;
|
||||
if (beta < tiny)
|
||||
{
|
||||
std::cout << GridLogMessage << " beta is tiny " << beta << std::
|
||||
endl;
|
||||
}
|
||||
lmd[k] = alph;
|
||||
lme[k] = beta;
|
||||
|
||||
}
|
||||
|
||||
void qr_decomp (std::vector < RealD > &lmd,
|
||||
std::vector < RealD > &lme,
|
||||
int Nk,
|
||||
int Nm,
|
||||
std::vector < RealD > &Qt, RealD Dsh, int kmin, int kmax)
|
||||
{
|
||||
int k = kmin - 1;
|
||||
RealD x;
|
||||
|
||||
RealD Fden = 1.0 / hypot (lmd[k] - Dsh, lme[k]);
|
||||
RealD c = (lmd[k] - Dsh) * Fden;
|
||||
RealD s = -lme[k] * Fden;
|
||||
|
||||
RealD tmpa1 = lmd[k];
|
||||
RealD tmpa2 = lmd[k + 1];
|
||||
RealD tmpb = lme[k];
|
||||
|
||||
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
|
||||
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
|
||||
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
|
||||
x = -s * lme[k + 1];
|
||||
lme[k + 1] = c * lme[k + 1];
|
||||
|
||||
for (int i = 0; i < Nk; ++i)
|
||||
{
|
||||
RealD Qtmp1 = Qt[i + Nm * k];
|
||||
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
|
||||
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
|
||||
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
|
||||
}
|
||||
|
||||
// Givens transformations
|
||||
for (int k = kmin; k < kmax - 1; ++k)
|
||||
{
|
||||
|
||||
RealD Fden = 1.0 / hypot (x, lme[k - 1]);
|
||||
RealD c = lme[k - 1] * Fden;
|
||||
RealD s = -x * Fden;
|
||||
|
||||
RealD tmpa1 = lmd[k];
|
||||
RealD tmpa2 = lmd[k + 1];
|
||||
RealD tmpb = lme[k];
|
||||
|
||||
lmd[k] = c * c * tmpa1 + s * s * tmpa2 - 2.0 * c * s * tmpb;
|
||||
lmd[k + 1] = s * s * tmpa1 + c * c * tmpa2 + 2.0 * c * s * tmpb;
|
||||
lme[k] = c * s * (tmpa1 - tmpa2) + (c * c - s * s) * tmpb;
|
||||
lme[k - 1] = c * lme[k - 1] - s * x;
|
||||
|
||||
if (k != kmax - 2)
|
||||
{
|
||||
x = -s * lme[k + 1];
|
||||
lme[k + 1] = c * lme[k + 1];
|
||||
}
|
||||
|
||||
for (int i = 0; i < Nk; ++i)
|
||||
{
|
||||
RealD Qtmp1 = Qt[i + Nm * k];
|
||||
RealD Qtmp2 = Qt[i + Nm * (k + 1)];
|
||||
Qt[i + Nm * k] = c * Qtmp1 - s * Qtmp2;
|
||||
Qt[i + Nm * (k + 1)] = s * Qtmp1 + c * Qtmp2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
#ifdef USE_LAPACK
|
||||
#ifdef USE_MKL
|
||||
#define LAPACK_INT MKL_INT
|
||||
#else
|
||||
#define LAPACK_INT long long
|
||||
#endif
|
||||
void diagonalize_lapack (std::vector < RealD > &lmd, std::vector < RealD > &lme, int N1, // all
|
||||
int N2, // get
|
||||
GridBase * grid)
|
||||
{
|
||||
const int size = Nm;
|
||||
LAPACK_INT NN = N1;
|
||||
double evals_tmp[NN];
|
||||
double DD[NN];
|
||||
double EE[NN];
|
||||
for (int i = 0; i < NN; i++)
|
||||
for (int j = i - 1; j <= i + 1; j++)
|
||||
if (j < NN && j >= 0)
|
||||
{
|
||||
if (i == j)
|
||||
DD[i] = lmd[i];
|
||||
if (i == j)
|
||||
evals_tmp[i] = lmd[i];
|
||||
if (j == (i - 1))
|
||||
EE[j] = lme[j];
|
||||
}
|
||||
LAPACK_INT evals_found;
|
||||
LAPACK_INT lwork =
|
||||
((18 * NN) >
|
||||
(1 + 4 * NN + NN * NN) ? (18 * NN) : (1 + 4 * NN + NN * NN));
|
||||
LAPACK_INT liwork = 3 + NN * 10;
|
||||
LAPACK_INT iwork[liwork];
|
||||
double work[lwork];
|
||||
LAPACK_INT isuppz[2 * NN];
|
||||
char jobz = 'N'; // calculate evals only
|
||||
char range = 'I'; // calculate il-th to iu-th evals
|
||||
// char range = 'A'; // calculate all evals
|
||||
char uplo = 'U'; // refer to upper half of original matrix
|
||||
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
|
||||
int ifail[NN];
|
||||
LAPACK_INT info;
|
||||
// int total = QMP_get_number_of_nodes();
|
||||
// int node = QMP_get_node_number();
|
||||
// GridBase *grid = evec[0]._grid;
|
||||
int total = grid->_Nprocessors;
|
||||
int node = grid->_processor;
|
||||
int interval = (NN / total) + 1;
|
||||
double vl = 0.0, vu = 0.0;
|
||||
LAPACK_INT il = interval * node + 1, iu = interval * (node + 1);
|
||||
if (iu > NN)
|
||||
iu = NN;
|
||||
double tol = 0.0;
|
||||
if (1)
|
||||
{
|
||||
memset (evals_tmp, 0, sizeof (double) * NN);
|
||||
if (il <= NN)
|
||||
{
|
||||
printf ("total=%d node=%d il=%d iu=%d\n", total, node, il, iu);
|
||||
#ifdef USE_MKL
|
||||
dstegr (&jobz, &range, &NN,
|
||||
#else
|
||||
LAPACK_dstegr (&jobz, &range, &NN,
|
||||
#endif
|
||||
(double *) DD, (double *) EE, &vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
|
||||
&tol, // tolerance
|
||||
&evals_found, evals_tmp, (double *) NULL, &NN,
|
||||
isuppz, work, &lwork, iwork, &liwork, &info);
|
||||
for (int i = iu - 1; i >= il - 1; i--)
|
||||
{
|
||||
printf ("node=%d evals_found=%d evals_tmp[%d] = %g\n", node,
|
||||
evals_found, i - (il - 1), evals_tmp[i - (il - 1)]);
|
||||
evals_tmp[i] = evals_tmp[i - (il - 1)];
|
||||
if (il > 1)
|
||||
evals_tmp[i - (il - 1)] = 0.;
|
||||
}
|
||||
}
|
||||
{
|
||||
grid->GlobalSumVector (evals_tmp, NN);
|
||||
}
|
||||
}
|
||||
// cheating a bit. It is better to sort instead of just reversing it, but the document of the routine says evals are sorted in increasing order. qr gives evals in decreasing order.
|
||||
}
|
||||
#undef LAPACK_INT
|
||||
#endif
|
||||
|
||||
|
||||
void diagonalize (std::vector < RealD > &lmd,
|
||||
std::vector < RealD > &lme,
|
||||
int N2, int N1, GridBase * grid)
|
||||
{
|
||||
|
||||
#ifdef USE_LAPACK
|
||||
const int check_lapack = 0; // just use lapack if 0, check against lapack if 1
|
||||
|
||||
if (!check_lapack)
|
||||
return diagonalize_lapack (lmd, lme, N2, N1, grid);
|
||||
|
||||
// diagonalize_lapack(lmd2,lme2,Nm2,Nm,Qt,grid);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
// Divides v in place by sqrt(norm2(v)) and returns that square root.
// No guard is applied when the value is zero.
static RealD normalise (Field & v)
{
  const RealD length = sqrt (norm2 (v));
  v = v * (1.0 / length);
  return length;
}
|
||||
|
||||
// Removes from w its components along evec[0..k) and normalises the result.
// The projection assumes the evec entries are already normalised.
// The wall-clock time spent here is added to OrthoTime (in seconds).
void orthogonalize (Field & w, std::vector < Field > &evec, int k)
{
  typedef typename Field::scalar_type MyComplex;

  const double started = usecond () / 1e6;
  MyComplex ip;

  // Disabled: re-orthonormalisation of the evec set itself.
  if (0)
    {
      for (int j = 0; j < k; ++j)
        {
          normalise (evec[j]);
          for (int i = 0; i < j; i++)
            {
              ip = innerProduct (evec[i], evec[j]);
              evec[j] = evec[j] - ip * evec[i];
            }
        }
    }

  for (int idx = 0; idx < k; ++idx)
    {
      ip = innerProduct (evec[idx], w);
      w = w - ip * evec[idx];
    }
  normalise (w);

  OrthoTime += usecond () / 1e6 - started;
}
|
||||
|
||||
void setUnit_Qt (int Nm, std::vector < RealD > &Qt)
|
||||
{
|
||||
for (int i = 0; i < Qt.size (); ++i)
|
||||
Qt[i] = 0.0;
|
||||
for (int k = 0; k < Nm; ++k)
|
||||
Qt[k + k * Nm] = 1.0;
|
||||
}
|
||||
|
||||
|
||||
void calc (std::vector < RealD > &eval, const Field & src, int &Nconv)
|
||||
{
|
||||
|
||||
GridBase *grid = src.Grid();
|
||||
// assert(grid == src._grid);
|
||||
|
||||
std::
|
||||
cout << GridLogMessage << " -- Nk = " << Nk << " Np = " << Np << std::
|
||||
endl;
|
||||
std::cout << GridLogMessage << " -- Nm = " << Nm << std::endl;
|
||||
std::cout << GridLogMessage << " -- size of eval = " << eval.
|
||||
size () << std::endl;
|
||||
|
||||
// assert(c.size() && Nm == eval.size());
|
||||
|
||||
std::vector < RealD > lme (Nm);
|
||||
std::vector < RealD > lmd (Nm);
|
||||
|
||||
|
||||
Field current (grid);
|
||||
Field last (grid);
|
||||
Field next (grid);
|
||||
|
||||
Nconv = 0;
|
||||
|
||||
RealD beta_k;
|
||||
|
||||
// Set initial vector
|
||||
// (uniform vector) Why not src??
|
||||
// evec[0] = 1.0;
|
||||
current = src;
|
||||
std::cout << GridLogMessage << "norm2(src)= " << norm2 (src) << std::
|
||||
endl;
|
||||
normalise (current);
|
||||
std::
|
||||
cout << GridLogMessage << "norm2(evec[0])= " << norm2 (current) <<
|
||||
std::endl;
|
||||
|
||||
// Initial Nk steps
|
||||
OrthoTime = 0.;
|
||||
double t0 = usecond () / 1e6;
|
||||
RealD norm; // sqrt norm of last vector
|
||||
|
||||
uint64_t iter = 0;
|
||||
|
||||
bool initted = false;
|
||||
std::vector < RealD > low (Nstop * 10);
|
||||
std::vector < RealD > high (Nstop * 10);
|
||||
RealD cont = 0.;
|
||||
while (1) {
|
||||
cont = 0.;
|
||||
std::vector < RealD > lme2 (Nm);
|
||||
std::vector < RealD > lmd2 (Nm);
|
||||
for (uint64_t k = 0; k < Nm; ++k, iter++) {
|
||||
step (lmd, lme, last, current, next, iter);
|
||||
last = current;
|
||||
current = next;
|
||||
}
|
||||
double t1 = usecond () / 1e6;
|
||||
std::cout << GridLogMessage << "IRL::Initial steps: " << t1 -
|
||||
t0 << "seconds" << std::endl;
|
||||
t0 = t1;
|
||||
std::
|
||||
cout << GridLogMessage << "IRL::Initial steps:OrthoTime " <<
|
||||
OrthoTime << "seconds" << std::endl;
|
||||
|
||||
// getting eigenvalues
|
||||
lmd2.resize (iter + 2);
|
||||
lme2.resize (iter + 2);
|
||||
for (uint64_t k = 0; k < iter; ++k) {
|
||||
lmd2[k + 1] = lmd[k];
|
||||
lme2[k + 2] = lme[k];
|
||||
}
|
||||
t1 = usecond () / 1e6;
|
||||
std::cout << GridLogMessage << "IRL:: copy: " << t1 -
|
||||
t0 << "seconds" << std::endl;
|
||||
t0 = t1;
|
||||
{
|
||||
int total = grid->_Nprocessors;
|
||||
int node = grid->_processor;
|
||||
int interval = (Nstop / total) + 1;
|
||||
int iu = (iter + 1) - (interval * node + 1);
|
||||
int il = (iter + 1) - (interval * (node + 1));
|
||||
std::vector < RealD > eval2 (iter + 3);
|
||||
RealD eps2;
|
||||
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
|
||||
eps2);
|
||||
// diagonalize(eval2,lme2,iter,Nk,grid);
|
||||
RealD diff = 0.;
|
||||
for (int i = il; i <= iu; i++) {
|
||||
if (initted)
|
||||
diff =
|
||||
fabs (eval2[i] - high[iu-i]) / (fabs (eval2[i]) +
|
||||
fabs (high[iu-i]));
|
||||
if (initted && (diff > eresid))
|
||||
cont = 1.;
|
||||
if (initted)
|
||||
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
|
||||
high[iu-i], diff);
|
||||
high[iu-i] = eval2[i];
|
||||
}
|
||||
il = (interval * node + 1);
|
||||
iu = (interval * (node + 1));
|
||||
Bisection::bisec (lmd2, lme2, iter, il, iu, 1e-16, 1e-10, eval2,
|
||||
eps2);
|
||||
for (int i = il; i <= iu; i++) {
|
||||
if (initted)
|
||||
diff =
|
||||
fabs (eval2[i] - low[i]) / (fabs (eval2[i]) +
|
||||
fabs (low[i]));
|
||||
if (initted && (diff > eresid))
|
||||
cont = 1.;
|
||||
if (initted)
|
||||
printf ("eval[%d]=%0.14e %0.14e, %0.14e\n", i, eval2[i],
|
||||
low[i], diff);
|
||||
low[i] = eval2[i];
|
||||
}
|
||||
t1 = usecond () / 1e6;
|
||||
std::cout << GridLogMessage << "IRL:: diagonalize: " << t1 -
|
||||
t0 << "seconds" << std::endl;
|
||||
t0 = t1;
|
||||
}
|
||||
|
||||
for (uint64_t k = 0; k < Nk; ++k) {
|
||||
// eval[k] = eval2[k];
|
||||
}
|
||||
if (initted)
|
||||
{
|
||||
grid->GlobalSumVector (&cont, 1);
|
||||
if (cont < 1.) return;
|
||||
}
|
||||
initted = true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#if 0
|
||||
|
||||
/**
|
||||
There is some matrix Q such that for any vector y
|
||||
Q.e_1 = y and Q is unitary.
|
||||
**/
|
||||
template < class T >
|
||||
static T orthQ (DenseMatrix < T > &Q, std::vector < T > y)
|
||||
{
|
||||
int N = y.size (); //Matrix Size
|
||||
Fill (Q, 0.0);
|
||||
T tau;
|
||||
for (int i = 0; i < N; i++)
|
||||
{
|
||||
Q[i][0] = y[i];
|
||||
}
|
||||
T sig = conj (y[0]) * y[0];
|
||||
T tau0 = fabs (sqrt (sig));
|
||||
|
||||
for (int j = 1; j < N; j++)
|
||||
{
|
||||
sig += conj (y[j]) * y[j];
|
||||
tau = abs (sqrt (sig));
|
||||
|
||||
if (abs (tau0) > 0.0)
|
||||
{
|
||||
|
||||
T gam = conj ((y[j] / tau) / tau0);
|
||||
for (int k = 0; k <= j - 1; k++)
|
||||
{
|
||||
Q[k][j] = -gam * y[k];
|
||||
}
|
||||
Q[j][j] = tau0 / tau;
|
||||
}
|
||||
else
|
||||
{
|
||||
Q[j - 1][j] = 1.0;
|
||||
}
|
||||
tau0 = tau;
|
||||
}
|
||||
return tau;
|
||||
}
|
||||
|
||||
/**
|
||||
There is some matrix Q such that for any vector y
|
||||
Q.e_k = y and Q is unitary.
|
||||
**/
|
||||
template < class T >
|
||||
static T orthU (DenseMatrix < T > &Q, std::vector < T > y)
|
||||
{
|
||||
T tau = orthQ (Q, y);
|
||||
SL (Q);
|
||||
return tau;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
Wind up with a matrix with the first con rows untouched
|
||||
|
||||
say con = 2
|
||||
Q is such that Qdag H Q has {x, x, val, 0, 0, 0, 0, ...} as 1st column
|
||||
and the matrix is upper hessenberg
|
||||
and with f and Q appropriately modified with Q is the arnoldi factorization
|
||||
|
||||
**/
|
||||
|
||||
template < class T > static void Lock (DenseMatrix < T > &H, ///Hess mtx
|
||||
DenseMatrix < T > &Q, ///Lock Transform
|
||||
T val, ///value to be locked
|
||||
int con, ///number already locked
|
||||
RealD small, int dfg, bool herm)
|
||||
{
|
||||
//ForceTridiagonal(H);
|
||||
|
||||
int M = H.dim;
|
||||
DenseVector < T > vec;
|
||||
Resize (vec, M - con);
|
||||
|
||||
DenseMatrix < T > AH;
|
||||
Resize (AH, M - con, M - con);
|
||||
AH = GetSubMtx (H, con, M, con, M);
|
||||
|
||||
DenseMatrix < T > QQ;
|
||||
Resize (QQ, M - con, M - con);
|
||||
|
||||
Unity (Q);
|
||||
Unity (QQ);
|
||||
|
||||
DenseVector < T > evals;
|
||||
Resize (evals, M - con);
|
||||
DenseMatrix < T > evecs;
|
||||
Resize (evecs, M - con, M - con);
|
||||
|
||||
Wilkinson < T > (AH, evals, evecs, small);
|
||||
|
||||
int k = 0;
|
||||
RealD cold = abs (val - evals[k]);
|
||||
for (int i = 1; i < M - con; i++)
|
||||
{
|
||||
RealD cnew = abs (val - evals[i]);
|
||||
if (cnew < cold)
|
||||
{
|
||||
k = i;
|
||||
cold = cnew;
|
||||
}
|
||||
}
|
||||
vec = evecs[k];
|
||||
|
||||
ComplexD tau;
|
||||
orthQ (QQ, vec);
|
||||
//orthQM(QQ,AH,vec);
|
||||
|
||||
AH = Hermitian (QQ) * AH;
|
||||
AH = AH * QQ;
|
||||
|
||||
for (int i = con; i < M; i++)
|
||||
{
|
||||
for (int j = con; j < M; j++)
|
||||
{
|
||||
Q[i][j] = QQ[i - con][j - con];
|
||||
H[i][j] = AH[i - con][j - con];
|
||||
}
|
||||
}
|
||||
|
||||
for (int j = M - 1; j > con + 2; j--)
|
||||
{
|
||||
|
||||
DenseMatrix < T > U;
|
||||
Resize (U, j - 1 - con, j - 1 - con);
|
||||
DenseVector < T > z;
|
||||
Resize (z, j - 1 - con);
|
||||
T nm = norm (z);
|
||||
for (int k = con + 0; k < j - 1; k++)
|
||||
{
|
||||
z[k - con] = conj (H (j, k + 1));
|
||||
}
|
||||
normalise (z);
|
||||
|
||||
RealD tmp = 0;
|
||||
for (int i = 0; i < z.size () - 1; i++)
|
||||
{
|
||||
tmp = tmp + abs (z[i]);
|
||||
}
|
||||
|
||||
if (tmp < small / ((RealD) z.size () - 1.0))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
tau = orthU (U, z);
|
||||
|
||||
DenseMatrix < T > Hb;
|
||||
Resize (Hb, j - 1 - con, M);
|
||||
|
||||
for (int a = 0; a < M; a++)
|
||||
{
|
||||
for (int b = 0; b < j - 1 - con; b++)
|
||||
{
|
||||
T sum = 0;
|
||||
for (int c = 0; c < j - 1 - con; c++)
|
||||
{
|
||||
sum += H[a][con + 1 + c] * U[c][b];
|
||||
} //sum += H(a,con+1+c)*U(c,b);}
|
||||
Hb[b][a] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = con + 1; k < j; k++)
|
||||
{
|
||||
for (int l = 0; l < M; l++)
|
||||
{
|
||||
H[l][k] = Hb[k - 1 - con][l];
|
||||
}
|
||||
} //H(Hb[k-1-con][l] , l,k);}}
|
||||
|
||||
DenseMatrix < T > Qb;
|
||||
Resize (Qb, M, M);
|
||||
|
||||
for (int a = 0; a < M; a++)
|
||||
{
|
||||
for (int b = 0; b < j - 1 - con; b++)
|
||||
{
|
||||
T sum = 0;
|
||||
for (int c = 0; c < j - 1 - con; c++)
|
||||
{
|
||||
sum += Q[a][con + 1 + c] * U[c][b];
|
||||
} //sum += Q(a,con+1+c)*U(c,b);}
|
||||
Qb[b][a] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = con + 1; k < j; k++)
|
||||
{
|
||||
for (int l = 0; l < M; l++)
|
||||
{
|
||||
Q[l][k] = Qb[k - 1 - con][l];
|
||||
}
|
||||
} //Q(Qb[k-1-con][l] , l,k);}}
|
||||
|
||||
DenseMatrix < T > Hc;
|
||||
Resize (Hc, M, M);
|
||||
|
||||
for (int a = 0; a < j - 1 - con; a++)
|
||||
{
|
||||
for (int b = 0; b < M; b++)
|
||||
{
|
||||
T sum = 0;
|
||||
for (int c = 0; c < j - 1 - con; c++)
|
||||
{
|
||||
sum += conj (U[c][a]) * H[con + 1 + c][b];
|
||||
} //sum += conj( U(c,a) )*H(con+1+c,b);}
|
||||
Hc[b][a] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
for (int k = 0; k < M; k++)
|
||||
{
|
||||
for (int l = con + 1; l < j; l++)
|
||||
{
|
||||
H[l][k] = Hc[k][l - 1 - con];
|
||||
}
|
||||
} //H(Hc[k][l-1-con] , l,k);}}
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
||||
@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
inline RealD AggregatePowerLaw(RealD x)
|
||||
@@ -95,7 +97,7 @@ public:
|
||||
|
||||
RealD scale;
|
||||
|
||||
ConjugateGradient<FineField> CG(1.0e-2,100,false);
|
||||
ConjugateGradient<FineField> CG(1.0e-3,400,false);
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
|
||||
@@ -108,7 +110,7 @@ public:
|
||||
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
for(int i=0;i<1;i++){
|
||||
for(int i=0;i<4;i++){
|
||||
|
||||
CG(hermop,noise,subspace[b]);
|
||||
|
||||
@@ -124,6 +126,53 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// Fills subspace[0..nn) with normalised vectors obtained by applying the GCR
// solver to random noise.
//
// For each basis index: draw gaussian noise, normalise it, then twice solve
// with the noise as source and a zero initial guess, renormalising the
// solution and feeding it back as the next source.  The matrix element
// <v|Op|v> is logged before and after filtering.
//
//   RNG      random number generator used for the noise
//   DiracOp  operator handed to the GCR solver and used for the diagnostics
//   nn       number of subspace vectors to build (defaults to nbasis)
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
  TrivialPrecon<FineField> simple_fine;
  PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);

  FineField Mn(FineGrid);
  FineField guess(FineGrid);
  FineField src(FineGrid);
  FineField noise(FineGrid);

  RealD scale;

  for(int vec=0;vec<nn;vec++){

    subspace[vec] = Zero();

    // Fresh unit-norm noise
    gaussian(RNG,noise);
    scale = std::pow(norm2(noise),-0.5);
    noise = noise*scale;

    DiracOp.Op(noise,Mn);
    std::cout<<GridLogMessage << "noise ["<<vec<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;

    for(int pass=0;pass<2;pass++){
      //  void operator() (const Field &src, Field &psi){
#if 1
      std::cout << GridLogMessage << " inverting on noise "<<std::endl;
      src   = noise;
      guess = Zero();
      GCR(src,guess);
      subspace[vec] = guess;
#else
      std::cout << GridLogMessage << " inverting on zero "<<std::endl;
      src   = Zero();
      guess = noise;
      GCR(src,guess);
      subspace[vec] = guess;
#endif
      // Renormalise and use the result as the next source
      noise = subspace[vec];
      scale = std::pow(norm2(noise),-0.5);
      noise = noise*scale;
    }

    DiracOp.Op(noise,Mn);
    std::cout<<GridLogMessage << "filtered["<<vec<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
    subspace[vec] = noise;

  }
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||
// and this is the best I found
|
||||
@@ -160,14 +209,21 @@ public:
|
||||
|
||||
int b =0;
|
||||
{
|
||||
ComplexD ip;
|
||||
// Filter
|
||||
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
|
||||
Cheb(hermop,noise,Mn);
|
||||
// normalise
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
ip= innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
hermop.AdjOp(Mn,tmp);
|
||||
ip = innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
b++;
|
||||
}
|
||||
|
||||
@@ -213,8 +269,18 @@ public:
|
||||
Mn=*Tnp;
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
|
||||
|
||||
ComplexD ip;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
ip= innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
hermop.AdjOp(Mn,tmp);
|
||||
ip = innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
b++;
|
||||
}
|
||||
|
||||
@@ -228,6 +294,70 @@ public:
|
||||
}
|
||||
assert(b==nn);
|
||||
}
|
||||
|
||||
|
||||
// Builds subspace[0..nn) from a single normalised noise vector:
//   - subspace[0] is the noise filtered by a Chebyshev polynomial of order
//     orderfilter on [lo1, hi], then normalised;
//   - subspace[n], n >= 1, is a Chebyshev polynomial of order orderstep on
//     [lo2, hi] applied to subspace[n-1], with its components along all
//     earlier subspace vectors subtracted, then normalised.
// Diagnostic norms are logged for every vector.
//
//   RNG          random number generator used for the noise
//   hermop       operator the polynomials are evaluated in
//   nn           number of subspace vectors to build
//   hi           upper end of both Chebyshev windows
//   lo1          lower end of the initial filter window
//   orderfilter  order of the initial filter
//   lo2          lower end of the window for the subsequent steps
//   orderstep    order of the polynomial for the subsequent steps
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
                                     int nn,
                                     double hi,
                                     double lo1,
                                     int orderfilter,
                                     double lo2,
                                     int orderstep)
{
  RealD scale;

  FineField noise(FineGrid);
  FineField Mn(FineGrid);
  FineField tmp(FineGrid);

  // New normalised noise
  gaussian(RNG,noise);
  scale = std::pow(norm2(noise),-0.5);
  noise=noise*scale;

  std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
  // Initial matrix element
  hermop.Op(noise,Mn);
  std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;

  int b =0;
  {
    // Filter
    // Report the order actually used for this filter (orderfilter); the
    // stepping order is logged separately inside the loop below.
    std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderfilter<<std::endl;
    Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
    Cheb(hermop,noise,Mn);
    // normalise
    scale = std::pow(norm2(Mn),-0.5); 	Mn=Mn*scale;
    subspace[b]   = Mn;
    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
  }

  // Generate a full sequence of Chebyshevs
  for(int n=1;n<nn;n++){
    std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
    Chebyshev<FineField> Cheb(lo2,hi,orderstep);
    Cheb(hermop,subspace[n-1],Mn);

    // Subtract the components along the vectors already in the subspace
    for(int m=0;m<n;m++){
      ComplexD c = innerProduct(subspace[m],Mn);
      Mn = Mn - c*subspace[m];
    }

    // normalise
    scale = std::pow(norm2(Mn),-0.5);
    Mn=Mn*scale;

    subspace[n]=Mn;

    hermop.Op(Mn,tmp);
    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;

  }
}
|
||||
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
|
||||
@@ -441,8 +441,20 @@ public:
|
||||
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
|
||||
}
|
||||
#else
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Galerkin projection of matrix
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Convenience overload: coarsens linop using the same aggregation for both
// basis arguments of the three-argument CoarsenOperator.
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
                     Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
  Aggregation<Fobj,CComplex,nbasis> & basis = Subspace;
  CoarsenOperator(linop, basis, basis);
}
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Petrov - Galerkin projection of matrix
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> & U,
|
||||
Aggregation<Fobj,CComplex,nbasis> & V)
|
||||
{
|
||||
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
|
||||
GridBase *grid = FineGrid();
|
||||
@@ -458,11 +470,9 @@ public:
|
||||
// Orthogonalise the subblocks over the basis
|
||||
/////////////////////////////////////////////////////////////
|
||||
CoarseScalar InnerProd(CoarseGrid());
|
||||
blockOrthogonalise(InnerProd,Subspace.subspace);
|
||||
blockOrthogonalise(InnerProd,V.subspace);
|
||||
blockOrthogonalise(InnerProd,U.subspace);
|
||||
|
||||
// for(int s=0;s<Subspace.subspace.size();s++){
|
||||
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
|
||||
// }
|
||||
const int npoint = geom.npoint;
|
||||
|
||||
Coordinate clatt = CoarseGrid()->GlobalDimensions();
|
||||
@@ -542,7 +552,7 @@ public:
|
||||
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
|
||||
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
|
||||
tphaseBZ-=usecond();
|
||||
phaV = phaF[p]*Subspace.subspace[i];
|
||||
phaV = phaF[p]*V.subspace[i];
|
||||
tphaseBZ+=usecond();
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
@@ -555,7 +565,7 @@ public:
|
||||
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
|
||||
|
||||
tproj-=usecond();
|
||||
blockProject(coarseInner,MphaV,Subspace.subspace);
|
||||
blockProject(coarseInner,MphaV,U.subspace);
|
||||
coarseInner = conjugate(pha[p]) * coarseInner;
|
||||
|
||||
ComputeProj[p] = coarseInner;
|
||||
|
||||
@@ -69,7 +69,7 @@ public:
|
||||
}
|
||||
|
||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
||||
void construct(pointer __p, const _Tp& __val) { };
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
};
|
||||
@@ -175,10 +175,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
|
||||
// Template typedefs
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
|
||||
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
||||
|
||||
/*
|
||||
template<class T> class vecView
|
||||
{
|
||||
protected:
|
||||
@@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
|
||||
#define autoVecView(v_v,v,mode) \
|
||||
auto v_v = VectorView(v,mode); \
|
||||
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
||||
*/
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
#include <Grid/GridCore.h>
|
||||
#ifndef GRID_UVM
|
||||
|
||||
#warning "Using explicit device memory copies"
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#define MAXLINE 512
|
||||
static char print_buffer [ MAXLINE ];
|
||||
|
||||
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
|
||||
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
|
||||
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
|
||||
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
|
||||
//#define dprintf(...)
|
||||
|
||||
//#define mprintf(...)
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// For caching copies of data on device
|
||||
@@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
///////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
assert(AccCache.accLock==0);
|
||||
assert(AccCache.cpuLock==0);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
@@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
LRUremove(AccCache);
|
||||
AccCache.AccPtr=(uint64_t) NULL;
|
||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
}
|
||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
EntryErase(CpuPtr);
|
||||
@@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
|
||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
|
||||
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
|
||||
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
|
||||
if (AccCache.accLock!=0) return;
|
||||
@@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
AccCache.AccPtr=(uint64_t)NULL;
|
||||
AccCache.state=CpuDirty; // CPU primary now
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
}
|
||||
// uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
DeviceEvictions++;
|
||||
@@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
||||
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
DeviceToHostBytes+=AccCache.bytes;
|
||||
DeviceToHostXfer++;
|
||||
AccCache.state=Consistent;
|
||||
@@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
|
||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||
DeviceBytes+=AccCache.bytes;
|
||||
}
|
||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
|
||||
(uint64_t)AccCache.bytes,
|
||||
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
||||
HostToDeviceBytes+=AccCache.bytes;
|
||||
HostToDeviceXfer++;
|
||||
@@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
|
||||
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
||||
{
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
|
||||
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
|
||||
AcceleratorViewClose((uint64_t)Ptr);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
CpuViewClose((uint64_t)Ptr);
|
||||
@@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
||||
{
|
||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
|
||||
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
|
||||
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
||||
@@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
||||
}
|
||||
void MemoryManager::EvictVictims(uint64_t bytes)
|
||||
{
|
||||
if(bytes>=DeviceMaxBytes) {
|
||||
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
|
||||
}
|
||||
assert(bytes<DeviceMaxBytes);
|
||||
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
||||
if ( DeviceLRUBytes > 0){
|
||||
@@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
assert(AccCache.cpuLock==0); // Programming error
|
||||
|
||||
if(AccCache.state!=Empty) {
|
||||
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
|
||||
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
|
||||
(uint64_t)AccCache.CpuPtr,
|
||||
(uint64_t)CpuPtr,
|
||||
(uint64_t)AccCache.bytes,
|
||||
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock= 1;
|
||||
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
|
||||
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==CpuDirty ){
|
||||
if(mode==AcceleratorWriteDiscard) {
|
||||
CpuDiscard(AccCache);
|
||||
@@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock++;
|
||||
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==Consistent) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||
AccCache.accLock++;
|
||||
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==AccDirty) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||
AccCache.accLock++;
|
||||
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
@@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
// If view is opened on device must remove from LRU
|
||||
if(AccCache.LRU_valid==1){
|
||||
// must possibly remove from LRU as now locked on GPU
|
||||
dprintf("AccCache entry removed from LRU \n");
|
||||
dprintf("AccCache entry removed from LRU ");
|
||||
LRUremove(AccCache);
|
||||
}
|
||||
|
||||
@@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
|
||||
AccCache.accLock--;
|
||||
// Move to LRU queue if not locked and close on device
|
||||
if(AccCache.accLock==0) {
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
LRUinsert(AccCache);
|
||||
} else {
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
}
|
||||
}
|
||||
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
||||
|
||||
@@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
|
||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef USE_GRID_REDUCTION
|
||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
||||
{
|
||||
GlobalSumP2P(c);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
|
||||
{
|
||||
GlobalSumP2P(c);
|
||||
}
|
||||
#else
|
||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
||||
{
|
||||
GlobalSumVector((float *)&c,2);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
|
||||
{
|
||||
GlobalSumVector((float *)c,2*N);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
|
||||
{
|
||||
GlobalSumVector((double *)&c,2);
|
||||
}
|
||||
#endif
|
||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
|
||||
{
|
||||
GlobalSumVector((float *)c,2*N);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
|
||||
{
|
||||
GlobalSumVector((double *)c,2*N);
|
||||
|
||||
@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
///////////////////////////////////
|
||||
#include <Grid/communicator/SharedMemory.h>
|
||||
|
||||
#define NVLINK_GET
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
extern bool Stencil_force_mpi ;
|
||||
@@ -127,7 +129,36 @@ public:
|
||||
void GlobalSumVector(ComplexD *c,int N);
|
||||
void GlobalXOR(uint32_t &);
|
||||
void GlobalXOR(uint64_t &);
|
||||
|
||||
|
||||
template<class obj> void GlobalSumP2P(obj &o)
|
||||
{
|
||||
std::vector<obj> column;
|
||||
obj accum = o;
|
||||
int source,dest;
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
column.resize(_processors[d]);
|
||||
column[0] = accum;
|
||||
std::vector<MpiCommsRequest_t> list;
|
||||
for(int p=1;p<_processors[d];p++){
|
||||
ShiftedRanks(d,p,source,dest);
|
||||
SendToRecvFromBegin(list,
|
||||
&column[0],
|
||||
dest,
|
||||
&column[p],
|
||||
source,
|
||||
sizeof(obj),d*100+p);
|
||||
|
||||
}
|
||||
if (!list.empty()) // avoid triggering assert in comms == none
|
||||
CommsComplete(list);
|
||||
for(int p=1;p<_processors[d];p++){
|
||||
accum = accum + column[p];
|
||||
}
|
||||
}
|
||||
Broadcast(0,accum);
|
||||
o=accum;
|
||||
}
|
||||
|
||||
template<class obj> void GlobalSum(obj &o){
|
||||
typedef typename obj::scalar_type scalar_type;
|
||||
int words = sizeof(obj)/sizeof(scalar_type);
|
||||
@@ -138,8 +169,8 @@ public:
|
||||
////////////////////////////////////////////////////////////
|
||||
// Face exchange, buffer swap in translational invariant way
|
||||
////////////////////////////////////////////////////////////
|
||||
void CommsComplete(std::vector<CommsRequest_t> &list);
|
||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void CommsComplete(std::vector<MpiCommsRequest_t> &list);
|
||||
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
@@ -158,6 +189,17 @@ public:
|
||||
int recv_from_rank,int do_recv,
|
||||
int bytes,int dir);
|
||||
|
||||
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int do_xmit,
|
||||
void *recv,
|
||||
int recv_from_rank,int do_recv,
|
||||
int xbytes,int rbytes,int dir);
|
||||
|
||||
// Could do a PollHtoD and have a CommsMerge dependence
|
||||
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
|
||||
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
|
||||
|
||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int do_xmit,
|
||||
|
||||
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
////////////////////////////////////////////
|
||||
@@ -257,6 +258,25 @@ CartesianCommunicator::~CartesianCommunicator()
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef USE_GRID_REDUCTION
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
CartesianCommunicator::GlobalSumP2P(f);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
CartesianCommunicator::GlobalSumP2P(d);
|
||||
}
|
||||
#else
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
#endif
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
@@ -287,27 +307,18 @@ void CartesianCommunicator::GlobalMax(double &d)
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
@@ -332,7 +343,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
|
||||
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
|
||||
{
|
||||
int nreq=list.size();
|
||||
|
||||
@@ -351,9 +362,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||
std::vector<MpiCommsRequest_t> reqs(0);
|
||||
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
@@ -369,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
|
||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
||||
}
|
||||
// Basic Halo comms primitive
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
@@ -381,12 +387,287 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int bytes,int dir)
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
StencilSendToRecvFromComplete(list,dir);
|
||||
return offbytes;
|
||||
}
|
||||
|
||||
#undef NVLINK_GET // Define to use get instead of put DMA
|
||||
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
return 0.0; // Do nothing -- no preparation required
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
int ncomm =communicator_halo.size();
|
||||
int commdir=dir%ncomm;
|
||||
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
int ierr;
|
||||
int gdest = ShmRanks[dest];
|
||||
int gfrom = ShmRanks[from];
|
||||
int gme = ShmRanks[_processor];
|
||||
|
||||
assert(dest != _processor);
|
||||
assert(from != _processor);
|
||||
assert(gme == ShmRank);
|
||||
double off_node_bytes=0.0;
|
||||
int tag;
|
||||
|
||||
if ( dor ) {
|
||||
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+from*32;
|
||||
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(rrq);
|
||||
off_node_bytes+=rbytes;
|
||||
}
|
||||
#ifdef NVLINK_GET
|
||||
else {
|
||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// This is a NVLINK PUT
|
||||
if (dox) {
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+_processor*32;
|
||||
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
off_node_bytes+=xbytes;
|
||||
} else {
|
||||
#ifndef NVLINK_GET
|
||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return off_node_bytes;
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
/*finishes Get/Put*/
|
||||
acceleratorCopySynchronise();
|
||||
|
||||
if (nreq==0) return;
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
list.resize(0);
|
||||
this->StencilBarrier();
|
||||
}
|
||||
|
||||
#else /* NOT ... ACCELERATOR_AWARE_MPI */
|
||||
///////////////////////////////////////////
|
||||
// Pipeline mode through host memory
|
||||
///////////////////////////////////////////
|
||||
/*
|
||||
* In prepare (phase 1):
|
||||
* PHASE 1: (prepare)
|
||||
* - post MPI receive buffers asynch
|
||||
* - post device - host send buffer transfer asynch
|
||||
* PHASE 2: (Begin)
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
* - post device - device transfers
|
||||
* PHASE 3: (Complete)
|
||||
* - MPI_waitall
|
||||
* - host-device transfers
|
||||
*
|
||||
*********************************
|
||||
* NB could split this further:
|
||||
*--------------------------------
|
||||
* PHASE 1: (Prepare)
|
||||
* - post MPI receive buffers asynch
|
||||
* - post device - host send buffer transfer asynch
|
||||
* PHASE 2: (BeginInterNode)
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
* PHASE 3: (BeginIntraNode)
|
||||
* - post device - device transfers
|
||||
* PHASE 4: (Complete)
|
||||
* - MPI_waitall
|
||||
* - host-device transfers asynch
|
||||
* - (complete all copies)
|
||||
*/
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
/*
|
||||
* Bring sequence from Stencil.h down to lower level.
|
||||
* Assume using XeLink is ok
|
||||
*/
|
||||
int ncomm =communicator_halo.size();
|
||||
int commdir=dir%ncomm;
|
||||
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
int ierr;
|
||||
int gdest = ShmRanks[dest];
|
||||
int gfrom = ShmRanks[from];
|
||||
int gme = ShmRanks[_processor];
|
||||
|
||||
assert(dest != _processor);
|
||||
assert(from != _processor);
|
||||
assert(gme == ShmRank);
|
||||
double off_node_bytes=0.0;
|
||||
int tag;
|
||||
|
||||
void * host_recv = NULL;
|
||||
void * host_xmit = NULL;
|
||||
|
||||
/*
|
||||
* PHASE 1: (Prepare)
|
||||
* - post MPI receive buffers asynch
|
||||
* - post device - host send buffer transfer asynch
|
||||
*/
|
||||
|
||||
if ( dor ) {
|
||||
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+from*32;
|
||||
host_recv = this->HostBufferMalloc(rbytes);
|
||||
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||
assert(ierr==0);
|
||||
CommsRequest_t srq;
|
||||
srq.PacketType = InterNodeRecv;
|
||||
srq.bytes = rbytes;
|
||||
srq.req = rrq;
|
||||
srq.host_buf = host_recv;
|
||||
srq.device_buf = recv;
|
||||
list.push_back(srq);
|
||||
off_node_bytes+=rbytes;
|
||||
}
|
||||
}
|
||||
|
||||
if (dox) {
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
|
||||
tag= dir+_processor*32;
|
||||
|
||||
host_xmit = this->HostBufferMalloc(xbytes);
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
|
||||
|
||||
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
// assert(ierr==0);
|
||||
// off_node_bytes+=xbytes;
|
||||
|
||||
srq.PacketType = InterNodeXmit;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = host_xmit;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = tag;
|
||||
srq.dest = dest;
|
||||
srq.commdir = commdir;
|
||||
list.push_back(srq);
|
||||
}
|
||||
}
|
||||
|
||||
return off_node_bytes;
|
||||
}
|
||||
/*
|
||||
* In the interest of better pipelining, poll for completion on each DtoH and
|
||||
* start MPI_ISend in the meantime
|
||||
*/
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int pending = 0;
|
||||
do {
|
||||
|
||||
pending = 0;
|
||||
|
||||
for(int idx = 0; idx<list.size();idx++){
|
||||
|
||||
if ( list[idx].PacketType==InterNodeRecv ) {
|
||||
|
||||
int flag = 0;
|
||||
MPI_Status status;
|
||||
int ierr = MPI_Test(&list[idx].req,&flag,&status);
|
||||
assert(ierr==0);
|
||||
|
||||
if ( flag ) {
|
||||
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
|
||||
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
|
||||
list[idx].PacketType=InterNodeReceiveHtoD;
|
||||
} else {
|
||||
pending ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
|
||||
} while ( pending );
|
||||
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int pending = 0;
|
||||
do {
|
||||
|
||||
pending = 0;
|
||||
|
||||
for(int idx = 0; idx<list.size();idx++){
|
||||
|
||||
if ( list[idx].PacketType==InterNodeXmit ) {
|
||||
|
||||
if ( acceleratorEventIsComplete(list[idx].ev) ) {
|
||||
|
||||
void *host_xmit = list[idx].host_buf;
|
||||
uint32_t xbytes = list[idx].bytes;
|
||||
int dest = list[idx].dest;
|
||||
int tag = list[idx].tag;
|
||||
int commdir = list[idx].commdir;
|
||||
///////////////////
|
||||
// Send packet
|
||||
///////////////////
|
||||
|
||||
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
|
||||
|
||||
MPI_Request xrq;
|
||||
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
|
||||
list[idx].req = xrq; // Update the MPI request in the list
|
||||
|
||||
list[idx].PacketType=InterNodeXmitISend;
|
||||
|
||||
} else {
|
||||
// not done, so return to polling loop
|
||||
pending++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (pending);
|
||||
}
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
@@ -411,54 +692,106 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
double off_node_bytes=0.0;
|
||||
int tag;
|
||||
|
||||
if ( dor ) {
|
||||
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+from*32;
|
||||
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(rrq);
|
||||
off_node_bytes+=rbytes;
|
||||
}
|
||||
void * host_xmit = NULL;
|
||||
|
||||
////////////////////////////////
|
||||
// Receives already posted
|
||||
// Copies already started
|
||||
////////////////////////////////
|
||||
/*
|
||||
* PHASE 2: (Begin)
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
*/
|
||||
#ifdef NVLINK_GET
|
||||
if ( dor ) {
|
||||
|
||||
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||
// Intranode
|
||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
|
||||
srq.PacketType = IntraNodeRecv;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (dox) {
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+_processor*32;
|
||||
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
off_node_bytes+=xbytes;
|
||||
} else {
|
||||
#ifndef NVLINK_GET
|
||||
|
||||
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||
// Intranode
|
||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
#endif
|
||||
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
|
||||
srq.PacketType = IntraNodeXmit;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
return off_node_bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
|
||||
|
||||
acceleratorCopySynchronise();
|
||||
std::vector<MPI_Status> status;
|
||||
std::vector<MPI_Request> MpiRequests;
|
||||
|
||||
for(int r=0;r<list.size();r++){
|
||||
// Must check each Send buf is clear to reuse
|
||||
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
|
||||
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
|
||||
}
|
||||
|
||||
if (nreq==0) return;
|
||||
int nreq=MpiRequests.size();
|
||||
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
list.resize(0);
|
||||
if (nreq>0) {
|
||||
status.resize(MpiRequests.size());
|
||||
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
// for(int r=0;r<nreq;r++){
|
||||
// if ( list[r].PacketType==InterNodeRecv ) {
|
||||
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
list.resize(0); // Delete the list
|
||||
this->HostBufferFreeAll(); // Clean up the buffer allocs
|
||||
#ifndef NVLINK_GET
|
||||
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
////////////////////////////////////////////
|
||||
// END PIPELINE MODE / NO CUDA AWARE MPI
|
||||
////////////////////////////////////////////
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void)
|
||||
{
|
||||
MPI_Barrier (ShmComm);
|
||||
|
||||
@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
@@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
{
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int dox,
|
||||
void *recv,
|
||||
int recv_from_rank,int dor,
|
||||
int xbytes,int rbytes, int dir)
|
||||
{
|
||||
return 0.0;
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int dox,
|
||||
|
||||
@@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#if defined (GRID_COMMS_MPI3)
|
||||
typedef MPI_Comm Grid_MPI_Comm;
|
||||
typedef MPI_Request MpiCommsRequest_t;
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
typedef MPI_Request CommsRequest_t;
|
||||
#else
|
||||
/*
|
||||
* Enable state transitions as each packet flows.
|
||||
*/
|
||||
enum PacketType_t {
|
||||
FaceGather,
|
||||
InterNodeXmit,
|
||||
InterNodeRecv,
|
||||
IntraNodeXmit,
|
||||
IntraNodeRecv,
|
||||
InterNodeXmitISend,
|
||||
InterNodeReceiveHtoD
|
||||
};
|
||||
/*
|
||||
*Package arguments needed for various actions along packet flow
|
||||
*/
|
||||
typedef struct {
|
||||
PacketType_t PacketType;
|
||||
void *host_buf;
|
||||
void *device_buf;
|
||||
int dest;
|
||||
int tag;
|
||||
int commdir;
|
||||
unsigned long bytes;
|
||||
acceleratorEvent_t ev;
|
||||
MpiCommsRequest_t req;
|
||||
} CommsRequest_t;
|
||||
#endif
|
||||
|
||||
#else
|
||||
typedef int MpiCommsRequest_t;
|
||||
typedef int CommsRequest_t;
|
||||
typedef int Grid_MPI_Comm;
|
||||
#endif
|
||||
@@ -105,7 +137,7 @@ public:
|
||||
///////////////////////////////////////////////////
|
||||
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
||||
static void SharedMemoryFree(void);
|
||||
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||
static void SharedMemoryZero(void *dest,size_t bytes);
|
||||
|
||||
};
|
||||
|
||||
@@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
#define GRID_SYCL_LEVEL_ZERO_IPC
|
||||
#define SHM_SOCKETS
|
||||
#else
|
||||
#ifdef HAVE_NUMAIF_H
|
||||
#warning " Using NUMAIF "
|
||||
#include <numaif.h>
|
||||
#endif
|
||||
#endif
|
||||
#include <syscall.h>
|
||||
#endif
|
||||
@@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
// Each MPI rank should allocate our own buffer
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
HostCommBuf= malloc(bytes);
|
||||
// printf("Host buffer allocate for GPU non-aware MPI\n");
|
||||
#if 0
|
||||
HostCommBuf= acceleratorAllocHost(bytes);
|
||||
#else
|
||||
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
|
||||
#if 0
|
||||
#warning "Moving host buffers to specific NUMA domain"
|
||||
int numa;
|
||||
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
|
||||
if(numa_name) {
|
||||
unsigned long page_size = sysconf(_SC_PAGESIZE);
|
||||
numa = atoi(numa_name);
|
||||
unsigned long page_count = bytes/page_size;
|
||||
std::vector<void *> pages(page_count);
|
||||
std::vector<int> nodes(page_count,numa);
|
||||
std::vector<int> status(page_count,-1);
|
||||
for(unsigned long p=0;p<page_count;p++){
|
||||
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
|
||||
}
|
||||
int ret = move_pages(0,
|
||||
page_count,
|
||||
&pages[0],
|
||||
&nodes[0],
|
||||
&status[0],
|
||||
MPOL_MF_MOVE);
|
||||
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
|
||||
if (ret) perror(" move_pages failed for reason:");
|
||||
}
|
||||
#endif
|
||||
acceleratorPin(HostCommBuf,bytes);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||
if (ShmCommBuf == (void *)NULL ) {
|
||||
@@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
|
||||
|
||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
||||
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
||||
|
||||
ze_ipc_mem_handle_t ihandle;
|
||||
clone_mem_t handle;
|
||||
@@ -880,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
bzero(dest,bytes);
|
||||
#endif
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
acceleratorCopyToDevice(src,dest,bytes);
|
||||
#else
|
||||
bcopy(src,dest,bytes);
|
||||
#endif
|
||||
}
|
||||
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
//{
|
||||
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
// acceleratorCopyToDevice(src,dest,bytes);
|
||||
//#else
|
||||
// bcopy(src,dest,bytes);
|
||||
//#endif
|
||||
//}
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
@@ -923,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
|
||||
|
||||
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
|
||||
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
|
||||
}
|
||||
ShmBufferFreeAll();
|
||||
|
||||
@@ -953,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
}
|
||||
#endif
|
||||
|
||||
//SharedMemoryTest();
|
||||
SharedMemoryTest();
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// On node barrier
|
||||
@@ -975,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
|
||||
check[0]=GlobalSharedMemory::WorldNode;
|
||||
check[1]=r;
|
||||
check[2]=magic;
|
||||
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
|
||||
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
|
||||
}
|
||||
}
|
||||
ShmBarrier();
|
||||
for(uint64_t r=0;r<ShmSize;r++){
|
||||
ShmBarrier();
|
||||
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
|
||||
ShmBarrier();
|
||||
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
|
||||
assert(check[0]==GlobalSharedMemory::WorldNode);
|
||||
assert(check[1]==r);
|
||||
assert(check[2]==magic);
|
||||
ShmBarrier();
|
||||
}
|
||||
ShmBarrier();
|
||||
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
|
||||
}
|
||||
|
||||
void *SharedMemory::ShmBuffer(int rank)
|
||||
|
||||
@@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const int Cshift_verbose=0;
|
||||
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
@@ -65,7 +65,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
||||
Cshift_comms(ret,rhs,dimension,shift);
|
||||
}
|
||||
t1=usecond();
|
||||
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -125,7 +125,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
|
||||
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||
#endif
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
RealD tcopy=0.0;
|
||||
@@ -156,16 +160,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
// int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
grid->Barrier();
|
||||
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
#else
|
||||
// bouncy bouncy
|
||||
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&hrecv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
|
||||
#endif
|
||||
|
||||
xbytes+=bytes;
|
||||
grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
@@ -175,11 +192,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
tscatter+=usecond();
|
||||
}
|
||||
}
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
if (Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
@@ -224,12 +243,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
scalar_object * recv_buf_extract_mpi;
|
||||
scalar_object * send_buf_extract_mpi;
|
||||
|
||||
|
||||
for(int s=0;s<Nsimd;s++){
|
||||
send_buf_extract[s].resize(buffer_size);
|
||||
recv_buf_extract[s].resize(buffer_size);
|
||||
}
|
||||
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
|
||||
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||
#endif
|
||||
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||
@@ -281,11 +304,22 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
|
||||
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||
xmit_to_rank,
|
||||
(void *)recv_buf_extract_mpi,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
#else
|
||||
// bouncy bouncy
|
||||
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&hrecv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
|
||||
#endif
|
||||
|
||||
xbytes+=bytes;
|
||||
grid->Barrier();
|
||||
@@ -301,12 +335,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
}
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
if(Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
|
||||
});
|
||||
}
|
||||
|
||||
#define FAST_AXPY_NORM
|
||||
template<class sobj,class vobj> inline
|
||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||
{
|
||||
GRID_TRACE("axpy_norm");
|
||||
return axpy_norm_fast(ret,a,x,y);
|
||||
#ifdef FAST_AXPY_NORM
|
||||
return axpy_norm_fast(ret,a,x,y);
|
||||
#else
|
||||
ret = a*x+y;
|
||||
RealD nn=norm2(ret);
|
||||
return nn;
|
||||
#endif
|
||||
}
|
||||
template<class sobj,class vobj> inline
|
||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||
{
|
||||
GRID_TRACE("axpby_norm");
|
||||
return axpby_norm_fast(ret,a,b,x,y);
|
||||
#ifdef FAST_AXPY_NORM
|
||||
return axpby_norm_fast(ret,a,b,x,y);
|
||||
#else
|
||||
ret = a*x+b*y;
|
||||
RealD nn=norm2(ret);
|
||||
return nn;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Trace product
|
||||
|
||||
@@ -290,8 +290,10 @@ template<class vobj>
|
||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
|
||||
GridBase *grid = left.Grid();
|
||||
|
||||
bool ok;
|
||||
#ifdef GRID_SYCL
|
||||
uint64_t csum=0;
|
||||
uint64_t csum2=0;
|
||||
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
|
||||
{
|
||||
// Hack
|
||||
@@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
||||
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
|
||||
uint64_t *base= (uint64_t *)&l_v[0];
|
||||
csum=svm_xor(base,words);
|
||||
ok = FlightRecorder::CsumLog(csum);
|
||||
if ( !ok ) {
|
||||
csum2=svm_xor(base,words);
|
||||
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
} else {
|
||||
// csum2=svm_xor(base,words);
|
||||
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
}
|
||||
assert(ok);
|
||||
}
|
||||
FlightRecorder::CsumLog(csum);
|
||||
#endif
|
||||
FlightRecorder::StepLog("rank inner product");
|
||||
ComplexD nrm = rankInnerProduct(left,right);
|
||||
// ComplexD nrmck=nrm;
|
||||
RealD local = real(nrm);
|
||||
FlightRecorder::NormLog(real(nrm));
|
||||
ok = FlightRecorder::NormLog(real(nrm));
|
||||
if ( !ok ) {
|
||||
ComplexD nrm2 = rankInnerProduct(left,right);
|
||||
RealD local2 = real(nrm2);
|
||||
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
|
||||
assert(ok);
|
||||
}
|
||||
FlightRecorder::StepLog("Start global sum");
|
||||
// grid->GlobalSumP2P(nrm);
|
||||
grid->GlobalSum(nrm);
|
||||
FlightRecorder::StepLog("Finished global sum");
|
||||
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
|
||||
FlightRecorder::ReductionLog(local,real(nrm));
|
||||
return nrm;
|
||||
}
|
||||
@@ -353,8 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
||||
coalescedWrite(z_v[ss],tmp);
|
||||
});
|
||||
bool ok;
|
||||
#ifdef GRID_SYCL
|
||||
uint64_t csum=0;
|
||||
uint64_t csum2=0;
|
||||
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
|
||||
{
|
||||
// z_v
|
||||
{
|
||||
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
|
||||
uint64_t *base= (uint64_t *)&z_v[0];
|
||||
csum=svm_xor(base,words);
|
||||
ok = FlightRecorder::CsumLog(csum);
|
||||
if ( !ok ) {
|
||||
csum2=svm_xor(base,words);
|
||||
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
}
|
||||
assert(ok);
|
||||
}
|
||||
// inner_v
|
||||
{
|
||||
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
|
||||
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
|
||||
csum=svm_xor(base,words);
|
||||
ok = FlightRecorder::CsumLog(csum);
|
||||
if ( !ok ) {
|
||||
csum2=svm_xor(base,words);
|
||||
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
}
|
||||
assert(ok);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
||||
ok = FlightRecorder::NormLog(real(nrm));
|
||||
assert(ok);
|
||||
RealD local = real(nrm);
|
||||
grid->GlobalSum(nrm);
|
||||
FlightRecorder::ReductionLog(local,real(nrm));
|
||||
return nrm;
|
||||
}
|
||||
|
||||
@@ -498,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
|
||||
scalar_type * ptr = (scalar_type *) &result[0];
|
||||
int words = fd*sizeof(sobj)/sizeof(scalar_type);
|
||||
grid->GlobalSumVector(ptr, words);
|
||||
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
|
||||
|
||||
}
|
||||
template<class vobj> inline
|
||||
std::vector<typename vobj::scalar_object>
|
||||
|
||||
@@ -16,11 +16,11 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
|
||||
Integer nsimd= vobj::Nsimd();
|
||||
{
|
||||
sycl::buffer<sobj, 1> abuff(&ret, {1});
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>());
|
||||
cgh.parallel_for(cl::sycl::range<1>{osites},
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
|
||||
cgh.parallel_for(sycl::range<1>{osites},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> item, auto &sum) {
|
||||
[=] (sycl::id<1> item, auto &sum) {
|
||||
auto osite = item[0];
|
||||
sum +=Reduce(lat[osite]);
|
||||
});
|
||||
@@ -75,11 +75,11 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||
Word ret = 0;
|
||||
{
|
||||
sycl::buffer<Word, 1> abuff(&ret, {1});
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
|
||||
cgh.parallel_for(cl::sycl::range<1>{L},
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
|
||||
cgh.parallel_for(sycl::range<1>{L},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> index, auto &sum) {
|
||||
[=] (sycl::id<1> index, auto &sum) {
|
||||
sum ^=vec[index];
|
||||
});
|
||||
});
|
||||
|
||||
@@ -55,7 +55,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
||||
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
|
||||
|
||||
//copy offsets to device
|
||||
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
|
||||
|
||||
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
||||
@@ -88,7 +88,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
|
||||
//sync after copy
|
||||
accelerator_barrier();
|
||||
@@ -141,11 +141,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||
});
|
||||
|
||||
for (int r = 0; r < rd; r++) {
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
|
||||
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||
auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
|
||||
cgh.parallel_for(sycl::range<1>{subvol_size},
|
||||
Reduction,
|
||||
[=](cl::sycl::id<1> item, auto &sum) {
|
||||
[=](sycl::id<1> item, auto &sum) {
|
||||
auto s = item[0];
|
||||
sum += rb_p[r*subvol_size+s];
|
||||
});
|
||||
|
||||
@@ -466,9 +466,15 @@ public:
|
||||
static deviceVector<vobj> recv_buf;
|
||||
send_buf.resize(buffer_size*2*depth);
|
||||
recv_buf.resize(buffer_size*2*depth);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
static hostVector<vobj> hsend_buf;
|
||||
static hostVector<vobj> hrecv_buf;
|
||||
hsend_buf.resize(buffer_size*2*depth);
|
||||
hrecv_buf.resize(buffer_size*2*depth);
|
||||
#endif
|
||||
|
||||
std::vector<CommsRequest_t> fwd_req;
|
||||
std::vector<CommsRequest_t> bwd_req;
|
||||
std::vector<MpiCommsRequest_t> fwd_req;
|
||||
std::vector<MpiCommsRequest_t> bwd_req;
|
||||
|
||||
int words = buffer_size;
|
||||
int bytes = words * sizeof(vobj);
|
||||
@@ -495,9 +501,16 @@ public:
|
||||
t_gather+=usecond()-t;
|
||||
|
||||
t=usecond();
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFromBegin(fwd_req,
|
||||
(void *)&send_buf[d*buffer_size], xmit_to_rank,
|
||||
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||
#else
|
||||
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
|
||||
grid->SendToRecvFromBegin(fwd_req,
|
||||
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
|
||||
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||
#endif
|
||||
t_comms+=usecond()-t;
|
||||
}
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
@@ -508,9 +521,16 @@ public:
|
||||
t_gather+= usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFromBegin(bwd_req,
|
||||
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||
#else
|
||||
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
|
||||
grid->SendToRecvFromBegin(bwd_req,
|
||||
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||
#endif
|
||||
t_comms+=usecond()-t;
|
||||
}
|
||||
|
||||
@@ -533,8 +553,13 @@ public:
|
||||
|
||||
t=usecond();
|
||||
grid->CommsComplete(fwd_req);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
|
||||
}
|
||||
#endif
|
||||
t_comms+= usecond() - t;
|
||||
|
||||
|
||||
t=usecond();
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
|
||||
@@ -543,6 +568,11 @@ public:
|
||||
|
||||
t=usecond();
|
||||
grid->CommsComplete(bwd_req);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
|
||||
}
|
||||
#endif
|
||||
t_comms+= usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
|
||||
@@ -98,7 +98,7 @@ public:
|
||||
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
|
||||
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
|
||||
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// virtual smeared interface through configuration container
|
||||
/////////////////////////////////////////////////////////////
|
||||
@@ -132,6 +132,10 @@ public:
|
||||
template <class GaugeField >
|
||||
class EmptyAction : public Action <GaugeField>
|
||||
{
|
||||
using Action<GaugeField>::refresh;
|
||||
using Action<GaugeField>::Sinitial;
|
||||
using Action<GaugeField>::deriv;
|
||||
|
||||
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
|
||||
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
|
||||
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative
|
||||
|
||||
@@ -55,6 +55,11 @@ public:
|
||||
RealD alpha; // Mobius scale
|
||||
RealD k; // EOFA normalization constant
|
||||
|
||||
// Device resident
|
||||
deviceVector<Coeff_t> d_shift_coefficients;
|
||||
deviceVector<Coeff_t> d_MooeeInv_shift_lc;
|
||||
deviceVector<Coeff_t> d_MooeeInv_shift_norm;
|
||||
|
||||
virtual void Instantiatable(void) = 0;
|
||||
|
||||
// EOFA-specific operations
|
||||
@@ -92,6 +97,11 @@ public:
|
||||
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
|
||||
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
|
||||
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
|
||||
|
||||
d_shift_coefficients.resize(Ls);
|
||||
d_MooeeInv_shift_lc.resize(Ls);
|
||||
d_MooeeInv_shift_norm.resize(Ls);
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@@ -124,6 +124,11 @@ public:
|
||||
RealD _b;
|
||||
RealD _c;
|
||||
|
||||
// possible boost
|
||||
std::vector<ComplexD> qmu;
|
||||
void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
|
||||
void addQmu(const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
// Cayley form Moebius (tanh and zolotarev)
|
||||
std::vector<Coeff_t> omega;
|
||||
std::vector<Coeff_t> bs; // S dependent coeffs
|
||||
@@ -143,6 +148,17 @@ public:
|
||||
std::vector<Coeff_t> ueem;
|
||||
std::vector<Coeff_t> dee;
|
||||
|
||||
// Device memory
|
||||
deviceVector<Coeff_t> d_diag;
|
||||
deviceVector<Coeff_t> d_upper;
|
||||
deviceVector<Coeff_t> d_lower;
|
||||
|
||||
deviceVector<Coeff_t> d_lee;
|
||||
deviceVector<Coeff_t> d_dee;
|
||||
deviceVector<Coeff_t> d_uee;
|
||||
deviceVector<Coeff_t> d_leem;
|
||||
deviceVector<Coeff_t> d_ueem;
|
||||
|
||||
// Matrices of 5d ee inverse params
|
||||
// std::vector<iSinglet<Simd> > MatpInv;
|
||||
// std::vector<iSinglet<Simd> > MatmInv;
|
||||
|
||||
196
Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Normal file
196
Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Normal file
@@ -0,0 +1,196 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
|
||||
|
||||
Copyright (C) 2020 - 2025
|
||||
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Nils Meyer <nils.meyer@ur.de>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
|
||||
#include <Grid/qcd/action/fermion/CloverHelpers.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
|
||||
public WilsonCloverHelpers<Impl>,
|
||||
public CompactWilsonCloverHelpers<Impl> {
|
||||
/////////////////////////////////////////////
|
||||
// Sizes
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
INHERIT_COMPACT_CLOVER_SIZES(Impl);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Type definitions
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
INHERIT_CLOVER_TYPES(Impl);
|
||||
INHERIT_COMPACT_CLOVER_TYPES(Impl);
|
||||
|
||||
typedef WilsonFermion5D<Impl> WilsonBase;
|
||||
typedef WilsonCloverHelpers<Impl> Helpers;
|
||||
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Constructors
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
CompactWilsonCloverFermion5D(GaugeField& _Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
const RealD _mass,
|
||||
const RealD _csw_r = 0.0,
|
||||
const RealD _csw_t = 0.0,
|
||||
const RealD _cF = 1.0,
|
||||
const ImplParams& impl_p = ImplParams());
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member functions (implementing interface)
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
virtual void Instantiatable() {};
|
||||
int ConstEE() override { return 0; };
|
||||
int isTrivialEE() override { return 0; };
|
||||
|
||||
void Dhop(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||
|
||||
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
|
||||
|
||||
void M(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mdag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Meooe(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MeooeDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mooee(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeInv(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeInvDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||
|
||||
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
|
||||
|
||||
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
|
||||
|
||||
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||
|
||||
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member functions (internals)
|
||||
/////////////////////////////////////////////
|
||||
|
||||
void MooeeInternal(const FermionField& in,
|
||||
FermionField& out,
|
||||
const CloverDiagonalField& diagonal,
|
||||
const CloverTriangleField& triangle);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Helpers
|
||||
/////////////////////////////////////////////
|
||||
|
||||
void ImportGauge(const GaugeField& _Umu) override;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Helpers
|
||||
/////////////////////////////////////////////
|
||||
|
||||
private:
|
||||
|
||||
template<class Field>
|
||||
const MaskField* getCorrectMaskField(const Field &in) const {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
return &this->BoundaryMaskOdd;
|
||||
} else {
|
||||
return &this->BoundaryMaskEven;
|
||||
}
|
||||
} else {
|
||||
return &this->BoundaryMask;
|
||||
}
|
||||
}
|
||||
|
||||
template<class Field>
|
||||
void ApplyBoundaryMask(Field& f) {
|
||||
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
|
||||
assert(m != nullptr);
|
||||
CompactHelpers::ApplyBoundaryMask(f, *m);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member Data
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
RealD csw_r;
|
||||
RealD csw_t;
|
||||
RealD cF;
|
||||
int n_rhs;
|
||||
|
||||
bool fixedBoundaries;
|
||||
|
||||
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
|
||||
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
|
||||
|
||||
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
|
||||
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
|
||||
|
||||
FermionField Tmp;
|
||||
|
||||
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
@@ -60,6 +60,50 @@ public:
|
||||
// virtual void Instantiatable(void)=0;
|
||||
virtual void Instantiatable(void) =0;
|
||||
|
||||
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
|
||||
{
|
||||
std::cout << "Free Propagator for PartialFraction"<<std::endl;
|
||||
FermionField in_k(in.Grid());
|
||||
FermionField prop_k(in.Grid());
|
||||
|
||||
FFT theFFT((GridCartesian *) in.Grid());
|
||||
|
||||
//phase for boundary condition
|
||||
ComplexField coor(in.Grid());
|
||||
ComplexField ph(in.Grid()); ph = Zero();
|
||||
FermionField in_buf(in.Grid()); in_buf = Zero();
|
||||
typedef typename Simd::scalar_type Scalar;
|
||||
Scalar ci(0.0,1.0);
|
||||
assert(twist.size() == Nd);//check that twist is Nd
|
||||
assert(boundary.size() == Nd);//check that boundary conditions is Nd
|
||||
int shift = 0;
|
||||
for(unsigned int nu = 0; nu < Nd; nu++)
|
||||
{
|
||||
// Shift coordinate lattice index by 1 to account for 5th dimension.
|
||||
LatticeCoordinate(coor, nu + shift);
|
||||
double boundary_phase = ::acos(real(boundary[nu]));
|
||||
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
|
||||
//momenta for propagator shifted by twist+boundary
|
||||
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
|
||||
}
|
||||
in_buf = exp(ci*ph*(-1.0))*in;
|
||||
|
||||
theFFT.FFT_all_dim(in_k,in,FFT::forward);
|
||||
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
|
||||
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
|
||||
|
||||
//phase for boundary condition
|
||||
out = out * exp(ci*ph);
|
||||
};
|
||||
|
||||
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
|
||||
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
|
||||
std::vector<Complex> boundary;
|
||||
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
|
||||
FreePropagator(in,out,mass,boundary,twist);
|
||||
};
|
||||
|
||||
|
||||
// Efficient support for multigrid coarsening
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
|
||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
|
||||
|
||||
@@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
|
||||
NAMESPACE_CHECK(WilsonTM);
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
|
||||
NAMESPACE_CHECK(WilsonClover);
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
||||
NAMESPACE_CHECK(Wilson5D);
|
||||
@@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
|
||||
|
||||
// Compact Clover fermions
|
||||
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
|
||||
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
|
||||
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
|
||||
|
||||
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
|
||||
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
|
||||
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
|
||||
|
||||
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
|
||||
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
|
||||
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
|
||||
|
||||
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
|
||||
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
|
||||
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
|
||||
|
||||
@@ -42,7 +42,7 @@ public:
|
||||
|
||||
// Momentum-space propagator entry point: forwards all arguments unchanged to
// MomentumSpacePropagatorHw. `twist` is taken by value and passed straight on.
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
};
|
||||
|
||||
// Constructors
|
||||
OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
|
||||
|
||||
@@ -41,6 +41,10 @@ public:
|
||||
public:
|
||||
|
||||
// Constructors
|
||||
virtual void Instantiatable(void){};
|
||||
// Momentum-space propagator entry point: forwards all arguments unchanged to
// MomentumSpacePropagatorHw. `twist` is taken by value and passed straight on.
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
|
||||
OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
||||
@@ -41,6 +41,9 @@ public:
|
||||
public:
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
// Momentum-space propagator entry point: forwards all arguments unchanged to
// MomentumSpacePropagatorHw. `twist` is taken by value and passed straight on.
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
// Constructors
|
||||
OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
||||
@@ -40,6 +40,9 @@ public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
// Momentum-space propagator entry point: forwards all arguments unchanged to
// MomentumSpacePropagatorHw. `twist` is taken by value and passed straight on.
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
// Constructors
|
||||
OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
||||
@@ -41,6 +41,9 @@ public:
|
||||
public:
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
// Momentum-space propagator entry point: forwards all arguments unchanged to
// MomentumSpacePropagatorHw. `twist` is taken by value and passed straight on.
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
// Constructors
|
||||
OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
||||
@@ -40,6 +40,11 @@ public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
|
||||
// Momentum-space propagator entry point: forwards all arguments unchanged to
// MomentumSpacePropagatorHw. `twist` is taken by value and passed straight on.
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
|
||||
// Constructors
|
||||
OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
||||
@@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
|
||||
const int part_frac_chroma_convention=1;
|
||||
const int part_frac_chroma_convention=0;
|
||||
|
||||
void Meooe_internal(const FermionField &in, FermionField &out,int dag);
|
||||
void Mooee_internal(const FermionField &in, FermionField &out,int dag);
|
||||
@@ -83,11 +83,70 @@ public:
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,const ImplParams &p= ImplParams());
|
||||
|
||||
PartialFractionFermion5D(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
|
||||
|
||||
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
|
||||
{
|
||||
std::cout << "Free Propagator for PartialFraction"<<std::endl;
|
||||
FermionField in_k(in.Grid());
|
||||
FermionField prop_k(in.Grid());
|
||||
|
||||
FFT theFFT((GridCartesian *) in.Grid());
|
||||
|
||||
//phase for boundary condition
|
||||
ComplexField coor(in.Grid());
|
||||
ComplexField ph(in.Grid()); ph = Zero();
|
||||
FermionField in_buf(in.Grid()); in_buf = Zero();
|
||||
typedef typename Simd::scalar_type Scalar;
|
||||
Scalar ci(0.0,1.0);
|
||||
assert(twist.size() == Nd);//check that twist is Nd
|
||||
assert(boundary.size() == Nd);//check that boundary conditions is Nd
|
||||
int shift = 0;
|
||||
for(unsigned int nu = 0; nu < Nd; nu++)
|
||||
{
|
||||
// Shift coordinate lattice index by 1 to account for 5th dimension.
|
||||
LatticeCoordinate(coor, nu + shift);
|
||||
double boundary_phase = ::acos(real(boundary[nu]));
|
||||
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
|
||||
//momenta for propagator shifted by twist+boundary
|
||||
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
|
||||
}
|
||||
in_buf = exp(ci*ph*(-1.0))*in;
|
||||
|
||||
theFFT.FFT_all_dim(in_k,in,FFT::forward);
|
||||
if ( this->qmu.size() ){
|
||||
this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
|
||||
} else {
|
||||
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
|
||||
}
|
||||
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
|
||||
|
||||
//phase for boundary condition
|
||||
out = out * exp(ci*ph);
|
||||
};
|
||||
|
||||
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
|
||||
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
|
||||
std::vector<Complex> boundary;
|
||||
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
|
||||
FreePropagator(in,out,mass,boundary,twist);
|
||||
};
|
||||
|
||||
// Store the q_mu vector used by this operator; it must hold exactly Nd entries.
void set_qmu(std::vector<RealD> _qmu) {
  qmu = _qmu;
  assert(qmu.size() == Nd);
};
|
||||
void addQmu(const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
protected:
|
||||
|
||||
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
|
||||
virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
|
||||
|
||||
std::vector<RealD> qmu;
|
||||
|
||||
// Part frac
|
||||
RealD mass;
|
||||
RealD dw_diag;
|
||||
|
||||
@@ -414,29 +414,6 @@ public:
|
||||
// surface_list.resize(0);
|
||||
this->same_node.resize(npoints);
|
||||
};
|
||||
|
||||
/*
|
||||
void BuildSurfaceList(int Ls,int vol4){
|
||||
|
||||
// find same node for SHM
|
||||
// Here we know the distance is 1 for WilsonStencil
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
this->same_node[point] = this->SameNode(point);
|
||||
}
|
||||
|
||||
for(int site = 0 ;site< vol4;site++){
|
||||
int local = 1;
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
|
||||
local = 0;
|
||||
}
|
||||
}
|
||||
if(local == 0) {
|
||||
surface_list.push_back(site);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
template < class compressor>
|
||||
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
||||
@@ -507,6 +484,11 @@ public:
|
||||
this->face_table_computed=1;
|
||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||
accelerator_barrier();
|
||||
#ifdef NVLINK_GET
|
||||
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
||||
// Sync shared memory on a single node; could use an asynchronous barrier here and defer the check
|
||||
// Or issue barrier AFTER the DMA is running
|
||||
#endif
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
@@ -91,13 +91,13 @@ public:
|
||||
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
|
||||
|
||||
// half checkerboard operations; leave unimplemented as abstract for now
|
||||
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Meooe (const FermionField &in, FermionField &out);
|
||||
virtual void Mooee (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out);
|
||||
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
|
||||
@@ -109,6 +109,8 @@ public:
|
||||
void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
|
||||
std::vector<double> qmu) ;
|
||||
|
||||
// Implement the non-hermitian hopping term; half cb or both
|
||||
// Implement s-diagonal DW
|
||||
@@ -117,6 +119,9 @@ public:
|
||||
void DhopOE(const FermionField &in, FermionField &out,int dag);
|
||||
void DhopEO(const FermionField &in, FermionField &out,int dag);
|
||||
|
||||
void DhopComms (const FermionField &in, FermionField &out);
|
||||
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
|
||||
|
||||
// add a DhopComm
|
||||
// -- suboptimal interface will presently trigger multiple comms.
|
||||
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
|
||||
|
||||
@@ -57,6 +57,10 @@ public:
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
||||
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
uint64_t *ids);
|
||||
|
||||
static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
||||
@@ -48,7 +48,8 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
|
||||
FourDimGrid,
|
||||
FourDimRedBlackGrid,_M5,p),
|
||||
mass_plus(_mass), mass_minus(_mass)
|
||||
{
|
||||
{
|
||||
// qmu defaults to zero size;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
@@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
|
||||
M5Ddag(psi,psi,Din,lower,diag,upper);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
|
||||
{
|
||||
if ( qmu.size() ) {
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
std::vector<ComplexD> coeff(Nd);
|
||||
ComplexD ci(0,1);
|
||||
|
||||
assert(qmu.size()==Nd);
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
coeff[mu] = ci*qmu[mu];
|
||||
if ( dag ) coeff[mu] = conjugate(coeff[mu]);
|
||||
}
|
||||
|
||||
chi = chi + Gamma(Gmu[0])*psi*coeff[0];
|
||||
for(int mu=1;mu<Nd;mu++){
|
||||
chi = chi + Gamma(Gmu[mu])*psi*coeff[mu];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
@@ -277,8 +306,12 @@ void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
||||
|
||||
// Assemble Din
|
||||
Meooe5D(psi,Din);
|
||||
|
||||
|
||||
this->DW(Din,chi,DaggerNo);
|
||||
|
||||
// add i q_mu gamma_mu here
|
||||
addQmu(Din,chi,DaggerNo);
|
||||
|
||||
// ((b D_W + D_w hop terms +1) on s-diag
|
||||
axpby(chi,1.0,1.0,chi,psi);
|
||||
|
||||
@@ -295,6 +328,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
|
||||
FermionField Din(psi.Grid());
|
||||
// Apply Dw
|
||||
this->DW(psi,Din,DaggerYes);
|
||||
|
||||
// add -i conj(q_mu) gamma_mu here ... if qmu is real, gamma_5 hermitian, otherwise not.
|
||||
addQmu(psi,Din,DaggerYes);
|
||||
|
||||
MeooeDag5D(Din,chi);
|
||||
|
||||
@@ -488,7 +524,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
||||
leem.resize(Ls);
|
||||
uee.resize(Ls);
|
||||
ueem.resize(Ls);
|
||||
|
||||
|
||||
for(int i=0;i<Ls;i++){
|
||||
|
||||
dee[i] = bee[i];
|
||||
@@ -529,6 +565,18 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
|
||||
dee[Ls-1] += delta_d;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
// Device buffers
|
||||
//////////////////////////////////////////
|
||||
d_diag.resize(Ls);
|
||||
d_upper.resize(Ls);
|
||||
d_lower.resize(Ls);
|
||||
|
||||
d_dee.resize(Ls);
|
||||
d_lee.resize(Ls);
|
||||
d_uee.resize(Ls);
|
||||
d_leem.resize(Ls);
|
||||
d_ueem.resize(Ls);
|
||||
// int inv=1;
|
||||
// this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
|
||||
// this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
|
||||
|
||||
@@ -57,9 +57,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||
|
||||
int Ls =this->Ls;
|
||||
|
||||
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
@@ -99,9 +99,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||
|
||||
int Ls=this->Ls;
|
||||
|
||||
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
@@ -134,11 +134,11 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
|
||||
|
||||
int Ls=this->Ls;
|
||||
|
||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto plee = & d_lee [0];
|
||||
auto pdee = & d_dee [0];
|
||||
@@ -196,11 +196,11 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
|
||||
autoView(psi , psi_i,AcceleratorRead);
|
||||
autoView(chi , chi_i,AcceleratorWrite);
|
||||
|
||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto plee = & d_lee [0];
|
||||
auto pdee = & d_dee [0];
|
||||
|
||||
@@ -0,0 +1,376 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
|
||||
|
||||
Copyright (C) 2017 - 2025
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
template<class Impl, class CloverHelpers>
|
||||
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
const RealD _mass,
|
||||
const RealD _csw_r,
|
||||
const RealD _csw_t,
|
||||
const RealD _cF,
|
||||
const ImplParams& impl_p)
|
||||
: WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
|
||||
, csw_r(_csw_r)
|
||||
, csw_t(_csw_t)
|
||||
, cF(_cF)
|
||||
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
|
||||
, Diagonal(&FourDimGrid), Triangle(&FourDimGrid)
|
||||
, DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid)
|
||||
, DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid)
|
||||
, DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid)
|
||||
, DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
|
||||
, DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid)
|
||||
, Tmp(&FiveDimGrid)
|
||||
, BoundaryMask(&FiveDimGrid)
|
||||
, BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
|
||||
{
|
||||
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
|
||||
|
||||
csw_r *= 0.5;
|
||||
csw_t *= 0.5;
|
||||
//if (clover_anisotropy.isAnisotropic)
|
||||
// csw_r /= clover_anisotropy.xi_0;
|
||||
|
||||
ImportGauge(_Umu);
|
||||
if (fixedBoundaries) {
|
||||
this->BoundaryMaskEven.Checkerboard() = Even;
|
||||
this->BoundaryMaskOdd.Checkerboard() = Odd;
|
||||
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::Dhop(in, out, dag);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::DhopOE(in, out, dag);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::DhopEO(in, out, dag);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||
WilsonBase::DhopDir(in, out, dir, disp);
|
||||
if(this->fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||
WilsonBase::DhopDirAll(in, out);
|
||||
if(this->fixedBoundaries) {
|
||||
for(auto& o : out) ApplyBoundaryMask(o);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
|
||||
Mooee(in, Tmp);
|
||||
axpy(out, 1.0, out, Tmp);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
|
||||
MooeeDag(in, Tmp);
|
||||
axpy(out, 1.0, out, Tmp);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
|
||||
WilsonBase::Meooe(in, out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
|
||||
WilsonBase::MeooeDag(in, out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalEven, TriangleEven);
|
||||
}
|
||||
} else {
|
||||
MooeeInternal(in, out, Diagonal, Triangle);
|
||||
}
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
|
||||
Mooee(in, out); // blocks are hermitian
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
|
||||
}
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalInv, TriangleInv);
|
||||
}
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
|
||||
MooeeInv(in, out); // blocks are hermitian
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||
DhopDir(in, out, dir, disp);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||
DhopDirAll(in, out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
|
||||
assert(!fixedBoundaries); // TODO check for changes required for open bc
|
||||
|
||||
// NOTE: code copied from original clover term
|
||||
conformable(X.Grid(), Y.Grid());
|
||||
conformable(X.Grid(), force.Grid());
|
||||
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
|
||||
GaugeField clover_force(force.Grid());
|
||||
PropagatorField Lambda(force.Grid());
|
||||
|
||||
// Guido: Here we are hitting some performance issues:
|
||||
// need to extract the components of the DoubledGaugeField
|
||||
// for each call
|
||||
// Possible solution
|
||||
// Create a vector object to store them? (cons: wasting space)
|
||||
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
|
||||
|
||||
Impl::extractLinkField(U, this->Umu);
|
||||
|
||||
force = Zero();
|
||||
// Derivative of the Wilson hopping term
|
||||
this->DhopDeriv(force, X, Y, dag);
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Clover term derivative
|
||||
///////////////////////////////////////////////////////////
|
||||
Impl::outerProductImpl(Lambda, X, Y);
|
||||
//std::cout << "Lambda:" << Lambda << std::endl;
|
||||
|
||||
Gamma::Algebra sigma[] = {
|
||||
Gamma::Algebra::SigmaXY,
|
||||
Gamma::Algebra::SigmaXZ,
|
||||
Gamma::Algebra::SigmaXT,
|
||||
Gamma::Algebra::MinusSigmaXY,
|
||||
Gamma::Algebra::SigmaYZ,
|
||||
Gamma::Algebra::SigmaYT,
|
||||
Gamma::Algebra::MinusSigmaXZ,
|
||||
Gamma::Algebra::MinusSigmaYZ,
|
||||
Gamma::Algebra::SigmaZT,
|
||||
Gamma::Algebra::MinusSigmaXT,
|
||||
Gamma::Algebra::MinusSigmaYT,
|
||||
Gamma::Algebra::MinusSigmaZT};
|
||||
|
||||
/*
|
||||
sigma_{\mu \nu}=
|
||||
| 0 sigma[0] sigma[1] sigma[2] |
|
||||
| sigma[3] 0 sigma[4] sigma[5] |
|
||||
| sigma[6] sigma[7] 0 sigma[8] |
|
||||
| sigma[9] sigma[10] sigma[11] 0 |
|
||||
*/
|
||||
|
||||
int count = 0;
|
||||
clover_force = Zero();
|
||||
for (int mu = 0; mu < 4; mu++)
|
||||
{
|
||||
force_mu = Zero();
|
||||
for (int nu = 0; nu < 4; nu++)
|
||||
{
|
||||
if (mu == nu)
|
||||
continue;
|
||||
|
||||
RealD factor;
|
||||
if (nu == 4 || mu == 4)
|
||||
{
|
||||
factor = 2.0 * csw_t;
|
||||
}
|
||||
else
|
||||
{
|
||||
factor = 2.0 * csw_r;
|
||||
}
|
||||
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
|
||||
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
|
||||
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
|
||||
count++;
|
||||
}
|
||||
|
||||
pokeLorentz(clover_force, U[mu] * force_mu, mu);
|
||||
}
|
||||
//clover_force *= csw;
|
||||
force += clover_force;
|
||||
}
|
||||
|
||||
// Not implemented: the body consists solely of assert(0), and none of the
// arguments (mat, U, V, dag) is read or written.
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
// Not implemented: the body consists solely of assert(0), and none of the
// arguments (mat, U, V, dag) is read or written.
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
|
||||
FermionField& out,
|
||||
const CloverDiagonalField& diagonal,
|
||||
const CloverTriangleField& triangle) {
|
||||
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
conformable(in, out);
|
||||
CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
|
||||
// NOTE: parts copied from original implementation
|
||||
|
||||
// Import gauge into base class
|
||||
double t0 = usecond();
|
||||
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
|
||||
|
||||
// Initialize temporary variables
|
||||
double t1 = usecond();
|
||||
conformable(_Umu.Grid(), this->GaugeGrid());
|
||||
GridBase* grid = _Umu.Grid();
|
||||
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
||||
CloverField TmpOriginal(grid);
|
||||
CloverField TmpInverse(grid);
|
||||
|
||||
// Compute the field strength terms mu>nu
|
||||
double t2 = usecond();
|
||||
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
|
||||
|
||||
// Compute the Clover Operator acting on Colour and Spin
|
||||
// multiply here by the clover coefficients for the anisotropy
|
||||
double t3 = usecond();
|
||||
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
|
||||
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
|
||||
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
|
||||
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
|
||||
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
|
||||
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
|
||||
|
||||
// Instantiate the clover term
|
||||
// - In case of the standard clover the mass term is added
|
||||
// - In case of the exponential clover the clover term is exponentiated
|
||||
double t4 = usecond();
|
||||
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
|
||||
|
||||
// Convert the data layout of the clover term
|
||||
double t5 = usecond();
|
||||
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
|
||||
|
||||
// Modify the clover term at the temporal boundaries in case of open boundary conditions
|
||||
double t6 = usecond();
|
||||
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
|
||||
|
||||
// Invert the Clover term
|
||||
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
|
||||
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
|
||||
// TODO: For now this inversion is explictly done on the CPU
|
||||
double t7 = usecond();
|
||||
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
|
||||
|
||||
// Fill the remaining clover fields
|
||||
double t8 = usecond();
|
||||
pickCheckerboard(Even, DiagonalEven, Diagonal);
|
||||
pickCheckerboard(Even, TriangleEven, Triangle);
|
||||
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
|
||||
pickCheckerboard(Odd, TriangleOdd, Triangle);
|
||||
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
|
||||
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
|
||||
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
|
||||
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
|
||||
|
||||
// Report timings
|
||||
double t9 = usecond();
|
||||
|
||||
std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
|
||||
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
@@ -42,13 +42,13 @@ template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
|
||||
{
|
||||
// How to check Ls matches??
|
||||
// std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
|
||||
int Ls = this->Ls;
|
||||
std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
|
||||
assert(zdata->db==Ls);// Beta has Ls coeffs
|
||||
|
||||
R=(1+this->mass)/(1-this->mass);
|
||||
@@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
|
||||
int Ls = this->Ls;
|
||||
conformable(solution5d.Grid(),this->FermionGrid());
|
||||
conformable(exported4d.Grid(),this->GaugeGrid());
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, 0);
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
|
||||
@@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
|
||||
conformable(input4d.Grid() ,this->GaugeGrid());
|
||||
FermionField tmp(this->FermionGrid());
|
||||
tmp=Zero();
|
||||
InsertSlice(input4d, tmp, Ls-1, Ls-1);
|
||||
InsertSlice(input4d, tmp, Ls-1, 0);
|
||||
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
|
||||
this->Dminus(tmp,imported5d);
|
||||
}
|
||||
|
||||
@@ -51,13 +51,13 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
||||
autoView( chi , chi_i, AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
auto plower = &d_lower[0];
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
|
||||
@@ -89,14 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
|
||||
autoView( phi , phi_i, AcceleratorRead);
|
||||
autoView( chi , chi_i, AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
auto plower = &d_lower[0];
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
|
||||
@@ -125,18 +125,18 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
|
||||
autoView( chi, chi_i, AcceleratorWrite);
|
||||
int Ls = this->Ls;
|
||||
|
||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto plee = & d_lee [0];
|
||||
auto pdee = & d_dee [0];
|
||||
auto puee = & d_uee [0];
|
||||
auto pleem = & d_leem[0];
|
||||
auto pueem = & d_ueem[0];
|
||||
auto plee = & this->d_lee [0];
|
||||
auto pdee = & this->d_dee [0];
|
||||
auto puee = & this->d_uee [0];
|
||||
auto pleem = & this->d_leem[0];
|
||||
auto pueem = & this->d_ueem[0];
|
||||
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
uint64_t nloop=grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
uint64_t ss=sss*Ls;
|
||||
|
||||
@@ -50,14 +50,14 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
auto plower = &d_lower[0];
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
int nloop = grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
@@ -93,15 +93,15 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
auto plower = &d_lower[0];
|
||||
auto pshift_coeffs = &d_shift_coeffs[0];
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
auto pshift_coeffs = &this->d_shift_coefficients[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
int nloop = grid->oSites()/Ls;
|
||||
@@ -138,14 +138,14 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
auto plower = &d_lower[0];
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
int nloop = grid->oSites()/Ls;
|
||||
@@ -180,16 +180,16 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
auto plower = &d_lower[0];
|
||||
auto pshift_coeffs = &d_shift_coeffs[0];
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
auto pshift_coeffs = &this->d_shift_coefficients[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
auto pm = this->pm;
|
||||
|
||||
@@ -230,17 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
auto plee = & this->d_lee [0];
|
||||
auto pdee = & this->d_dee [0];
|
||||
auto puee = & this->d_uee [0];
|
||||
auto pleem = & this->d_leem[0];
|
||||
auto pueem = & this->d_ueem[0];
|
||||
|
||||
auto plee = & d_lee [0];
|
||||
auto pdee = & d_dee [0];
|
||||
auto puee = & d_uee [0];
|
||||
auto pleem = & d_leem[0];
|
||||
auto pueem = & d_ueem[0];
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
|
||||
|
||||
@@ -293,23 +293,22 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
// Move into object and constructor
|
||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pm = this->pm;
|
||||
auto plee = & d_lee [0];
|
||||
auto pdee = & d_dee [0];
|
||||
auto puee = & d_uee [0];
|
||||
auto pleem = & d_leem[0];
|
||||
auto pueem = & d_ueem[0];
|
||||
auto plee = & this->d_lee [0];
|
||||
auto pdee = & this->d_dee [0];
|
||||
auto puee = & this->d_uee [0];
|
||||
auto pleem = & this->d_leem[0];
|
||||
auto pueem = & this->d_ueem[0];
|
||||
auto pMooeeInv_shift_lc = &this->d_MooeeInv_shift_lc[0];
|
||||
auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
|
||||
|
||||
static deviceVector<Coeff_t> d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
|
||||
auto pMooeeInv_shift_lc = &d_MooeeInv_shift_lc[0];
|
||||
auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0];
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
int nloop = grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
@@ -367,17 +366,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
auto plee = &this->d_lee [0];
|
||||
auto pdee = &this->d_dee [0];
|
||||
auto puee = &this->d_uee [0];
|
||||
auto pleem = &this->d_leem[0];
|
||||
auto pueem = &this->d_ueem[0];
|
||||
|
||||
auto plee = & d_lee [0];
|
||||
auto pdee = & d_dee [0];
|
||||
auto puee = & d_uee [0];
|
||||
auto pleem = & d_leem[0];
|
||||
auto pueem = & d_ueem[0];
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
int nloop = grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
@@ -426,25 +425,23 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
int Ls = this->Ls;
|
||||
|
||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pm = this->pm;
|
||||
auto plee = & d_lee [0];
|
||||
auto pdee = & d_dee [0];
|
||||
auto puee = & d_uee [0];
|
||||
auto pleem = & d_leem[0];
|
||||
auto pueem = & d_ueem[0];
|
||||
auto plee = & this->d_lee [0];
|
||||
auto pdee = & this->d_dee [0];
|
||||
auto puee = & this->d_uee [0];
|
||||
auto pleem = & this->d_leem[0];
|
||||
auto pueem = & this->d_ueem[0];
|
||||
|
||||
static deviceVector<Coeff_t> d_MooeeInvDag_shift_lc(Ls);
|
||||
static deviceVector<Coeff_t> d_MooeeInvDag_shift_norm(Ls);
|
||||
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
|
||||
auto pMooeeInvDag_shift_lc = &d_MooeeInvDag_shift_lc[0];
|
||||
auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0];
|
||||
auto pMooeeInvDag_shift_lc = &this->d_MooeeInv_shift_lc[0];
|
||||
auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
|
||||
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
|
||||
// auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
|
||||
|
||||
@@ -237,7 +237,32 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
|
||||
// ( 0 -sqrt(p_i)*amax | 2 R gamma_5 + p0/amax 2H
|
||||
//
|
||||
|
||||
this->DW(psi,D,DaggerNo);
|
||||
this->DW(psi,D,DaggerNo);
|
||||
|
||||
// DW - DW+iqslash
|
||||
// (g5 Dw)^dag = g5 Dw
|
||||
// (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu
|
||||
if ( qmu.size() ) {
|
||||
|
||||
std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
|
||||
assert(qmu.size()==Nd);
|
||||
|
||||
FermionField qslash_psi(psi.Grid());
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);
|
||||
for(int mu=1;mu<Nd;mu++){
|
||||
qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
|
||||
}
|
||||
ComplexD ci(0.0,1.0);
|
||||
qslash_psi = ci*qslash_psi ; // i qslash
|
||||
D = D + qslash_psi;
|
||||
}
|
||||
|
||||
int nblock=(Ls-1)/2;
|
||||
for(int b=0;b<nblock;b++){
|
||||
@@ -255,15 +280,55 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
|
||||
}
|
||||
|
||||
{
|
||||
// The 'conventional' Cayley overlap operator is
|
||||
//
|
||||
// Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
|
||||
//
|
||||
//
|
||||
// With massless limit 1/2(1+g5 sgnHw)
|
||||
//
|
||||
// Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
|
||||
//
|
||||
// However, the conventional normalisation has both a leading order factor of 2 in Zq
|
||||
// at tree level AND a mass dependent (1-m) that are convenient to absorb.
|
||||
//
|
||||
// In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
|
||||
//
|
||||
// num = -i sin kmu gmu
|
||||
//
|
||||
// denom ( sqrt(sk^2 + (2shk^2 - 1)^2
|
||||
// b_k = sk2 - M5;
|
||||
//
|
||||
// w_k = sqrt(sk + b_k*b_k);
|
||||
//
|
||||
// denom= ( w_k + b_k + mass*mass) ;
|
||||
//
|
||||
// denom= one/denom;
|
||||
// out = num*denom;
|
||||
//
|
||||
// Chroma, and Grid define partial fraction via 4d operator
|
||||
//
|
||||
// Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
|
||||
//
|
||||
// Now since:
|
||||
//
|
||||
// (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
|
||||
//
|
||||
// This corresponds to a modified mass parameter
|
||||
//
|
||||
// It has an annoying
|
||||
//
|
||||
//
|
||||
double R=(1+this->mass)/(1-this->mass);
|
||||
//R g5 psi[Ls] + p[0] H
|
||||
//R g5 psi[Ls] + p[0] Hw
|
||||
ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
|
||||
|
||||
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b+1;
|
||||
double pp = p[nblock-1-b];
|
||||
axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@@ -411,17 +476,18 @@ void PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
|
||||
int Ls = this->Ls;
|
||||
conformable(solution5d.Grid(),this->FermionGrid());
|
||||
conformable(exported4d.Grid(),this->GaugeGrid());
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, 0);
|
||||
}
|
||||
// Builds the 5d source `imported5d` from the 4d source `input4d`.
// Steps visible below: check that both fields live on the expected grids,
// zero a 5d temporary, insert input4d into it with InsertSlice, multiply the
// temporary by Gamma5, and finally apply Dminus to produce imported5d.
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
|
||||
{
|
||||
//void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
|
||||
int Ls = this->Ls;
|
||||
// imported5d must be on the 5d fermion grid, input4d on the 4d gauge grid
conformable(imported5d.Grid(),this->FermionGrid());
|
||||
conformable(input4d.Grid() ,this->GaugeGrid());
|
||||
FermionField tmp(this->FermionGrid());
|
||||
tmp=Zero();
|
||||
// NOTE(review): two InsertSlice calls follow that differ only in the last
// argument (`orthog` per the signature quoted above: Ls-1 vs 0). This text
// looks like a diff rendering that shows the removed and the added line side
// by side -- confirm which single call is intended before relying on it.
InsertSlice(input4d, tmp, Ls-1, Ls-1);
|
||||
InsertSlice(input4d, tmp, Ls-1, 0);
|
||||
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
|
||||
this->Dminus(tmp,imported5d);
|
||||
}
|
||||
@@ -442,7 +508,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
|
||||
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
qmu.resize(0);
|
||||
assert((Ls&0x1)==1); // Odd Ls required
|
||||
int nrational=Ls-1;
|
||||
|
||||
@@ -460,6 +526,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
|
||||
Approx::zolotarev_free(zdata);
|
||||
|
||||
}
|
||||
template<class Impl>
|
||||
PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,
|
||||
std::vector<RealD> &_qmu,
|
||||
const ImplParams &p)
|
||||
: PartialFractionFermion5D<Impl>(_Umu,
|
||||
FiveDimGrid,FiveDimRedBlackGrid,
|
||||
FourDimGrid,FourDimRedBlackGrid,
|
||||
_mass,M5,p)
|
||||
{
|
||||
qmu=_qmu;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
|
||||
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@@ -332,22 +333,18 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
||||
|
||||
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
auto id=traceStart("Communicate overlapped");
|
||||
st.CommunicateBegin(requests);
|
||||
|
||||
#if 1
|
||||
/////////////////////////////
|
||||
// Overlap with comms
|
||||
/////////////////////////////
|
||||
{
|
||||
// std::cout << " WilsonFermion5D Comms merge " <<std::endl;
|
||||
GRID_TRACE("MergeSHM");
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
}
|
||||
|
||||
st.CommunicateBegin(requests);
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
#endif
|
||||
|
||||
/////////////////////////////
|
||||
// do the compute interior
|
||||
/////////////////////////////
|
||||
// std::cout << " WilsonFermion5D Interior " <<std::endl;
|
||||
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
||||
if (dag == DaggerYes) {
|
||||
GRID_TRACE("DhopDagInterior");
|
||||
@@ -356,13 +353,23 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
||||
GRID_TRACE("DhopInterior");
|
||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
||||
}
|
||||
|
||||
|
||||
//ifdef GRID_ACCELERATED
|
||||
#if 0
|
||||
/////////////////////////////
|
||||
// Overlap with comms -- on GPU the interior kernel call is nonblocking
|
||||
/////////////////////////////
|
||||
st.CommunicateBegin(requests);
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
#endif
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Complete comms
|
||||
/////////////////////////////
|
||||
// std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
|
||||
st.CommunicateComplete(requests);
|
||||
traceStop(id);
|
||||
// traceStop(id);
|
||||
|
||||
/////////////////////////////
|
||||
// do the compute exterior
|
||||
@@ -438,6 +445,29 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
|
||||
|
||||
DhopInternal(StencilOdd,UmuEven,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out)
|
||||
{
|
||||
int dag =0 ;
|
||||
conformable(in.Grid(),FermionGrid()); // verifies full grid
|
||||
conformable(in.Grid(),out.Grid());
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Compressor compressor(dag);
|
||||
Stencil.HaloExchangeOpt(in,compressor);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
|
||||
{
|
||||
conformable(in.Grid(),FermionGrid()); // verifies full grid
|
||||
conformable(in.Grid(),out.Grid());
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
@@ -455,6 +485,54 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
|
||||
Dhop(in,out,dag); // -0.5 is included
|
||||
axpy(out,4.0-M5,in,out);
|
||||
}
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
|
||||
{
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerNo);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerNo);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerYes);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerYes);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
typename FermionField::scalar_type scal(4.0 + M5);
|
||||
out = scal * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Mooee(in, out);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
out = (1.0/(4.0 + M5))*in;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
MooeeInv(in,out);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)
|
||||
@@ -740,6 +818,15 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist)
|
||||
{
|
||||
std::vector<double> empty_q(Nd,0.0);
|
||||
MomentumSpacePropagatorHwQ(out,in,mass,twist,empty_q);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,
|
||||
RealD mass,
|
||||
std::vector<double> twist,
|
||||
std::vector<double> qmu)
|
||||
{
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
@@ -755,6 +842,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
|
||||
typedef typename FermionField::scalar_type ScalComplex;
|
||||
|
||||
typedef Lattice<iSinglet<vector_type> > LatComplex;
|
||||
typedef iSpinMatrix<ScalComplex> SpinMat;
|
||||
|
||||
|
||||
Coordinate latt_size = _grid->_fdimensions;
|
||||
@@ -772,8 +860,10 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
|
||||
LatComplex kmu(_grid);
|
||||
ScalComplex ci(0.0,1.0);
|
||||
|
||||
std::cout<< "Feynman Rule" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
|
||||
|
||||
for(int mu=0;mu<Nd;mu++) {
|
||||
|
||||
|
||||
LatticeCoordinate(kmu,mu);
|
||||
|
||||
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
||||
@@ -782,9 +872,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
|
||||
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
|
||||
|
||||
sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
|
||||
sk = sk + sin(kmu)*sin(kmu);
|
||||
|
||||
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);
|
||||
sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]);
|
||||
|
||||
// Terms for boosted Fermion
|
||||
// 1/2 [ -i gamma.(sin p + q ) ]
|
||||
// [ --------------------- + 1 ]
|
||||
// [ wq + b ]
|
||||
//
|
||||
// wq = sqrt( (sinp+q)^2 + b^2 )
|
||||
//
|
||||
|
||||
num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in);
|
||||
|
||||
}
|
||||
num = num + mass * in ;
|
||||
|
||||
@@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
} else { \
|
||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||
} \
|
||||
acceleratorSynchronise(); \
|
||||
acceleratorSynchronise(); \
|
||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||
Recon(result, Uchi);
|
||||
|
||||
@@ -411,6 +411,46 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
#undef LoopBody
|
||||
}
|
||||
|
||||
// MAKE_ID(A) yields an identifier for the hardware unit executing the current work item.
// The argument A is ignored in every variant.
//  - GRID_SYCL && GRID_SIMT : packs eu id (<<16), slice id (<<8) and subslice id into one value.
//  - otherwise              : constant 0.
#ifdef GRID_SYCL
// intel_* query functions, declared here with C linkage so they can be called from device code.
// NOTE(review): presumably provided by the Intel device compiler/runtime -- confirm.
extern "C" {
  ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void );
  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void );
  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void );
  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void );
  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void );
  void  SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
}
#ifdef GRID_SIMT
// Whole replacement list is parenthesised so the macro behaves as a single operand
// wherever it is expanded (matching the "(0)" fallbacks below).
#define MAKE_ID(A) ((intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id()))
#else
#define MAKE_ID(A) (0)
#endif

#else

#define MAKE_ID(A) (0)

#endif
|
||||
|
||||
|
||||
// KERNEL_CALL_ID(A): launch site kernel WilsonKernels<Impl>::A over Nsite*Ls sites and,
// for every (site, SIMT lane) pair, store MAKE_ID() into ids[sF*Nsimd+lane].
// The launch is issued with accelerator_forNB and followed by accelerator_barrier().
// Expects these names to be in scope at the expansion site:
//   Nsite, Ls, st_v, U_v, buf, in_v, out_v, ids.
#define KERNEL_CALL_ID(A) \
|
||||
const uint64_t NN = Nsite*Ls; \
|
||||
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
||||
int sF = ss; \
|
||||
int sU = ss/Ls; \
|
||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd(); \
|
||||
const int lane=acceleratorSIMTlane(Nsimd); \
|
||||
int idx=sF*Nsimd+lane; \
|
||||
uint64_t id = MAKE_ID(); \
|
||||
ids[idx]=id; \
|
||||
}); \
|
||||
accelerator_barrier();
|
||||
|
||||
#define KERNEL_CALLNB(A) \
|
||||
const uint64_t NN = Nsite*Ls; \
|
||||
@@ -418,7 +458,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
int sF = ss; \
|
||||
int sU = ss/Ls; \
|
||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||
});
|
||||
});
|
||||
|
||||
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
|
||||
|
||||
@@ -451,6 +491,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
|
||||
});}
|
||||
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
@@ -475,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
// dependent on result of merge
|
||||
// // dependent on result of merge
|
||||
acceleratorFenceComputeStream();
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
|
||||
@@ -485,6 +527,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
}
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
|
||||
|
||||
// DhopKernel overload that records an id per site/lane into `ids`.
// Opens accelerator views of U, in, st (read) and out (write), then expands
// KERNEL_CALL_ID(GenericDhopSite).  The view names U_v/in_v/out_v/st_v and the
// parameters buf, Ls, Nsite, ids are referenced by name inside that macro, so they
// must not be renamed.  Opt is not consulted in this body: GenericDhopSite is always used.
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
uint64_t *ids)
|
||||
{
|
||||
autoView(U_v , U,AcceleratorRead);
|
||||
autoView(in_v , in,AcceleratorRead);
|
||||
autoView(out_v,out,AcceleratorWrite);
|
||||
autoView(st_v , st,AcceleratorRead);
|
||||
KERNEL_CALL_ID(GenericDhopSite);
|
||||
}
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/instantiation/CompactWilsonCloverFermion5DInstantiation.cc.master
|
||||
|
||||
Copyright (C) 2017 - 2025
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Mattia Bruno <mattia.bruno@cern.ch>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/CloverHelpers.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
|
||||
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
@@ -0,0 +1 @@
|
||||
../CompactWilsonCloverFermion5DInstantiation.cc.master
|
||||
@@ -0,0 +1 @@
|
||||
../CompactWilsonCloverFermion5DInstantiation.cc.master
|
||||
@@ -62,7 +62,7 @@ do
|
||||
done
|
||||
done
|
||||
|
||||
CC_LIST="CompactWilsonCloverFermionInstantiation"
|
||||
CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"
|
||||
|
||||
for impl in $COMPACT_WILSON_IMPL_LIST
|
||||
do
|
||||
|
||||
@@ -40,6 +40,11 @@ public:
|
||||
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
using Action<GaugeField>::S;
|
||||
using Action<GaugeField>::Sinitial;
|
||||
using Action<GaugeField>::deriv;
|
||||
using Action<GaugeField>::refresh;
|
||||
|
||||
private:
|
||||
RealD c_plaq;
|
||||
RealD c_rect;
|
||||
@@ -71,27 +76,27 @@ public:
|
||||
return action;
|
||||
};
|
||||
|
||||
virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
|
||||
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
|
||||
//extend Ta to include Lorentz indexes
|
||||
RealD factor_p = c_plaq/RealD(Nc)*0.5;
|
||||
RealD factor_r = c_rect/RealD(Nc)*0.5;
|
||||
|
||||
GridBase *grid = Umu.Grid();
|
||||
GridBase *grid = U.Grid();
|
||||
|
||||
std::vector<GaugeLinkField> U (Nd,grid);
|
||||
std::vector<GaugeLinkField> Umu (Nd,grid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
|
||||
}
|
||||
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
|
||||
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
|
||||
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);
|
||||
|
||||
GaugeLinkField dSdU_mu(grid);
|
||||
GaugeLinkField staple(grid);
|
||||
|
||||
for (int mu=0; mu < Nd; mu++){
|
||||
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
|
||||
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
|
||||
|
||||
dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
|
||||
dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;
|
||||
|
||||
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
|
||||
}
|
||||
|
||||
|
||||
@@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
using Action<GaugeField>::S;
|
||||
using Action<GaugeField>::Sinitial;
|
||||
using Action<GaugeField>::deriv;
|
||||
using Action<GaugeField>::refresh;
|
||||
|
||||
/////////////////////////// constructors
|
||||
explicit WilsonGaugeAction(RealD beta_):beta(beta_){};
|
||||
|
||||
@@ -68,20 +73,23 @@ public:
|
||||
// extend Ta to include Lorentz indexes
|
||||
|
||||
RealD factor = 0.5 * beta / RealD(Nc);
|
||||
GridBase *grid = U.Grid();
|
||||
|
||||
GaugeLinkField Umu(U.Grid());
|
||||
GaugeLinkField dSdU_mu(U.Grid());
|
||||
GaugeLinkField dSdU_mu(grid);
|
||||
std::vector<GaugeLinkField> Umu(Nd, grid);
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
|
||||
}
|
||||
|
||||
Umu = PeekIndex<LorentzIndex>(U, mu);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
// Staple in direction mu
|
||||
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
|
||||
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
|
||||
|
||||
WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
|
||||
dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;
|
||||
|
||||
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
RealD beta;
|
||||
};
|
||||
|
||||
@@ -111,8 +111,8 @@ public:
|
||||
};
|
||||
|
||||
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
||||
@@ -75,7 +75,7 @@ public:
|
||||
GridParallelRNG &pRNG) {
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
GridBase *grid = SmartConfig.get_U(false).Grid();
|
||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||
@@ -102,7 +102,7 @@ public:
|
||||
if ( Params.saveSmeared ) {
|
||||
IldgWriter _IldgWriter(grid->IsBoss());
|
||||
_IldgWriter.open(smr);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
|
||||
_IldgWriter.close();
|
||||
|
||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
|
||||
@@ -118,8 +118,8 @@ public:
|
||||
|
||||
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
||||
@@ -107,8 +107,8 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
||||
|
||||
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
||||
@@ -62,15 +62,15 @@ accelerator_inline int stencilIndex(int mu, int nu) {
|
||||
|
||||
|
||||
/*! @brief structure holding the link treatment */
|
||||
struct SmearingParameters{
|
||||
SmearingParameters(){}
|
||||
struct HISQSmearingParameters{
|
||||
HISQSmearingParameters(){}
|
||||
Real c_1; // 1 link
|
||||
Real c_naik; // Naik term
|
||||
Real c_3; // 3 link
|
||||
Real c_5; // 5 link
|
||||
Real c_7; // 7 link
|
||||
Real c_lp; // 5 link Lepage
|
||||
SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
|
||||
HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
|
||||
: c_1(c1),
|
||||
c_naik(cnaik),
|
||||
c_3(c3),
|
||||
@@ -86,7 +86,7 @@ class Smear_HISQ : public Gimpl {
|
||||
|
||||
private:
|
||||
GridCartesian* const _grid;
|
||||
SmearingParameters _linkTreatment;
|
||||
HISQSmearingParameters _linkTreatment;
|
||||
|
||||
public:
|
||||
|
||||
@@ -117,7 +117,7 @@ public:
|
||||
// IN--u_thin
|
||||
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
|
||||
|
||||
SmearingParameters lt = this->_linkTreatment;
|
||||
HISQSmearingParameters lt = this->_linkTreatment;
|
||||
auto grid = this->_grid;
|
||||
|
||||
// Create a padded cell of extra padding depth=1 and fill the padding.
|
||||
|
||||
@@ -207,11 +207,14 @@ std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(con
|
||||
}
|
||||
|
||||
template <class Gimpl>
|
||||
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
|
||||
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int meas_interval){
|
||||
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
|
||||
});
|
||||
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Energy density (cloverleaf) : " << step << " " << t << " " << energyDensityCloverleaf(t,U) << std::endl;
|
||||
});
|
||||
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
|
||||
});
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Map a su2 subgroup number to the pair of rows that are non zero
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
||||
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
||||
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
|
||||
|
||||
int spare = su2_index;
|
||||
|
||||
@@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) {
|
||||
// Map a su2 subgroup number to the pair of rows that are non zero
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
template <ONLY_IF_Sp>
|
||||
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
|
||||
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
|
||||
const int nsp=ncolour/2;
|
||||
assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));
|
||||
|
||||
|
||||
@@ -292,19 +292,21 @@ public:
|
||||
//////////////////////////////////////////////////
|
||||
// the sum over all nu-oriented staples for nu != mu on each site
|
||||
//////////////////////////////////////////////////
|
||||
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {
|
||||
|
||||
GridBase *grid = Umu.Grid();
|
||||
|
||||
std::vector<GaugeMat> U(Nd, grid);
|
||||
std::vector<GaugeMat> Umu(Nd, U.Grid());
|
||||
for (int d = 0; d < Nd; d++) {
|
||||
U[d] = PeekIndex<LorentzIndex>(Umu, d);
|
||||
Umu[d] = PeekIndex<LorentzIndex>(U, d);
|
||||
}
|
||||
Staple(staple, U, mu);
|
||||
Staple(staple, Umu, mu);
|
||||
}
|
||||
|
||||
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
|
||||
staple = Zero();
|
||||
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {
|
||||
|
||||
autoView(staple_v, staple, AcceleratorWrite);
|
||||
accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
|
||||
staple_v[i] = Zero();
|
||||
});
|
||||
|
||||
for (int nu = 0; nu < Nd; nu++) {
|
||||
|
||||
@@ -318,12 +320,12 @@ public:
|
||||
// |
|
||||
// __|
|
||||
//
|
||||
|
||||
|
||||
staple += Gimpl::ShiftStaple(
|
||||
Gimpl::CovShiftForward(
|
||||
U[nu], nu,
|
||||
Umu[nu], nu,
|
||||
Gimpl::CovShiftBackward(
|
||||
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
|
||||
Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
|
||||
mu);
|
||||
|
||||
// __
|
||||
@@ -333,8 +335,8 @@ public:
|
||||
//
|
||||
|
||||
staple += Gimpl::ShiftStaple(
|
||||
Gimpl::CovShiftBackward(U[nu], nu,
|
||||
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
|
||||
Gimpl::CovShiftBackward(Umu[nu], nu,
|
||||
Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,17 +121,22 @@ class CartesianStencilAccelerator {
|
||||
StencilVector same_node;
|
||||
Coordinate _simd_layout;
|
||||
Parameters parameters;
|
||||
ViewMode mode;
|
||||
StencilEntry* _entries_p;
|
||||
StencilEntry* _entries_host_p;
|
||||
cobj* u_recv_buf_p;
|
||||
cobj* u_send_buf_p;
|
||||
|
||||
accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; }
|
||||
|
||||
accelerator_inline int GetNodeLocal(int osite,int point) const {
|
||||
return this->_entries_p[point+this->_npoints*osite]._is_local;
|
||||
// Not a device function
|
||||
inline int GetNodeLocal(int osite,int point) const {
|
||||
StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite];
|
||||
return SE._is_local;
|
||||
}
|
||||
accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const {
|
||||
ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite];
|
||||
ptype = this->_permute_type[point];
|
||||
return & this->_entries_p[point+this->_npoints*osite];
|
||||
}
|
||||
|
||||
accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const {
|
||||
@@ -164,28 +169,22 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parame
|
||||
{
|
||||
public:
|
||||
int *closed;
|
||||
StencilEntry *cpu_ptr;
|
||||
ViewMode mode;
|
||||
// StencilEntry *cpu_ptr;
|
||||
public:
|
||||
// default copy constructor
|
||||
CartesianStencilView (const CartesianStencilView &refer_to_me) = default;
|
||||
|
||||
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
|
||||
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me),
|
||||
cpu_ptr(this->_entries_p),
|
||||
mode(_mode)
|
||||
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me)
|
||||
{
|
||||
this->_entries_p =(StencilEntry *)
|
||||
MemoryManager::ViewOpen(this->_entries_p,
|
||||
this->_npoints*this->_osites*sizeof(StencilEntry),
|
||||
mode,
|
||||
AdviseDefault);
|
||||
this->ViewOpen(_mode);
|
||||
}
|
||||
void ViewOpen(ViewMode _mode)
|
||||
{
|
||||
this->mode = _mode;
|
||||
}
|
||||
|
||||
void ViewClose(void)
|
||||
{
|
||||
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
|
||||
}
|
||||
void ViewClose(void) { }
|
||||
|
||||
};
|
||||
|
||||
@@ -274,8 +273,8 @@ public:
|
||||
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
||||
deviceVector<int> surface_list;
|
||||
|
||||
std::vector<StencilEntry> _entries; // Resident in host memory
|
||||
deviceVector<StencilEntry> _entries_device; // Resident in device memory
|
||||
std::vector<StencilEntry> _entries; // Resident in host memory
|
||||
deviceVector<StencilEntry> _entries_device; // Resident in device memory
|
||||
std::vector<Packet> Packets;
|
||||
std::vector<Merge> Mergers;
|
||||
std::vector<Merge> MergersSHM;
|
||||
@@ -364,11 +363,32 @@ public:
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
// std::cout << "Communicate Begin "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
FlightRecorder::StepLog("Communicate begin");
|
||||
// All GPU kernel tasks must complete
|
||||
accelerator_barrier(); // All kernels should ALREADY be complete
|
||||
_grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||
// accelerator_barrier(); // All kernels should ALREADY be complete
|
||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||
// But the HaloGather had a barrier too.
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
// std::cout << "Communicate prepare "<<i<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
}
|
||||
// std::cout << "Communicate PollDtoH "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
|
||||
// std::cout << "Communicate CopySynch "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
acceleratorCopySynchronise();
|
||||
// Starts intranode
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
// std::cout << "Communicate Begin "<<i<<std::endl;
|
||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
@@ -386,18 +406,25 @@ public:
|
||||
|
||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
// std::cout << "Communicate Complete "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
FlightRecorder::StepLog("Start communicate complete");
|
||||
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
|
||||
// std::cout << "Communicate Complete Complete "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||
if ( this->partialDirichlet ) DslashLogPartial();
|
||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
else DslashLogFull();
|
||||
acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
||||
accelerator_barrier();
|
||||
_grid->StencilBarrier();
|
||||
// run any checksums
|
||||
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
||||
// accelerator_barrier();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
if ( Packets[i].do_recv )
|
||||
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
|
||||
}
|
||||
FlightRecorder::StepLog("Finish communicate complete");
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Blocking send and receive. Either sequential or parallel.
|
||||
@@ -419,6 +446,7 @@ public:
|
||||
Communicate();
|
||||
CommsMergeSHM(compress);
|
||||
CommsMerge(compress);
|
||||
accelerator_barrier();
|
||||
}
|
||||
|
||||
template<class compressor> int HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
|
||||
@@ -473,7 +501,10 @@ public:
|
||||
template<class compressor>
|
||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||
{
|
||||
accelerator_barrier();
|
||||
// accelerator_barrier();
|
||||
//////////////////////////////////
|
||||
// I will overwrite my send buffers
|
||||
//////////////////////////////////
|
||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||
|
||||
assert(source.Grid()==_grid);
|
||||
@@ -487,7 +518,11 @@ public:
|
||||
HaloGatherDir(source,compress,point,face_idx);
|
||||
}
|
||||
accelerator_barrier(); // All my local gathers are complete
|
||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||
#ifdef NVLINK_GET
|
||||
_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
||||
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
||||
// Or issue barrier AFTER the DMA is running
|
||||
#endif
|
||||
face_table_computed=1;
|
||||
assert(u_comm_offset==_unified_buffer_size);
|
||||
}
|
||||
@@ -526,6 +561,7 @@ public:
|
||||
coalescedWrite(to[j] ,coalescedRead(from [j]));
|
||||
});
|
||||
acceleratorFenceComputeStream();
|
||||
// Also fenced in WilsonKernels
|
||||
}
|
||||
}
|
||||
|
||||
@@ -623,10 +659,10 @@ public:
|
||||
////////////////////////////////////////
|
||||
void PrecomputeByteOffsets(void){
|
||||
for(int i=0;i<_entries.size();i++){
|
||||
if( _entries[i]._is_local ) {
|
||||
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
|
||||
if( this->_entries[i]._is_local ) {
|
||||
this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj);
|
||||
} else {
|
||||
_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
|
||||
this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj);
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -654,7 +690,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||
// std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
||||
surface_list.resize(surface_list_size);
|
||||
std::vector<int> surface_list_host(surface_list_size);
|
||||
int32_t ss=0;
|
||||
@@ -674,6 +710,7 @@ public:
|
||||
}
|
||||
}
|
||||
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
||||
// std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
|
||||
}
|
||||
/// Introduce a block structure and switch off comms on boundaries
|
||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||
@@ -761,7 +798,13 @@ public:
|
||||
this->_osites = _grid->oSites();
|
||||
|
||||
_entries.resize(this->_npoints* this->_osites);
|
||||
this->_entries_p = &_entries[0];
|
||||
_entries_device.resize(this->_npoints* this->_osites);
|
||||
this->_entries_host_p = &_entries[0];
|
||||
this->_entries_p = &_entries_device[0];
|
||||
|
||||
// std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
|
||||
// <<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
|
||||
|
||||
for(int ii=0;ii<npoints;ii++){
|
||||
|
||||
int i = ii; // reverse direction to get SIMD comms done first
|
||||
@@ -838,6 +881,7 @@ public:
|
||||
u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
|
||||
}
|
||||
PrecomputeByteOffsets();
|
||||
acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry));
|
||||
}
|
||||
|
||||
void Local (int point, int dimension,int shiftpm,int cbmask)
|
||||
@@ -993,10 +1037,10 @@ public:
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
int idx=point+(lo+o+b)*this->_npoints;
|
||||
_entries[idx]._offset =ro+o+b;
|
||||
_entries[idx]._permute=permute;
|
||||
_entries[idx]._is_local=1;
|
||||
_entries[idx]._around_the_world=wrap;
|
||||
this->_entries[idx]._offset =ro+o+b;
|
||||
this->_entries[idx]._permute=permute;
|
||||
this->_entries[idx]._is_local=1;
|
||||
this->_entries[idx]._around_the_world=wrap;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
@@ -1014,10 +1058,10 @@ public:
|
||||
|
||||
if ( ocb&cbmask ) {
|
||||
int idx = point+(lo+o+b)*this->_npoints;
|
||||
_entries[idx]._offset =ro+o+b;
|
||||
_entries[idx]._is_local=1;
|
||||
_entries[idx]._permute=permute;
|
||||
_entries[idx]._around_the_world=wrap;
|
||||
this->_entries[idx]._offset =ro+o+b;
|
||||
this->_entries[idx]._is_local=1;
|
||||
this->_entries[idx]._permute=permute;
|
||||
this->_entries[idx]._around_the_world=wrap;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1041,10 +1085,10 @@ public:
|
||||
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
|
||||
for(int b=0;b<_grid->_slice_block[dimension];b++){
|
||||
int idx=point+(so+o+b)*this->_npoints;
|
||||
_entries[idx]._offset =offset+(bo++);
|
||||
_entries[idx]._is_local=0;
|
||||
_entries[idx]._permute=0;
|
||||
_entries[idx]._around_the_world=wrap;
|
||||
this->_entries[idx]._offset =offset+(bo++);
|
||||
this->_entries[idx]._is_local=0;
|
||||
this->_entries[idx]._permute=0;
|
||||
this->_entries[idx]._around_the_world=wrap;
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
}
|
||||
@@ -1061,10 +1105,10 @@ public:
|
||||
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
|
||||
if ( ocb & cbmask ) {
|
||||
int idx = point+(so+o+b)*this->_npoints;
|
||||
_entries[idx]._offset =offset+(bo++);
|
||||
_entries[idx]._is_local=0;
|
||||
_entries[idx]._permute =0;
|
||||
_entries[idx]._around_the_world=wrap;
|
||||
this->_entries[idx]._offset =offset+(bo++);
|
||||
this->_entries[idx]._is_local=0;
|
||||
this->_entries[idx]._permute =0;
|
||||
this->_entries[idx]._around_the_world=wrap;
|
||||
}
|
||||
}
|
||||
o +=_grid->_slice_stride[dimension];
|
||||
|
||||
@@ -202,13 +202,13 @@ void acceleratorInit(void)
|
||||
|
||||
#ifdef GRID_SYCL
|
||||
|
||||
cl::sycl::queue *theGridAccelerator;
|
||||
cl::sycl::queue *theCopyAccelerator;
|
||||
sycl::queue *theGridAccelerator;
|
||||
sycl::queue *theCopyAccelerator;
|
||||
void acceleratorInit(void)
|
||||
{
|
||||
int nDevices = 1;
|
||||
// cl::sycl::gpu_selector selector;
|
||||
// cl::sycl::device selectedDevice { selector };
|
||||
// sycl::gpu_selector selector;
|
||||
// sycl::device selectedDevice { selector };
|
||||
theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
|
||||
theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
|
||||
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrently anyway.
|
||||
@@ -242,14 +242,14 @@ void acceleratorInit(void)
|
||||
gethostname(hostname, HOST_NAME_MAX+1);
|
||||
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
|
||||
|
||||
auto devices = cl::sycl::device::get_devices();
|
||||
auto devices = sycl::device::get_devices();
|
||||
for(int d = 0;d<devices.size();d++){
|
||||
|
||||
#define GPU_PROP_STR(prop) \
|
||||
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
|
||||
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());
|
||||
|
||||
#define GPU_PROP_FMT(prop,FMT) \
|
||||
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
|
||||
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());
|
||||
|
||||
#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
|
||||
if ( world_rank == 0) {
|
||||
|
||||
@@ -132,27 +132,17 @@ inline void cuda_mem(void)
|
||||
|
||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
{ \
|
||||
int nt=acceleratorThreads(); \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator \
|
||||
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||
}
|
||||
#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
{ \
|
||||
int nt=acceleratorThreads(); \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator \
|
||||
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||
if ( num1*num2 ) { \
|
||||
int nt=acceleratorThreads(); \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator \
|
||||
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define accelerator_for6dNB(iter1, num1, \
|
||||
@@ -175,19 +165,6 @@ inline void cuda_mem(void)
|
||||
}
|
||||
|
||||
|
||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
{ \
|
||||
int nt=acceleratorThreads(); \
|
||||
typedef uint64_t Iterator; \
|
||||
auto lambda = [=] accelerator \
|
||||
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
|
||||
__VA_ARGS__; \
|
||||
}; \
|
||||
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
|
||||
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
|
||||
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
|
||||
}
|
||||
|
||||
template<typename lambda> __global__
|
||||
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
||||
{
|
||||
@@ -199,17 +176,6 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
||||
Lambda(x,y,z);
|
||||
}
|
||||
}
|
||||
template<typename lambda> __global__
|
||||
void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
|
||||
{
|
||||
// Weird permute is to make lane coalesce for large blocks
|
||||
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
|
||||
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
|
||||
uint64_t z = threadIdx.x;
|
||||
if ( (x < num1) && (y<num2) && (z<num3) ) {
|
||||
Lambda(x,y,z);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename lambda> __global__
|
||||
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
|
||||
@@ -243,6 +209,17 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
|
||||
} \
|
||||
}
|
||||
|
||||
inline void *acceleratorAllocHost(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
auto err = cudaMallocHost((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
assert(0);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
inline void *acceleratorAllocShared(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
@@ -264,18 +241,34 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
|
||||
typedef int acceleratorEvent_t;
|
||||
|
||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
|
||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToHost, stream);}
|
||||
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
// Host -> device copy exposed through the event-returning "Asynch" interface.
// On this target the copy is performed with the blocking acceleratorCopyToDevice,
// so the returned event is a dummy (0). The `stream` argument is accepted for
// interface compatibility but is not used.
//   from  : host source pointer
//   to    : device destination pointer
//   bytes : number of bytes to copy
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
  // acceleratorCopyToDevice takes (from,to,bytes); the copy direction is fixed inside it
  acceleratorCopyToDevice(from,to,bytes);
  return 0;
}
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
||||
acceleratorCopyFromDevice(from,to,bytes);
|
||||
return 0;
|
||||
}
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
|
||||
return 0;
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
|
||||
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
||||
{
|
||||
//auto discard=cudaStreamSynchronize(ev);
|
||||
}
|
||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
|
||||
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr)
|
||||
@@ -302,7 +295,7 @@ NAMESPACE_END(Grid);
|
||||
|
||||
// Force deterministic reductions
|
||||
#define SYCL_REDUCTION_DETERMINISTIC
|
||||
#include <sycl/CL/sycl.hpp>
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/usm.hpp>
|
||||
#include <level_zero/ze_api.h>
|
||||
#include <sycl/ext/oneapi/backend/level_zero.hpp>
|
||||
@@ -314,8 +307,8 @@ inline void acceleratorMem(void)
|
||||
std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
|
||||
}
|
||||
|
||||
extern cl::sycl::queue *theGridAccelerator;
|
||||
extern cl::sycl::queue *theCopyAccelerator;
|
||||
extern sycl::queue *theGridAccelerator;
|
||||
extern sycl::queue *theCopyAccelerator;
|
||||
|
||||
#ifdef __SYCL_DEVICE_ONLY__
|
||||
#define GRID_SIMT
|
||||
@@ -326,24 +319,24 @@ extern cl::sycl::queue *theCopyAccelerator;
|
||||
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#ifdef GRID_SIMT
|
||||
return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];
|
||||
return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2];
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
} // SYCL specific
|
||||
|
||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) { \
|
||||
unsigned long nt=acceleratorThreads(); \
|
||||
if(nt < 8)nt=8; \
|
||||
unsigned long unum1 = num1; \
|
||||
unsigned long unum2 = num2; \
|
||||
unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
|
||||
cl::sycl::range<3> local {nt,1,nsimd}; \
|
||||
cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
|
||||
sycl::range<3> local {nt,1,nsimd}; \
|
||||
sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
|
||||
cgh.parallel_for( \
|
||||
cl::sycl::nd_range<3>(global,local), \
|
||||
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
|
||||
sycl::nd_range<3>(global,local), \
|
||||
[=] (sycl::nd_item<3> item) /*mutable*/ \
|
||||
[[intel::reqd_sub_group_size(16)]] \
|
||||
{ \
|
||||
auto iter1 = item.get_global_id(0); \
|
||||
@@ -356,26 +349,50 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
|
||||
|
||||
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
|
||||
inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);};
|
||||
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
|
||||
inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
|
||||
|
||||
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
|
||||
|
||||
///////
|
||||
// Asynch event interface
|
||||
///////
|
||||
typedef sycl::event acceleratorEvent_t;
|
||||
|
||||
// Block the calling host thread until the given SYCL event has completed.
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
  ev.wait();
}
|
||||
|
||||
// Non-blocking completion query for a SYCL event.
// Returns 1 when the event's execution status is `complete`, 0 otherwise.
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
{
  const auto status = ev.get_info<sycl::info::event::command_execution_status>();
  return (status == sycl::info::event_command_status::complete);
}
|
||||
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
|
||||
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr)
|
||||
{
|
||||
#if 0
|
||||
auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
|
||||
if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
|
||||
auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
|
||||
if ( uvm = sycl::usm::alloc::shared ) return 1;
|
||||
else return 0;
|
||||
#endif
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
//////////////////////////////////////////////
|
||||
@@ -472,6 +489,16 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
|
||||
} \
|
||||
}
|
||||
|
||||
inline void *acceleratorAllocHost(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
auto err = hipHostMalloc((void **)&ptr,bytes);
|
||||
if( err != hipSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
inline void *acceleratorAllocShared(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
@@ -495,37 +522,53 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
return ptr;
|
||||
};
|
||||
|
||||
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
|
||||
//inline void acceleratorCopySynchronise(void) { }
|
||||
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
||||
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
typedef int acceleratorEvent_t;
|
||||
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
|
||||
{
|
||||
auto discard=hipMemcpyDtoDAsync(to,from,bytes, copyStream);
|
||||
return 0;
|
||||
}
|
||||
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyHostToDevice, stream);
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
acceleratorCopyToDevice(from,to,bytes);
|
||||
return 0;
|
||||
}
|
||||
inline void acceleratorCopyFromDeviceAsync(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
auto r = hipMemcpyAsync(to,from,bytes, hipMemcpyDeviceToHost, stream);
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
|
||||
acceleratorCopyFromDevice(from,to,bytes);
|
||||
return 0;
|
||||
}
|
||||
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
|
||||
|
||||
inline void acceleratorEventWait(acceleratorEvent_t ev)
|
||||
{
|
||||
// auto discard=hipStreamSynchronize(ev);
|
||||
}
|
||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev) ; return 1;}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
// Prepare a host buffer for device copies.
// On SYCL builds this forwards to the oneAPI experimental
// prepare_for_device_copy call using the copy queue's context;
// on every other target it is a no-op.
//   ptr   : start of the host buffer
//   bytes : length of the buffer in bytes
inline void acceleratorPin(void *ptr,unsigned long bytes)
{
#ifdef GRID_SYCL
  auto copyContext = theCopyAccelerator->get_context();
  sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,copyContext);
#endif
}
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Common on all GPU targets
|
||||
//////////////////////////////////////////////
|
||||
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
|
||||
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
|
||||
#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
|
||||
prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
|
||||
accelerator_barrier(dummy);
|
||||
|
||||
#define accelerator_for( iter, num, nsimd, ... ) \
|
||||
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
|
||||
@@ -547,6 +590,8 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
|
||||
|
||||
#undef GRID_SIMT
|
||||
|
||||
typedef int acceleratorEvent_t;
|
||||
|
||||
inline void acceleratorMem(void)
|
||||
{
|
||||
/*
|
||||
@@ -567,15 +612,22 @@ inline void acceleratorMem(void)
|
||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
|
||||
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyToDevice(from,to,bytes); return 0; }
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
|
||||
inline void acceleratorEventWait(acceleratorEvent_t ev){}
|
||||
inline int acceleratorEventIsComplete(acceleratorEvent_t ev){ acceleratorEventWait(ev); return 1;}
|
||||
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}
|
||||
|
||||
inline void acceleratorCopySynchronise(void) {};
|
||||
|
||||
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
|
||||
#ifdef HAVE_MM_MALLOC_H
|
||||
inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
|
||||
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
|
||||
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
|
||||
inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
|
||||
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
|
||||
#else
|
||||
@@ -655,9 +707,9 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
|
||||
acceleratorCopySynchronise();
|
||||
}
|
||||
|
||||
template<class T> void acceleratorPut(T& dev,T&host)
|
||||
template<class T> void acceleratorPut(T& dev,const T&host)
|
||||
{
|
||||
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
||||
acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
|
||||
}
|
||||
template<class T> T acceleratorGet(T& dev)
|
||||
{
|
||||
|
||||
@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x,y) ((x)>(y)?(y):(x))
|
||||
#endif
|
||||
|
||||
|
||||
// Introduce a class to gain deterministic bit reproducible reduction.
|
||||
// make static; perhaps just a namespace is required.
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
@@ -73,9 +73,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#define thread_critical DO_PRAGMA(omp critical)
|
||||
|
||||
#ifdef GRID_OMP
|
||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
||||
{
|
||||
uint64_t *ufrom = (uint64_t *)from;
|
||||
const uint64_t *ufrom = (const uint64_t *)from;
|
||||
uint64_t *uto = (uint64_t *)to;
|
||||
assert(bytes%8==0);
|
||||
uint64_t words=bytes/8;
|
||||
@@ -84,7 +84,7 @@ inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
});
|
||||
}
|
||||
#else
|
||||
inline void thread_bcopy(void *from, void *to,size_t bytes)
|
||||
inline void thread_bcopy(const void *from, void *to,size_t bytes)
|
||||
{
|
||||
bcopy(from,to,bytes);
|
||||
}
|
||||
|
||||
@@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
|
||||
int FlightRecorder::LoggingMode;
|
||||
int FlightRecorder::ChecksumComms;
|
||||
int FlightRecorder::ChecksumCommsSend;
|
||||
const char * FlightRecorder::StepName;
|
||||
int32_t FlightRecorder::StepLoggingCounter;
|
||||
int32_t FlightRecorder::XmitLoggingCounter;
|
||||
int32_t FlightRecorder::RecvLoggingCounter;
|
||||
int32_t FlightRecorder::CsumLoggingCounter;
|
||||
@@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
|
||||
CsumLoggingCounter=0;
|
||||
NormLoggingCounter=0;
|
||||
ReductionLoggingCounter=0;
|
||||
StepName = "No steps started";
|
||||
StepLoggingCounter=0;
|
||||
}
|
||||
void FlightRecorder::Truncate(void)
|
||||
{
|
||||
@@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
bool FlightRecorder::StepLog(const char *name)
|
||||
{
|
||||
StepName = name;
|
||||
StepLoggingCounter ++;
|
||||
return true;
|
||||
}
|
||||
|
||||
void FlightRecorder::SetLoggingModePrint(void)
|
||||
{
|
||||
@@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void)
|
||||
{
|
||||
return ErrorCounter;
|
||||
}
|
||||
void FlightRecorder::NormLog(double value)
|
||||
bool FlightRecorder::NormLog(double value)
|
||||
{
|
||||
uint64_t hex = * ( (uint64_t *)&value );
|
||||
if(LoggingMode == LoggingModePrint) {
|
||||
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||
NormLoggingCounter++;
|
||||
return true;
|
||||
}
|
||||
if(LoggingMode == LoggingModeRecord) {
|
||||
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||
NormLogVector.push_back(value);
|
||||
NormLoggingCounter++;
|
||||
return true;
|
||||
}
|
||||
if(LoggingMode == LoggingModeVerify) {
|
||||
|
||||
@@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value)
|
||||
|
||||
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
|
||||
|
||||
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||
FlightRecorder::StepLoggingCounter,
|
||||
FlightRecorder::StepName);
|
||||
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
|
||||
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
|
||||
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
|
||||
@@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value)
|
||||
NormLoggingCounter,NormLogVector.size(),
|
||||
value, NormLogVector[NormLoggingCounter]); fflush(stderr);
|
||||
|
||||
if(!ContinueOnFail)assert(0); // Force takedown of job
|
||||
BACKTRACEFP(stderr);
|
||||
|
||||
if(!ContinueOnFail) return false;
|
||||
|
||||
ErrorCounter++;
|
||||
} else {
|
||||
@@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value)
|
||||
}
|
||||
NormLoggingCounter++;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
void FlightRecorder::CsumLog(uint64_t hex)
|
||||
bool FlightRecorder::CsumLog(uint64_t hex)
|
||||
{
|
||||
if(LoggingMode == LoggingModePrint) {
|
||||
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||
CsumLoggingCounter++;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Record mode: report the checksum and append it to CsumLogVector.
if(LoggingMode == LoggingModeRecord) {
  // Print the checksum counter (as the Print-mode message does), not the norm counter
  std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
  CsumLogVector.push_back(hex);
  CsumLoggingCounter++;
  return true;
}
|
||||
|
||||
if(LoggingMode == LoggingModeVerify) {
|
||||
@@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
|
||||
|
||||
if ( hex != hexref ) {
|
||||
|
||||
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||
FlightRecorder::StepLoggingCounter,
|
||||
FlightRecorder::StepName);
|
||||
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
|
||||
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
|
||||
|
||||
@@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
|
||||
GridHostname(),
|
||||
GlobalSharedMemory::WorldShmRank,
|
||||
CsumLoggingCounter,hex, hexref);
|
||||
BACKTRACEFP(stderr);
|
||||
fflush(stderr);
|
||||
|
||||
if(!ContinueOnFail) assert(0); // Force takedown of job
|
||||
if(!ContinueOnFail) return false;
|
||||
|
||||
ErrorCounter++;
|
||||
|
||||
@@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
|
||||
}
|
||||
CsumLoggingCounter++;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void FlightRecorder::ReductionLog(double local,double global)
|
||||
{
|
||||
uint64_t hex_l = * ( (uint64_t *)&local );
|
||||
@@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global)
|
||||
if(LoggingMode == LoggingModeVerify) {
|
||||
if(ReductionLoggingCounter < ReductionLogVector.size()){
|
||||
if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
|
||||
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||
FlightRecorder::StepLoggingCounter,
|
||||
FlightRecorder::StepName);
|
||||
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
|
||||
GridHostname(),
|
||||
GlobalSharedMemory::WorldShmRank,
|
||||
ReductionLoggingCounter,ReductionLogVector.size(),
|
||||
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
|
||||
BACKTRACEFP(stderr);
|
||||
|
||||
if ( !ContinueOnFail ) assert(0);
|
||||
|
||||
@@ -250,10 +280,11 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
|
||||
if(LoggingMode == LoggingModeNone) return;
|
||||
|
||||
if ( ChecksumCommsSend ){
|
||||
uint64_t *ubuf = (uint64_t *)buf;
|
||||
if(LoggingMode == LoggingModeNone) return;
|
||||
|
||||
if(LoggingMode == LoggingModeNone) return;
|
||||
|
||||
#ifdef GRID_SYCL
|
||||
uint64_t *ubuf = (uint64_t *)buf;
|
||||
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
|
||||
if(LoggingMode == LoggingModePrint) {
|
||||
std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
|
||||
@@ -267,11 +298,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
|
||||
if(LoggingMode == LoggingModeVerify) {
|
||||
if(XmitLoggingCounter < XmitLogVector.size()){
|
||||
if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
|
||||
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||
FlightRecorder::StepLoggingCounter,
|
||||
FlightRecorder::StepName);
|
||||
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
|
||||
GridHostname(),
|
||||
GlobalSharedMemory::WorldShmRank,
|
||||
XmitLoggingCounter,XmitLogVector.size(),
|
||||
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
|
||||
BACKTRACEFP(stderr);
|
||||
|
||||
if ( !ContinueOnFail ) assert(0);
|
||||
|
||||
@@ -293,9 +328,9 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
|
||||
void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
|
||||
{
|
||||
if ( ChecksumComms ){
|
||||
uint64_t *ubuf = (uint64_t *)buf;
|
||||
if(LoggingMode == LoggingModeNone) return;
|
||||
#ifdef GRID_SYCL
|
||||
uint64_t *ubuf = (uint64_t *)buf;
|
||||
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
|
||||
if(LoggingMode == LoggingModePrint) {
|
||||
std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
|
||||
@@ -309,11 +344,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
|
||||
if(LoggingMode == LoggingModeVerify) {
|
||||
if(RecvLoggingCounter < RecvLogVector.size()){
|
||||
if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
|
||||
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||
FlightRecorder::StepLoggingCounter,
|
||||
FlightRecorder::StepName);
|
||||
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
|
||||
GridHostname(),
|
||||
GlobalSharedMemory::WorldShmRank,
|
||||
RecvLoggingCounter,RecvLogVector.size(),
|
||||
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
|
||||
BACKTRACEFP(stderr);
|
||||
|
||||
if ( !ContinueOnFail ) assert(0);
|
||||
|
||||
|
||||
@@ -12,6 +12,8 @@ class FlightRecorder {
|
||||
|
||||
static int LoggingMode;
|
||||
static uint64_t ErrorCounter;
|
||||
static const char * StepName;
|
||||
static int32_t StepLoggingCounter;
|
||||
static int32_t XmitLoggingCounter;
|
||||
static int32_t RecvLoggingCounter;
|
||||
static int32_t CsumLoggingCounter;
|
||||
@@ -30,8 +32,9 @@ class FlightRecorder {
|
||||
static void SetLoggingModeRecord(void);
|
||||
static void SetLoggingModeVerify(void);
|
||||
static void SetLoggingMode(LoggingMode_t mode);
|
||||
static void NormLog(double value);
|
||||
static void CsumLog(uint64_t csum);
|
||||
static bool StepLog(const char *name);
|
||||
static bool NormLog(double value);
|
||||
static bool CsumLog(uint64_t csum);
|
||||
static void ReductionLog(double lcl, double glbl);
|
||||
static void Truncate(void);
|
||||
static void ResetCounters(void);
|
||||
|
||||
@@ -509,7 +509,14 @@ void Grid_init(int *argc,char ***argv)
|
||||
Grid_default_latt,
|
||||
Grid_default_mpi);
|
||||
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--flightrecorder") ){
|
||||
std::cout << GridLogMessage <<" Enabling flight recorder " <<std::endl;
|
||||
FlightRecorder::SetLoggingMode(FlightRecorder::LoggingModeRecord);
|
||||
FlightRecorder::PrintEntireLog = 1;
|
||||
FlightRecorder::ChecksumComms = 1;
|
||||
FlightRecorder::ChecksumCommsSend=1;
|
||||
}
|
||||
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
|
||||
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
|
||||
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
|
||||
@@ -552,6 +559,9 @@ void * Grid_backtrace_buffer[_NBACKTRACE];
|
||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||
fprintf(stderr,"FlightRecorder step %d stage %s \n",
|
||||
FlightRecorder::StepLoggingCounter,
|
||||
FlightRecorder::StepName);
|
||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||
fprintf(stderr," code %d\n",si->si_code);
|
||||
@@ -648,3 +658,4 @@ void Grid_debug_handler_init(void)
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ namespace Grid{
|
||||
int64_t index64;
|
||||
IndexFromCoorReversed(coor,index64,dims);
|
||||
if ( index64>=2*1024*1024*1024LL ){
|
||||
std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
|
||||
// std::cout << " IndexFromCoorReversed " << coor<<" index " << index64<< " dims "<<dims<<std::endl;
|
||||
}
|
||||
assert(index64<2*1024*1024*1024LL);
|
||||
index = (int) index64;
|
||||
|
||||
@@ -25,13 +25,20 @@ directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#if Nc == 3
|
||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||
#endif
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
#if Nc != 3
|
||||
#warning FTHMC2p1f will not work for Nc != 3
|
||||
std::cout << "This program will currently only work for Nc == 3." << std::endl;
|
||||
#else
|
||||
std::cout << std::setprecision(12);
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
@@ -220,7 +227,6 @@ int main(int argc, char **argv)
|
||||
TheHMC.Run(SmearingPolicy); // for smearing
|
||||
|
||||
Grid_finalize();
|
||||
#endif
|
||||
} // main
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -24,14 +24,22 @@ See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#if Nc == 3
|
||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||
#endif
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
#if Nc != 3
|
||||
#warning FTHMC2p1f_3GeV will not work for Nc != 3
|
||||
std::cout << "This program will currently only work for Nc == 3." << std::endl;
|
||||
#else
|
||||
std::cout << std::setprecision(12);
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
@@ -220,6 +228,7 @@ int main(int argc, char **argv)
|
||||
TheHMC.Run(SmearingPolicy); // for smearing
|
||||
|
||||
Grid_finalize();
|
||||
#endif
|
||||
} // main
|
||||
|
||||
|
||||
|
||||
@@ -25,13 +25,20 @@ directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#if Nc == 3
|
||||
#include <Grid/qcd/smearing/GaugeConfigurationMasked.h>
|
||||
#include <Grid/qcd/smearing/JacobianAction.h>
|
||||
#endif
|
||||
|
||||
using namespace Grid;
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
#if Nc != 3
|
||||
#warning HMC2p1f_3GeV will not work for Nc != 3
|
||||
std::cout << "This program will currently only work for Nc == 3." << std::endl;
|
||||
#else
|
||||
std::cout << std::setprecision(12);
|
||||
|
||||
Grid_init(&argc, &argv);
|
||||
@@ -220,6 +227,7 @@ int main(int argc, char **argv)
|
||||
TheHMC.Run(SmearingPolicy); // for smearing
|
||||
|
||||
Grid_finalize();
|
||||
#endif
|
||||
} // main
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# additional include paths necessary to compile the C++ library
|
||||
SUBDIRS = Grid HMC benchmarks tests examples
|
||||
SUBDIRS = Grid benchmarks tests examples HMC
|
||||
|
||||
include $(top_srcdir)/doxygen.inc
|
||||
|
||||
|
||||
@@ -118,7 +118,7 @@ public:
|
||||
fprintf(FP,"Packet bytes, direction, GB/s per node\n");
|
||||
for(int lat=16;lat<=maxlat;lat+=8){
|
||||
// for(int Ls=8;Ls<=8;Ls*=2){
|
||||
{ int Ls=12;
|
||||
{ int Ls=8;
|
||||
|
||||
Coordinate latt_size ({lat*mpi_layout[0],
|
||||
lat*mpi_layout[1],
|
||||
@@ -175,8 +175,8 @@ public:
|
||||
timestat.statistics(t_time);
|
||||
|
||||
dbytes=dbytes*ppn;
|
||||
double xbytes = dbytes*0.5;
|
||||
double bidibytes = dbytes;
|
||||
double xbytes = dbytes;
|
||||
double bidibytes = dbytes*2.0;
|
||||
|
||||
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
|
||||
<< bytes << " \t "
|
||||
@@ -492,17 +492,18 @@ public:
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
uint64_t ncall = 500;
|
||||
|
||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||
uint64_t no = 50;
|
||||
uint64_t ni = 100;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(ncall);
|
||||
for(uint64_t i=0;i<ncall;i++){
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
t0=usecond();
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
@@ -520,11 +521,11 @@ public:
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min;
|
||||
mf_lo = flops/timestat.max;
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean;
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
@@ -535,6 +536,7 @@ public:
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
@@ -654,17 +656,19 @@ public:
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
uint64_t ncall = 500;
|
||||
|
||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||
uint64_t no = 50;
|
||||
uint64_t ni = 100;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(ncall);
|
||||
for(uint64_t i=0;i<ncall;i++){
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
t0=usecond();
|
||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Ds.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
@@ -675,11 +679,11 @@ public:
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min;
|
||||
mf_lo = flops/timestat.max;
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean;
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
@@ -689,6 +693,7 @@ public:
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Deo us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
@@ -792,19 +797,18 @@ public:
|
||||
Dc.M(src,r);
|
||||
}
|
||||
FGrid->Barrier();
|
||||
double t1=usecond();
|
||||
uint64_t ncall = 500;
|
||||
|
||||
FGrid->Broadcast(0,&ncall,sizeof(ncall));
|
||||
uint64_t ni = 100;
|
||||
uint64_t no = 50;
|
||||
|
||||
// std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
|
||||
|
||||
time_statistics timestat;
|
||||
std::vector<double> t_time(ncall);
|
||||
for(uint64_t i=0;i<ncall;i++){
|
||||
t0=usecond();
|
||||
Dc.M(src,r);
|
||||
t1=usecond();
|
||||
std::vector<double> t_time(no);
|
||||
for(uint64_t i=0;i<no;i++){
|
||||
double t0=usecond();
|
||||
for(uint64_t j=0;j<ni;j++){
|
||||
Dc.M(src,r);
|
||||
}
|
||||
double t1=usecond();
|
||||
t_time[i] = t1-t0;
|
||||
}
|
||||
FGrid->Barrier();
|
||||
@@ -814,20 +818,21 @@ public:
|
||||
double mf_hi, mf_lo, mf_err;
|
||||
|
||||
timestat.statistics(t_time);
|
||||
mf_hi = flops/timestat.min;
|
||||
mf_lo = flops/timestat.max;
|
||||
mf_hi = flops/timestat.min*ni;
|
||||
mf_lo = flops/timestat.max*ni;
|
||||
mf_err= flops/timestat.min * timestat.err/timestat.mean;
|
||||
|
||||
mflops = flops/timestat.mean;
|
||||
mflops = flops/timestat.mean*ni;
|
||||
mflops_all.push_back(mflops);
|
||||
if ( mflops_best == 0 ) mflops_best = mflops;
|
||||
if ( mflops_worst== 0 ) mflops_worst= mflops;
|
||||
if ( mflops>mflops_best ) mflops_best = mflops;
|
||||
if ( mflops<mflops_worst) mflops_worst= mflops;
|
||||
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s = "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<" "<<timestat.mean<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per rank "<< mflops/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov mflop/s per node "<< mflops/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"Dclov us per call "<< timestat.mean/ni<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
@@ -868,7 +873,7 @@ int main (int argc, char ** argv)
|
||||
int do_su4=0;
|
||||
int do_memory=1;
|
||||
int do_comms =1;
|
||||
int do_blas =1;
|
||||
int do_blas =0;
|
||||
int do_dslash=1;
|
||||
|
||||
int sel=4;
|
||||
|
||||
31
configure.ac
31
configure.ac
@@ -72,6 +72,7 @@ AC_CHECK_HEADERS(malloc/malloc.h)
|
||||
AC_CHECK_HEADERS(malloc.h)
|
||||
AC_CHECK_HEADERS(endian.h)
|
||||
AC_CHECK_HEADERS(execinfo.h)
|
||||
AC_CHECK_HEADERS(numaif.h)
|
||||
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
|
||||
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
|
||||
|
||||
@@ -128,6 +129,20 @@ case ${ac_LAPACK} in
|
||||
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
|
||||
esac
|
||||
|
||||
############### internal reduction
|
||||
AC_ARG_ENABLE([reduction],
|
||||
[AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
|
||||
[ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
|
||||
|
||||
case ${ac_REDUCTION} in
|
||||
mpi)
|
||||
;;
|
||||
grid)
|
||||
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
|
||||
*)
|
||||
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
|
||||
esac
|
||||
|
||||
############### tracing
|
||||
AC_ARG_ENABLE([tracing],
|
||||
[AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
|
||||
@@ -136,7 +151,7 @@ AC_ARG_ENABLE([tracing],
|
||||
case ${ac_TRACING} in
|
||||
nvtx)
|
||||
AC_DEFINE([GRID_TRACING_NVTX],[1],[use NVTX])
|
||||
LIBS="${LIBS} -lnvToolsExt64_1"
|
||||
LIBS="${LIBS} -lnvToolsExt"
|
||||
;;
|
||||
roctx)
|
||||
AC_DEFINE([GRID_TRACING_ROCTX],[1],[use ROCTX])
|
||||
@@ -226,6 +241,20 @@ case ${ac_SFW_FP16} in
|
||||
esac
|
||||
|
||||
|
||||
############### MPI BOUNCE TO HOST
|
||||
AC_ARG_ENABLE([accelerator-aware-mpi],
|
||||
[AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
|
||||
[ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
|
||||
|
||||
# Force accelerator CSHIFT now
|
||||
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
|
||||
|
||||
case ${ac_ACCELERATOR_AWARE_MPI} in
|
||||
yes)
|
||||
AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
|
||||
*);;
|
||||
esac
|
||||
|
||||
############### SYCL/CUDA/HIP/none
|
||||
AC_ARG_ENABLE([accelerator],
|
||||
[AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
|
||||
|
||||
@@ -93,10 +93,13 @@ int main(int argc, char ** argv)
|
||||
Real coeff = (width*width) / Real(4*Iterations);
|
||||
|
||||
chi=kronecker;
|
||||
|
||||
// chi = (1-p^2/2N)^N kronecker
|
||||
for(int n = 0; n < Iterations; ++n) {
|
||||
Laplacian.M(chi,psi);
|
||||
chi = chi - coeff*psi;
|
||||
RealD n2 = norm2(chi);
|
||||
chi = chi * (1.0/std::sqrt(n2));
|
||||
}
|
||||
|
||||
std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;
|
||||
|
||||
@@ -1,383 +0,0 @@
|
||||
/*
|
||||
* Warning: This code is illustrative only: not well tested, and not meant for production use
|
||||
* without regression / tests being applied
|
||||
*/
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
RealD LLscale =1.0;
|
||||
RealD LCscale =1.0;
|
||||
|
||||
// Covariant Laplacian built from forward and backward covariant shifts.
// Gimpl provides the gauge types and the CovShiftForward/CovShiftBackward
// helpers; Field is the lattice field type the operator is applied to.
template<class Gimpl,class Field>
class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
  INHERIT_GIMPL_TYPES(Gimpl);

  GridBase   *grid; // grid pointer obtained from the gauge field at construction
  GaugeField  U;    // gauge field held by value (copied from the constructor argument)

  CovariantLaplacianCshift(GaugeField &_U)
    : grid(_U.Grid()),
      U(_U)
  {}

  virtual GridBase *Grid(void) { return grid; }

  // out = sum over directions 0 .. Nd-2 of ( 2*in - forward shift - backward shift ).
  virtual void M (const Field &in, Field &out)
  {
    out = Zero();
    for(int dir=0; dir<Nd-1; dir++) {
      // NB: inefficient -- the link field is peeled out of U on every application.
      GaugeLinkField link = PeekIndex<LorentzIndex>(U, dir);
      out = out - Gimpl::CovShiftForward(link,dir,in);
      out = out - Gimpl::CovShiftBackward(link,dir,in);
      out = out + 2.0*in;
    }
  }

  // The Laplacian is hermitian, so the adjoint is the operator itself.
  virtual void Mdag (const Field &in, Field &out)
  {
    M(in,out);
  }

  // The remaining entry points are unimplemented; they are only needed for multigrid.
  virtual void Mdiag (const Field &in, Field &out)
  {
    assert(0);
  }
  virtual void Mdir (const Field &in, Field &out,int dir, int disp)
  {
    assert(0);
  }
  virtual void MdirAll (const Field &in, std::vector<Field> &out)
  {
    assert(0);
  }
};
|
||||
|
||||
// Fills `phase` with exp( i * sum_mu (2 pi / L_mu) * mom[mu] * x_mu ),
// where L_mu is the global lattice extent in direction mu and x_mu is the
// lattice coordinate in that direction.
void MakePhase(Coordinate mom,LatticeComplex &phase)
{
  GridBase *lattice   = phase.Grid();
  auto      global_dims = lattice->GlobalDimensions();
  ComplexD  imag_unit(0.0,1.0);

  // Accumulate the real-valued argument of the exponential first.
  phase = Zero();
  LatticeComplex x_mu(lattice);
  for(int mu=0; mu<Nd; mu++){
    RealD two_pi_over_L = M_PI * 2.0/ global_dims[mu];
    LatticeCoordinate(x_mu,mu);
    phase = phase + (two_pi_over_L * mom[mu]) * x_mu;
  }

  phase = exp(phase*imag_unit);
}
|
||||
// Builds a point source: `source` is zeroed and a unit spin-colour matrix
// is poked in at the single site `coor`.
void PointSource(Coordinate &coor,LatticePropagator &source)
{
  // Callers typically pass the origin, e.g. Coordinate coor({0,0,0,0});
  SpinColourMatrix unit_matrix;
  unit_matrix = 1.0;

  source = Zero();
  pokeSite(unit_matrix,source,coor);
}
|
||||
// Builds a noise wall source restricted to time slice `tslice`:
// bernoulli noise is rescaled, masked to the requested slice and then
// multiplied into a unit propagator. The norm of the result is printed.
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
  GridBase *lattice = source.Grid();

  LatticeComplex noise(lattice);
  LatticeComplex zero_field(lattice);
  zero_field = Zero();
  LatticeInteger tcoor(lattice);

  RealD scale = 1.0/sqrt(2);

  // bernoulli draws 0 or 1 with equal probability (50:50).
  bernoulli(RNG, noise);
  noise = (2.*noise - Complex(1,1))*scale;

  // Keep the noise on the chosen time slice only, zero elsewhere.
  LatticeCoordinate(tcoor,Tdir);
  noise = where(tcoor==Integer(tslice), noise, zero_field);

  source = 1.0;
  source = source*noise;

  std::cout << " Z2 wall " << norm2(source) << std::endl;
}
|
||||
template<class Field>
|
||||
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
|
||||
{
|
||||
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
|
||||
Laplacian_t Laplacian(U);
|
||||
|
||||
Integer Iterations = 40;
|
||||
Real width = 2.0;
|
||||
Real coeff = (width*width) / Real(4*Iterations);
|
||||
|
||||
Field tmp(U.Grid());
|
||||
smeared=unsmeared;
|
||||
// chi = (1-p^2/2N)^N kronecker
|
||||
for(int n = 0; n < Iterations; ++n) {
|
||||
Laplacian.M(smeared,tmp);
|
||||
smeared = smeared - coeff*tmp;
|
||||
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
|
||||
}
|
||||
}
|
||||
// Builds a smeared point source: a point source is placed at `site`, then
// smeared with GaussianSmear using gauge field U. The norm of the source is
// printed before and after smearing.
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
  LatticePropagator unsmeared(source.Grid());

  PointSource(site,source);
  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;

  // Smear from a copy so the input and output fields are distinct objects.
  unsmeared = source;
  GaussianSmear(U,unsmeared,source);
  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
|
||||
// Builds a smeared wall source: a noise wall is generated on time slice
// `tslice` with Z2WallSource and then smeared with GaussianSmear using U.
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
  Z2WallSource(RNG,tslice,source);

  // Smear from a copy so the input and output fields are distinct objects.
  LatticePropagator unsmeared = source;
  GaussianSmear(U,unsmeared,source);
}
|
||||
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
|
||||
{
|
||||
assert(mom.size()==Nd);
|
||||
assert(mom[Tdir] == 0);
|
||||
|
||||
GridBase * grid = spectator.Grid();
|
||||
|
||||
|
||||
LatticeInteger ts(grid);
|
||||
LatticeCoordinate(ts,Tdir);
|
||||
source = Zero();
|
||||
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
|
||||
|
||||
LatticeComplex phase(grid);
|
||||
MakePhase(mom,phase);
|
||||
|
||||
source = source *phase;
|
||||
}
|
||||
template<class Action>
|
||||
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
|
||||
{
|
||||
GridBase *UGrid = D.GaugeGrid();
|
||||
GridBase *FGrid = D.FermionGrid();
|
||||
|
||||
LatticeFermion src4 (UGrid);
|
||||
LatticeFermion src5 (FGrid);
|
||||
LatticeFermion result5(FGrid);
|
||||
LatticeFermion result4(UGrid);
|
||||
LatticePropagator prop5(FGrid);
|
||||
|
||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
|
||||
SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
|
||||
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
|
||||
for(int s=0;s<Nd;s++){
|
||||
for(int c=0;c<Nc;c++){
|
||||
PropToFerm<Action>(src4,source,s,c);
|
||||
|
||||
D.ImportPhysicalFermionSource(src4,src5);
|
||||
|
||||
result5=Zero();
|
||||
schur(D,src5,result5,ZG);
|
||||
std::cout<<GridLogMessage
|
||||
<<"spin "<<s<<" color "<<c
|
||||
<<" norm2(src5d) " <<norm2(src5)
|
||||
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
|
||||
|
||||
D.ExportPhysicalFermionSolution(result5,result4);
|
||||
|
||||
FermToProp<Action>(prop5,result5,s,c);
|
||||
FermToProp<Action>(propagator,result4,s,c);
|
||||
}
|
||||
}
|
||||
LatticePropagator Axial_mu(UGrid);
|
||||
LatticePropagator Vector_mu(UGrid);
|
||||
|
||||
LatticeComplex PA (UGrid);
|
||||
LatticeComplex VV (UGrid);
|
||||
LatticeComplex PJ5q(UGrid);
|
||||
LatticeComplex PP (UGrid);
|
||||
|
||||
std::vector<TComplex> sumPA;
|
||||
std::vector<TComplex> sumVV;
|
||||
std::vector<TComplex> sumPP;
|
||||
std::vector<TComplex> sumPJ5q;
|
||||
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
|
||||
PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current
|
||||
sliceSum(PA,sumPA,Tdir);
|
||||
|
||||
int Nt{static_cast<int>(sumPA.size())};
|
||||
|
||||
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
|
||||
|
||||
PP = trace(adj(propagator)*propagator); // Pseudoscalar density
|
||||
sliceSum(PP,sumPP,Tdir);
|
||||
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
|
||||
|
||||
D.ContractJ5q(prop5,PJ5q);
|
||||
sliceSum(PJ5q,sumPJ5q,Tdir);
|
||||
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
|
||||
|
||||
Gamma::Algebra GammaV[3] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ
|
||||
};
|
||||
for( int mu=0;mu<3;mu++ ) {
|
||||
Gamma gV(GammaV[mu]);
|
||||
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
|
||||
// auto ss=sliceSum(Vector_mu,Tdir);
|
||||
// for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
|
||||
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
|
||||
sliceSum(VV,sumVV,Tdir);
|
||||
for(int t=0;t<Nt;t++){
|
||||
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
|
||||
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
|
||||
<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class MesonFile: Serializable {
|
||||
public:
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
|
||||
};
|
||||
|
||||
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
|
||||
{
|
||||
const int nchannel=3;
|
||||
Gamma::Algebra Gammas[nchannel][2] = {
|
||||
{Gamma::Algebra::GammaX,Gamma::Algebra::GammaX},
|
||||
{Gamma::Algebra::GammaY,Gamma::Algebra::GammaY},
|
||||
{Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ}
|
||||
};
|
||||
|
||||
Gamma G5(Gamma::Algebra::Gamma5);
|
||||
|
||||
LatticeComplex meson_CF(q1.Grid());
|
||||
MesonFile MF;
|
||||
|
||||
for(int ch=0;ch<nchannel;ch++){
|
||||
|
||||
Gamma Gsrc(Gammas[ch][0]);
|
||||
Gamma Gsnk(Gammas[ch][1]);
|
||||
|
||||
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
|
||||
|
||||
std::vector<TComplex> meson_T;
|
||||
sliceSum(meson_CF,meson_T, Tdir);
|
||||
|
||||
int nt=meson_T.size();
|
||||
|
||||
std::vector<Complex> corr(nt);
|
||||
for(int t=0;t<nt;t++){
|
||||
corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
|
||||
std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *real(corr[t])<<std::endl;
|
||||
}
|
||||
MF.data.push_back(corr);
|
||||
}
|
||||
|
||||
{
|
||||
XmlWriter WR(file);
|
||||
write(WR,"MesonFile",MF);
|
||||
}
|
||||
}
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
const int Ls=32;
|
||||
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
// Double precision grids
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||
GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// You can manage seeds however you like.
|
||||
// Recommend SeedUniqueString.
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
std::string config;
|
||||
RealD M5=1.8;
|
||||
if( argc > 1 && argv[1][0] != '-' )
|
||||
{
|
||||
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
|
||||
FieldMetaData header;
|
||||
NerscIO::readConfiguration(Umu, header, argv[1]);
|
||||
config=argv[1];
|
||||
M5=1.8;
|
||||
}
|
||||
else
|
||||
{
|
||||
SU<Nc>::ColdConfiguration(Umu);
|
||||
config="ColdConfig";
|
||||
// RealD P=1.0; // Don't scale
|
||||
RealD P=0.5871119; // 48I
|
||||
// RealD P=0.6153342; // 64I
|
||||
// RealD P=0.6388238 // 32Ifine
|
||||
RealD u0 = sqrt(sqrt(P));
|
||||
RealD M5mf = M5 - 4.0*(1.0-u0);
|
||||
RealD w0 = 1.0 - M5mf;
|
||||
#if 0
|
||||
// M5=1.8 with U=u0
|
||||
Umu = Umu * u0;
|
||||
LLscale = 1.0;
|
||||
LCscale = 1.0;
|
||||
std::cout<<GridLogMessage <<"Gauge links are u=u0= "<<u0<<std::endl;
|
||||
std::cout<<GridLogMessage <<"M5 = "<<M5<<std::endl;
|
||||
#else
|
||||
M5 = M5mf;
|
||||
std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
|
||||
std::cout<<GridLogMessage <<"u0="<<u0<<std::endl;
|
||||
std::cout<<GridLogMessage <<"M5=M5mf = "<<M5<<std::endl;
|
||||
LLscale = 1.0/(1-w0*w0)/(1-w0*w0);
|
||||
LCscale = 1.0/(1-w0*w0)/(1-w0*w0);
|
||||
#endif
|
||||
std::cout<<GridLogMessage <<"LLscale = "<<LLscale<<std::endl;
|
||||
std::cout<<GridLogMessage <<"LCscale = "<<LCscale<<std::endl;
|
||||
}
|
||||
|
||||
std::vector<RealD> masses({ 0.00} ); // u/d, s, c ??
|
||||
|
||||
int nmass = masses.size();
|
||||
|
||||
std::vector<MobiusFermionD *> FermActs;
|
||||
|
||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
||||
|
||||
for(auto mass: masses) {
|
||||
|
||||
RealD b=1.5;// Scale factor b+c=2, b-c=1
|
||||
RealD c=0.5;
|
||||
|
||||
FermActs.push_back(new MobiusFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c));
|
||||
|
||||
}
|
||||
|
||||
LatticePropagator point_source(UGrid);
|
||||
// LatticePropagator wall_source(UGrid);
|
||||
|
||||
Coordinate Origin({0,0,0,0});
|
||||
PointSource (Origin,point_source);
|
||||
// Z2WallSource (RNG4,0,wall_source);
|
||||
|
||||
std::vector<LatticePropagator> PointProps(nmass,UGrid);
|
||||
// std::vector<LatticePropagator> GaussProps(nmass,UGrid);
|
||||
// std::vector<LatticePropagator> Z2Props (nmass,UGrid);
|
||||
|
||||
for(int m=0;m<nmass;m++) {
|
||||
|
||||
Solve(*FermActs[m],point_source ,PointProps[m]);
|
||||
}
|
||||
|
||||
LatticeComplex phase(UGrid);
|
||||
Coordinate mom({0,0,0,0});
|
||||
MakePhase(mom,phase);
|
||||
|
||||
for(int m1=0 ;m1<nmass;m1++) {
|
||||
for(int m2=m1;m2<nmass;m2++) {
|
||||
std::stringstream ssp,ssg,ssz;
|
||||
|
||||
ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
|
||||
ssz<<config<< "_m" << m1 << "_m"<< m2 << "_wall_meson.xml";
|
||||
|
||||
MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
|
||||
// MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
|
||||
}}
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,479 +0,0 @@
|
||||
/*
|
||||
* Warning: This code is illustrative only: not well tested, and not meant for production use
|
||||
* without regression / tests being applied
|
||||
*/
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
RealD LLscale =1.0;
|
||||
RealD LCscale =1.0;
|
||||
|
||||
/**
 * Gauge-covariant Laplacian built from covariant shifts of the gauge links.
 *
 * M() accumulates, for each direction mu = 0 .. Nd-2,
 *     2*in - CovShiftForward(U_mu,mu,in) - CovShiftBackward(U_mu,mu,in).
 * The direction with index Nd-1 is left out of the sum
 * (NOTE(review): presumably the time direction -- confirm).
 */
template<class Gimpl,class Field>
class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
  INHERIT_GIMPL_TYPES(Gimpl);

  GridBase  *grid; // grid of the gauge field handed to the constructor
  GaugeField U;    // copy of that gauge field, held by value

  CovariantLaplacianCshift(GaugeField &_U) : grid(_U.Grid()), U(_U) {}

  virtual GridBase *Grid(void) { return grid; }

  // out = sum over the first Nd-1 directions of the covariant second difference of `in`.
  virtual void M(const Field &in, Field &out)
  {
    const int ndir = Nd-1;
    out = Zero();
    for(int dir=0; dir<ndir; dir++) {
      // NB: inefficient -- the link field is re-extracted from U on every application.
      GaugeLinkField link = PeekIndex<LorentzIndex>(U, dir);
      out = out - Gimpl::CovShiftForward (link,dir,in);
      out = out - Gimpl::CovShiftBackward(link,dir,in);
      out = out + 2.0*in;
    }
  }

  // The Laplacian is hermitian, so Mdag simply forwards to M.
  virtual void Mdag(const Field &in, Field &out) { M(in,out); }

  // The remaining operators are unimplemented (needed only for multigrid) and abort if called.
  virtual void Mdiag  (const Field &in, Field &out)                  { assert(0); }
  virtual void Mdir   (const Field &in, Field &out,int dir, int disp){ assert(0); }
  virtual void MdirAll(const Field &in, std::vector<Field> &out)     { assert(0); }
};
|
||||
|
||||
/**
 * Fill `phase` with exp( i * sum_mu (2*pi/L_mu) * mom[mu] * x_mu ).
 *
 * L_mu is the global lattice extent of phase's grid in direction mu and
 * x_mu is the site coordinate in that direction.
 *
 * @param mom    integer momentum components, indexed 0 .. Nd-1
 * @param phase  output field; its previous contents are overwritten
 */
void MakePhase(Coordinate mom,LatticeComplex &phase)
{
  GridBase *grid = phase.Grid();
  auto extents   = grid->GlobalDimensions();

  LatticeComplex x_mu(grid);

  // First accumulate the real-valued exponent p.x ...
  phase = Zero();
  for(int mu=0; mu<Nd; mu++){
    LatticeCoordinate(x_mu,mu);
    RealD unit = M_PI * 2.0/ extents[mu];   // 2*pi / L_mu
    phase = phase + (unit * mom[mu]) * x_mu;
  }

  // ... then exponentiate i * p.x.
  ComplexD imag_unit(0.0,1.0);
  phase = exp(phase*imag_unit);
}
|
||||
|
||||
/**
 * Build a point source: `source` is zeroed everywhere and a SpinColourMatrix
 * that was assigned the value 1.0 is poked onto the single site `coor`.
 */
void PointSource(Coordinate &coor,LatticePropagator &source)
{
  SpinColourMatrix unit;
  unit = 1.0;

  source = Zero();
  pokeSite(unit,source,coor);
}
|
||||
/**
 * Build a noise wall source on the time slice `tslice`.
 *
 * A bernoulli draw (0 or 1, 50:50) is mapped via (2*noise - (1+i))/sqrt(2),
 * masked to zero on every site whose Tdir coordinate differs from `tslice`,
 * and multiplied into a propagator that was set to 1.0.
 * The norm of the resulting source is printed to stdout.
 */
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
  GridBase *grid = source.Grid();

  LatticeComplex noise(grid);
  LatticeComplex nothing(grid);
  nothing = Zero();
  LatticeInteger tcoor(grid);

  RealD rescale = 1.0/sqrt(2);
  bernoulli(RNG, noise);                        // 0,1 50:50
  noise = (2.*noise - Complex(1,1))*rescale;

  // Keep the noise on the requested time slice only.
  LatticeCoordinate(tcoor,Tdir);
  noise = where(tcoor==Integer(tslice), noise, nothing);

  source = 1.0;
  source = source*noise;
  std::cout << " Z2 wall " << norm2(source) << std::endl;
}
|
||||
template<class Field>
|
||||
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
|
||||
{
|
||||
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
|
||||
Laplacian_t Laplacian(U);
|
||||
|
||||
Integer Iterations = 40;
|
||||
Real width = 2.0;
|
||||
Real coeff = (width*width) / Real(4*Iterations);
|
||||
|
||||
Field tmp(U.Grid());
|
||||
smeared=unsmeared;
|
||||
// chi = (1-p^2/2N)^N kronecker
|
||||
for(int n = 0; n < Iterations; ++n) {
|
||||
Laplacian.M(smeared,tmp);
|
||||
smeared = smeared - coeff*tmp;
|
||||
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
|
||||
}
|
||||
}
|
||||
/**
 * Build a smeared point source: a point source is placed at `site`, copied,
 * and the copy is passed through GaussianSmear with the result written back
 * into `source`. The norm before and after smearing is printed.
 */
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
  PointSource(site,source);
  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;

  // GaussianSmear reads one field and writes another, so smear from a copy.
  LatticePropagator unsmeared(source.Grid());
  unsmeared = source;
  GaussianSmear(U,unsmeared,source);
  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
|
||||
/**
 * Build a smeared wall source: a Z2 wall source on time slice `tslice`
 * is generated, copied, and the copy is smeared back into `source`.
 */
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
  Z2WallSource(RNG,tslice,source);

  LatticePropagator unsmeared = source;
  GaussianSmear(U,unsmeared,source);
}
|
||||
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
|
||||
{
|
||||
assert(mom.size()==Nd);
|
||||
assert(mom[Tdir] == 0);
|
||||
|
||||
GridBase * grid = spectator.Grid();
|
||||
|
||||
|
||||
LatticeInteger ts(grid);
|
||||
LatticeCoordinate(ts,Tdir);
|
||||
source = Zero();
|
||||
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
|
||||
|
||||
LatticeComplex phase(grid);
|
||||
MakePhase(mom,phase);
|
||||
|
||||
source = source *phase;
|
||||
}
|
||||
|
||||
template<class Action>
|
||||
void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
|
||||
{
|
||||
GridBase *UGrid = source.Grid();
|
||||
GridBase *FGrid = D.FermionGrid();
|
||||
bool fiveD = true; //calculate 5d free propagator
|
||||
RealD mass = D.Mass();
|
||||
LatticeFermion src4 (UGrid);
|
||||
LatticeFermion result4 (UGrid);
|
||||
LatticeFermion result5(FGrid);
|
||||
LatticeFermion src5(FGrid);
|
||||
LatticePropagator prop5(FGrid);
|
||||
for(int s=0;s<Nd;s++){
|
||||
for(int c=0;c<Nc;c++){
|
||||
|
||||
PropToFerm<Action>(src4,source,s,c);
|
||||
|
||||
D.ImportPhysicalFermionSource(src4,src5);
|
||||
D.FreePropagator(src5,result5,mass,true);
|
||||
std::cout<<GridLogMessage
|
||||
<<"Free 5D prop spin "<<s<<" color "<<c
|
||||
<<" norm2(src5d) " <<norm2(src5)
|
||||
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
|
||||
|
||||
D.ExportPhysicalFermionSolution(result5,result4);
|
||||
|
||||
FermToProp<Action>(prop5,result5,s,c);
|
||||
FermToProp<Action>(propagator,result4,s,c);
|
||||
}
|
||||
}
|
||||
|
||||
LatticePropagator Vector_mu(UGrid);
|
||||
LatticeComplex VV (UGrid);
|
||||
std::vector<TComplex> sumVV;
|
||||
Gamma::Algebra GammaV[3] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ
|
||||
};
|
||||
for( int mu=0;mu<3;mu++ ) {
|
||||
Gamma gV(GammaV[mu]);
|
||||
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
|
||||
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
|
||||
sliceSum(VV,sumVV,Tdir);
|
||||
int Nt = sumVV.size();
|
||||
for(int t=0;t<Nt;t++){
|
||||
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
|
||||
RealD Cont=0;
|
||||
if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
|
||||
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
|
||||
<< " 2 pi^2 t^3 C(t) "<< Ct/Cont << " delta Ct "<< Ct-Cont <<std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
template<class Action>
|
||||
void MasslessFreePropagator1(Action &D,LatticePropagator &source,LatticePropagator &propagator)
|
||||
{
|
||||
bool fiveD = false; //calculate 4d free propagator
|
||||
RealD mass = D.Mass();
|
||||
GridBase *UGrid = source.Grid();
|
||||
LatticeFermion src4 (UGrid);
|
||||
LatticeFermion result4 (UGrid);
|
||||
for(int s=0;s<Nd;s++){
|
||||
for(int c=0;c<Nc;c++){
|
||||
PropToFerm<Action>(src4,source,s,c);
|
||||
D.FreePropagator(src4,result4,mass,false);
|
||||
FermToProp<Action>(propagator,result4,s,c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Action>
|
||||
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
|
||||
{
|
||||
GridBase *UGrid = D.GaugeGrid();
|
||||
GridBase *FGrid = D.FermionGrid();
|
||||
|
||||
LatticeFermion src4 (UGrid);
|
||||
LatticeFermion src5 (FGrid);
|
||||
LatticeFermion result5(FGrid);
|
||||
LatticeFermion result4(UGrid);
|
||||
LatticePropagator prop5(FGrid);
|
||||
|
||||
ConjugateGradient<LatticeFermion> CG(1.0e-10,100000);
|
||||
SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
|
||||
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
|
||||
for(int s=0;s<Nd;s++){
|
||||
for(int c=0;c<Nc;c++){
|
||||
PropToFerm<Action>(src4,source,s,c);
|
||||
|
||||
D.ImportPhysicalFermionSource(src4,src5);
|
||||
|
||||
result5=Zero();
|
||||
schur(D,src5,result5,ZG);
|
||||
std::cout<<GridLogMessage
|
||||
<<"spin "<<s<<" color "<<c
|
||||
<<" norm2(src5d) " <<norm2(src5)
|
||||
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
|
||||
|
||||
D.ExportPhysicalFermionSolution(result5,result4);
|
||||
|
||||
FermToProp<Action>(prop5,result5,s,c);
|
||||
FermToProp<Action>(propagator,result4,s,c);
|
||||
}
|
||||
}
|
||||
LatticePropagator Axial_mu(UGrid);
|
||||
LatticePropagator Vector_mu(UGrid);
|
||||
|
||||
LatticeComplex PA (UGrid);
|
||||
LatticeComplex VV (UGrid);
|
||||
LatticeComplex PJ5q(UGrid);
|
||||
LatticeComplex PP (UGrid);
|
||||
|
||||
std::vector<TComplex> sumPA;
|
||||
std::vector<TComplex> sumVV;
|
||||
std::vector<TComplex> sumPP;
|
||||
std::vector<TComplex> sumPJ5q;
|
||||
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
|
||||
PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current
|
||||
sliceSum(PA,sumPA,Tdir);
|
||||
|
||||
int Nt{static_cast<int>(sumPA.size())};
|
||||
|
||||
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
|
||||
|
||||
PP = trace(adj(propagator)*propagator); // Pseudoscalar density
|
||||
sliceSum(PP,sumPP,Tdir);
|
||||
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
|
||||
|
||||
D.ContractJ5q(prop5,PJ5q);
|
||||
sliceSum(PJ5q,sumPJ5q,Tdir);
|
||||
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
|
||||
|
||||
Gamma::Algebra GammaV[3] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ
|
||||
};
|
||||
for( int mu=0;mu<3;mu++ ) {
|
||||
Gamma gV(GammaV[mu]);
|
||||
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
|
||||
// auto ss=sliceSum(Vector_mu,Tdir);
|
||||
// for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
|
||||
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
|
||||
sliceSum(VV,sumVV,Tdir);
|
||||
for(int t=0;t<Nt;t++){
|
||||
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
|
||||
RealD Cont=0;
|
||||
if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
|
||||
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
|
||||
<< " 2 pi^2 t^3 C(t) "<< Ct/Cont << " delta Ct "<< Ct-Cont <<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class MesonFile: Serializable {
|
||||
public:
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
|
||||
};
|
||||
|
||||
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
|
||||
{
|
||||
const int nchannel=4;
|
||||
Gamma::Algebra Gammas[nchannel][2] = {
|
||||
{Gamma::Algebra::GammaXGamma5,Gamma::Algebra::GammaXGamma5},
|
||||
{Gamma::Algebra::GammaYGamma5,Gamma::Algebra::GammaYGamma5},
|
||||
{Gamma::Algebra::GammaZGamma5,Gamma::Algebra::GammaZGamma5},
|
||||
{Gamma::Algebra::Identity,Gamma::Algebra::Identity}
|
||||
};
|
||||
|
||||
LatticeComplex meson_CF(q1.Grid());
|
||||
MesonFile MF;
|
||||
|
||||
for(int ch=0;ch<nchannel;ch++){
|
||||
|
||||
Gamma Gsrc(Gammas[ch][0]);
|
||||
Gamma Gsnk(Gammas[ch][1]);
|
||||
|
||||
meson_CF = trace(adj(q1)*Gsnk*q2*adj(Gsrc));
|
||||
|
||||
std::vector<TComplex> meson_T;
|
||||
sliceSum(meson_CF,meson_T, Tdir);
|
||||
|
||||
int nt=meson_T.size();
|
||||
|
||||
std::vector<Complex> corr(nt);
|
||||
for(int t=0;t<nt;t++){
|
||||
corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
|
||||
RealD Ct = real(corr[t]);
|
||||
RealD Cont=0;
|
||||
if(t) Cont=1.0/(2 * M_PI *M_PI * t*t*t);
|
||||
std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t * Ct
|
||||
<< " deltaC " <<Ct-Cont<<std::endl;
|
||||
}
|
||||
MF.data.push_back(corr);
|
||||
}
|
||||
|
||||
{
|
||||
XmlWriter WR(file);
|
||||
write(WR,"MesonFile",MF);
|
||||
}
|
||||
}
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
const int Ls=10;
|
||||
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
// Double precision grids
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||
GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// You can manage seeds however you like.
|
||||
// Recommend SeedUniqueString.
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// std::vector<int> seeds4({1,2,3,4});
|
||||
// GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
std::string config;
|
||||
RealD M5=atof(getenv("M5"));
|
||||
RealD mq = atof(getenv("mass"));
|
||||
int tadpole = atof(getenv("tadpole"));
|
||||
std::vector<RealD> masses({ mq} ); // u/d, s, c ??
|
||||
if( argc > 1 && argv[1][0] != '-' )
|
||||
{
|
||||
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
|
||||
FieldMetaData header;
|
||||
NerscIO::readConfiguration(Umu, header, argv[1]);
|
||||
config=argv[1];
|
||||
LLscale = 1.0;
|
||||
LCscale = 1.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
SU<Nc>::ColdConfiguration(Umu);
|
||||
config="ColdConfig";
|
||||
// RealD P=1.0; // Don't scale
|
||||
// RealD P=0.6388238 // 32Ifine
|
||||
// RealD P=0.6153342; // 64I
|
||||
RealD P=0.5871119; // 48I
|
||||
RealD u0 = sqrt(sqrt(P));
|
||||
RealD w0 = 1 - M5;
|
||||
std::cout<<GridLogMessage <<"For plaquette P="<<P<<" u0= "<<u0<<std::endl;
|
||||
if ( tadpole == 1 ) {
|
||||
Umu = Umu * u0;
|
||||
// LLscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
|
||||
// LCscale = 1.0/(1-w0*w0)/(1-w0*w0)/u0/u0;
|
||||
LLscale = 1.0;
|
||||
LCscale = 1.0;
|
||||
std::cout<<GridLogMessage <<"Gauge links are u= u0 "<<std::endl;
|
||||
std::cout<<GridLogMessage <<"M5 = "<<M5<<std::endl;
|
||||
} else if ( tadpole == 2) {
|
||||
std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
|
||||
LLscale = 1.0;
|
||||
LCscale = 1.0;
|
||||
std::cout<<GridLogMessage <<"M5 = "<<M5<<std::endl;
|
||||
} else {
|
||||
LLscale = 1.0/u0/u0;
|
||||
LCscale = 1.0/u0/u0;
|
||||
M5 = M5 - 4.0 * (1-u0);
|
||||
std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
|
||||
std::cout<<GridLogMessage <<"M5mf = "<<M5<<std::endl;
|
||||
}
|
||||
std::cout<<GridLogMessage <<"mq = "<<mq<<std::endl;
|
||||
std::cout<<GridLogMessage <<"LLscale = "<<LLscale<<std::endl;
|
||||
std::cout<<GridLogMessage <<"LCscale = "<<LCscale<<std::endl;
|
||||
}
|
||||
|
||||
int nmass = masses.size();
|
||||
|
||||
typedef DomainWallFermionD FermionActionD;
|
||||
// typedef MobiusFermionD FermionActionD;
|
||||
std::vector<FermionActionD *> FermActs;
|
||||
std::vector<DomainWallFermionD *> DWFActs;
|
||||
|
||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
||||
|
||||
for(auto mass: masses) {
|
||||
std::vector<Complex> boundary = {1,1,1,-1};
|
||||
FermionActionD::ImplParams Params(boundary);
|
||||
RealD b=1.5;
|
||||
RealD c=0.5;
|
||||
std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl;
|
||||
// DWFActs.push_back(new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5));
|
||||
FermActs.push_back(new FermionActionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,Params));
|
||||
// FermActs.push_back(new FermionActionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass+0.001,M5,b,c));
|
||||
std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl;
|
||||
}
|
||||
|
||||
LatticePropagator point_source(UGrid);
|
||||
|
||||
Coordinate Origin({0,0,0,0});
|
||||
PointSource (Origin,point_source);
|
||||
|
||||
std::vector<LatticePropagator> PointProps(nmass,UGrid);
|
||||
// std::vector<LatticePropagator> FreeProps(nmass,UGrid);
|
||||
// LatticePropagator delta(UGrid);
|
||||
|
||||
for(int m=0;m<nmass;m++) {
|
||||
Solve(*FermActs[m],point_source ,PointProps[m]);
|
||||
// MasslessFreePropagator(*FermActs[m],point_source ,FreeProps[m]);
|
||||
|
||||
// delta = PointProps[m] - FreeProps[m];
|
||||
// std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl;
|
||||
}
|
||||
|
||||
LatticeComplex phase(UGrid);
|
||||
Coordinate mom({0,0,0,0});
|
||||
MakePhase(mom,phase);
|
||||
|
||||
for(int m1=0 ;m1<nmass;m1++) {
|
||||
for(int m2=m1;m2<nmass;m2++) {
|
||||
std::stringstream ssp,ssg,ssz;
|
||||
|
||||
ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
|
||||
ssz<<config<< "_m" << m1 << "_m"<< m2 << "_free_meson.xml";
|
||||
|
||||
std::cout << "CG determined VV correlation function"<<std::endl;
|
||||
MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
|
||||
|
||||
// std::cout << "FFT derived VV correlation function"<<std::endl;
|
||||
// MesonTrace(ssz.str(),FreeProps[m1],FreeProps[m2],phase);
|
||||
}}
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,433 +0,0 @@
|
||||
/*
|
||||
* Warning: This code is illustrative only: not well tested, and not meant for production use
|
||||
* without regression / tests being applied
|
||||
*/
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
RealD LLscale =1.0;
|
||||
RealD LCscale =1.0;
|
||||
|
||||
/**
 * Gauge-covariant Laplacian built from covariant shifts of the gauge links.
 *
 * M() accumulates, for each direction mu = 0 .. Nd-2,
 *     2*in - CovShiftForward(U_mu,mu,in) - CovShiftBackward(U_mu,mu,in).
 * The direction with index Nd-1 is left out of the sum
 * (NOTE(review): presumably the time direction -- confirm).
 */
template<class Gimpl,class Field>
class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
  INHERIT_GIMPL_TYPES(Gimpl);

  GridBase  *grid; // grid of the gauge field handed to the constructor
  GaugeField U;    // copy of that gauge field, held by value

  CovariantLaplacianCshift(GaugeField &_U) : grid(_U.Grid()), U(_U) {}

  virtual GridBase *Grid(void) { return grid; }

  // out = sum over the first Nd-1 directions of the covariant second difference of `in`.
  virtual void M(const Field &in, Field &out)
  {
    const int ndir = Nd-1;
    out = Zero();
    for(int dir=0; dir<ndir; dir++) {
      // NB: inefficient -- the link field is re-extracted from U on every application.
      GaugeLinkField link = PeekIndex<LorentzIndex>(U, dir);
      out = out - Gimpl::CovShiftForward (link,dir,in);
      out = out - Gimpl::CovShiftBackward(link,dir,in);
      out = out + 2.0*in;
    }
  }

  // The Laplacian is hermitian, so Mdag simply forwards to M.
  virtual void Mdag(const Field &in, Field &out) { M(in,out); }

  // The remaining operators are unimplemented (needed only for multigrid) and abort if called.
  virtual void Mdiag  (const Field &in, Field &out)                  { assert(0); }
  virtual void Mdir   (const Field &in, Field &out,int dir, int disp){ assert(0); }
  virtual void MdirAll(const Field &in, std::vector<Field> &out)     { assert(0); }
};
|
||||
|
||||
/**
 * Fill `phase` with exp( i * sum_mu (2*pi/L_mu) * mom[mu] * x_mu ).
 *
 * L_mu is the global lattice extent of phase's grid in direction mu and
 * x_mu is the site coordinate in that direction.
 *
 * @param mom    integer momentum components, indexed 0 .. Nd-1
 * @param phase  output field; its previous contents are overwritten
 */
void MakePhase(Coordinate mom,LatticeComplex &phase)
{
  GridBase *grid = phase.Grid();
  auto extents   = grid->GlobalDimensions();

  LatticeComplex x_mu(grid);

  // First accumulate the real-valued exponent p.x ...
  phase = Zero();
  for(int mu=0; mu<Nd; mu++){
    LatticeCoordinate(x_mu,mu);
    RealD unit = M_PI * 2.0/ extents[mu];   // 2*pi / L_mu
    phase = phase + (unit * mom[mu]) * x_mu;
  }

  // ... then exponentiate i * p.x.
  ComplexD imag_unit(0.0,1.0);
  phase = exp(phase*imag_unit);
}
|
||||
|
||||
/**
 * Build a point source: `source` is zeroed everywhere and a SpinColourMatrix
 * that was assigned the value 1.0 is poked onto the single site `coor`.
 */
void PointSource(Coordinate &coor,LatticePropagator &source)
{
  SpinColourMatrix unit;
  unit = 1.0;

  source = Zero();
  pokeSite(unit,source,coor);
}
|
||||
/**
 * Build a noise wall source on the time slice `tslice`.
 *
 * A bernoulli draw (0 or 1, 50:50) is mapped via (2*noise - (1+i))/sqrt(2),
 * masked to zero on every site whose Tdir coordinate differs from `tslice`,
 * and multiplied into a propagator that was set to 1.0.
 * The norm of the resulting source is printed to stdout.
 */
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
  GridBase *grid = source.Grid();

  LatticeComplex noise(grid);
  LatticeComplex nothing(grid);
  nothing = Zero();
  LatticeInteger tcoor(grid);

  RealD rescale = 1.0/sqrt(2);
  bernoulli(RNG, noise);                        // 0,1 50:50
  noise = (2.*noise - Complex(1,1))*rescale;

  // Keep the noise on the requested time slice only.
  LatticeCoordinate(tcoor,Tdir);
  noise = where(tcoor==Integer(tslice), noise, nothing);

  source = 1.0;
  source = source*noise;
  std::cout << " Z2 wall " << norm2(source) << std::endl;
}
|
||||
template<class Field>
|
||||
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
|
||||
{
|
||||
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
|
||||
Laplacian_t Laplacian(U);
|
||||
|
||||
Integer Iterations = 40;
|
||||
Real width = 2.0;
|
||||
Real coeff = (width*width) / Real(4*Iterations);
|
||||
|
||||
Field tmp(U.Grid());
|
||||
smeared=unsmeared;
|
||||
// chi = (1-p^2/2N)^N kronecker
|
||||
for(int n = 0; n < Iterations; ++n) {
|
||||
Laplacian.M(smeared,tmp);
|
||||
smeared = smeared - coeff*tmp;
|
||||
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
|
||||
}
|
||||
}
|
||||
/**
 * Build a smeared point source: a point source is placed at `site`, copied,
 * and the copy is passed through GaussianSmear with the result written back
 * into `source`. The norm before and after smearing is printed.
 */
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
  PointSource(site,source);
  std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;

  // GaussianSmear reads one field and writes another, so smear from a copy.
  LatticePropagator unsmeared(source.Grid());
  unsmeared = source;
  GaussianSmear(U,unsmeared,source);
  std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
|
||||
/**
 * Build a smeared wall source: a Z2 wall source on time slice `tslice`
 * is generated, copied, and the copy is smeared back into `source`.
 */
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
  Z2WallSource(RNG,tslice,source);

  LatticePropagator unsmeared = source;
  GaussianSmear(U,unsmeared,source);
}
|
||||
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
|
||||
{
|
||||
assert(mom.size()==Nd);
|
||||
assert(mom[Tdir] == 0);
|
||||
|
||||
GridBase * grid = spectator.Grid();
|
||||
|
||||
|
||||
LatticeInteger ts(grid);
|
||||
LatticeCoordinate(ts,Tdir);
|
||||
source = Zero();
|
||||
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
|
||||
|
||||
LatticeComplex phase(grid);
|
||||
MakePhase(mom,phase);
|
||||
|
||||
source = source *phase;
|
||||
}
|
||||
|
||||
template<class Action>
|
||||
void MasslessFreePropagator(Action &D,LatticePropagator &source,LatticePropagator &propagator)
|
||||
{
|
||||
GridBase *UGrid = source.Grid();
|
||||
GridBase *FGrid = D.FermionGrid();
|
||||
bool fiveD = true; //calculate 4d free propagator
|
||||
RealD mass = D.Mass();
|
||||
LatticeFermion src4 (UGrid);
|
||||
LatticeFermion result4 (UGrid);
|
||||
LatticeFermion result5(FGrid);
|
||||
LatticeFermion src5(FGrid);
|
||||
LatticePropagator prop5(FGrid);
|
||||
for(int s=0;s<Nd;s++){
|
||||
for(int c=0;c<Nc;c++){
|
||||
|
||||
PropToFerm<Action>(src4,source,s,c);
|
||||
|
||||
D.ImportPhysicalFermionSource(src4,src5);
|
||||
D.FreePropagator(src5,result5,mass,true);
|
||||
std::cout<<GridLogMessage
|
||||
<<"spin "<<s<<" color "<<c
|
||||
<<" norm2(src5d) " <<norm2(src5)
|
||||
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
|
||||
|
||||
D.ExportPhysicalFermionSolution(result5,result4);
|
||||
|
||||
FermToProp<Action>(prop5,result5,s,c);
|
||||
FermToProp<Action>(propagator,result4,s,c);
|
||||
}
|
||||
}
|
||||
|
||||
LatticePropagator Vector_mu(UGrid);
|
||||
LatticeComplex VV (UGrid);
|
||||
std::vector<TComplex> sumVV;
|
||||
Gamma::Algebra GammaV[3] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ
|
||||
};
|
||||
for( int mu=0;mu<3;mu++ ) {
|
||||
Gamma gV(GammaV[mu]);
|
||||
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
|
||||
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
|
||||
sliceSum(VV,sumVV,Tdir);
|
||||
int Nt = sumVV.size();
|
||||
for(int t=0;t<Nt;t++){
|
||||
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
|
||||
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
|
||||
<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Action>
|
||||
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
|
||||
{
|
||||
GridBase *UGrid = D.GaugeGrid();
|
||||
GridBase *FGrid = D.FermionGrid();
|
||||
|
||||
LatticeFermion src4 (UGrid);
|
||||
LatticeFermion src5 (FGrid);
|
||||
LatticeFermion result5(FGrid);
|
||||
LatticeFermion result4(UGrid);
|
||||
LatticePropagator prop5(FGrid);
|
||||
|
||||
ConjugateGradient<LatticeFermion> CG(1.0e-6,100000);
|
||||
SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
|
||||
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
|
||||
for(int s=0;s<Nd;s++){
|
||||
for(int c=0;c<Nc;c++){
|
||||
PropToFerm<Action>(src4,source,s,c);
|
||||
|
||||
D.ImportPhysicalFermionSource(src4,src5);
|
||||
|
||||
result5=Zero();
|
||||
schur(D,src5,result5,ZG);
|
||||
std::cout<<GridLogMessage
|
||||
<<"spin "<<s<<" color "<<c
|
||||
<<" norm2(src5d) " <<norm2(src5)
|
||||
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
|
||||
|
||||
D.ExportPhysicalFermionSolution(result5,result4);
|
||||
|
||||
FermToProp<Action>(prop5,result5,s,c);
|
||||
FermToProp<Action>(propagator,result4,s,c);
|
||||
}
|
||||
}
|
||||
LatticePropagator Axial_mu(UGrid);
|
||||
LatticePropagator Vector_mu(UGrid);
|
||||
|
||||
LatticeComplex PA (UGrid);
|
||||
LatticeComplex VV (UGrid);
|
||||
LatticeComplex PJ5q(UGrid);
|
||||
LatticeComplex PP (UGrid);
|
||||
|
||||
std::vector<TComplex> sumPA;
|
||||
std::vector<TComplex> sumVV;
|
||||
std::vector<TComplex> sumPP;
|
||||
std::vector<TComplex> sumPJ5q;
|
||||
|
||||
Gamma g5(Gamma::Algebra::Gamma5);
|
||||
D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
|
||||
PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current
|
||||
sliceSum(PA,sumPA,Tdir);
|
||||
|
||||
int Nt{static_cast<int>(sumPA.size())};
|
||||
|
||||
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
|
||||
|
||||
PP = trace(adj(propagator)*propagator); // Pseudoscalar density
|
||||
sliceSum(PP,sumPP,Tdir);
|
||||
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
|
||||
|
||||
D.ContractJ5q(prop5,PJ5q);
|
||||
sliceSum(PJ5q,sumPJ5q,Tdir);
|
||||
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
|
||||
|
||||
Gamma::Algebra GammaV[3] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ
|
||||
};
|
||||
for( int mu=0;mu<3;mu++ ) {
|
||||
Gamma gV(GammaV[mu]);
|
||||
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
|
||||
// auto ss=sliceSum(Vector_mu,Tdir);
|
||||
// for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
|
||||
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
|
||||
sliceSum(VV,sumVV,Tdir);
|
||||
for(int t=0;t<Nt;t++){
|
||||
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
|
||||
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
|
||||
<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class MesonFile: Serializable {
|
||||
public:
|
||||
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
|
||||
};
|
||||
|
||||
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
|
||||
{
|
||||
const int nchannel=3;
|
||||
Gamma::Algebra Gammas[nchannel][2] = {
|
||||
{Gamma::Algebra::GammaX,Gamma::Algebra::GammaX},
|
||||
{Gamma::Algebra::GammaY,Gamma::Algebra::GammaY},
|
||||
// {Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ}
|
||||
{Gamma::Algebra::Gamma5,Gamma::Algebra::Gamma5}
|
||||
};
|
||||
|
||||
Gamma G5(Gamma::Algebra::Gamma5);
|
||||
|
||||
LatticeComplex meson_CF(q1.Grid());
|
||||
MesonFile MF;
|
||||
|
||||
for(int ch=0;ch<nchannel;ch++){
|
||||
|
||||
Gamma Gsrc(Gammas[ch][0]);
|
||||
Gamma Gsnk(Gammas[ch][1]);
|
||||
|
||||
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
|
||||
|
||||
std::vector<TComplex> meson_T;
|
||||
sliceSum(meson_CF,meson_T, Tdir);
|
||||
|
||||
int nt=meson_T.size();
|
||||
|
||||
std::vector<Complex> corr(nt);
|
||||
for(int t=0;t<nt;t++){
|
||||
corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
|
||||
std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *real(corr[t])<<std::endl;
|
||||
}
|
||||
MF.data.push_back(corr);
|
||||
}
|
||||
|
||||
{
|
||||
XmlWriter WR(file);
|
||||
write(WR,"MesonFile",MF);
|
||||
}
|
||||
}
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
const int Ls=8;
|
||||
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
// Double precision grids
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
||||
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
||||
GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// You can manage seeds however you like.
|
||||
// Recommend SeedUniqueString.
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// std::vector<int> seeds4({1,2,3,4});
|
||||
// GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
||||
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
std::string config;
|
||||
RealD M5=atof(getenv("M5"));
|
||||
RealD mq = atof(getenv("mass"));
|
||||
std::vector<RealD> masses({ mq} ); // u/d, s, c ??
|
||||
if( argc > 1 && argv[1][0] != '-' )
|
||||
{
|
||||
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
|
||||
FieldMetaData header;
|
||||
NerscIO::readConfiguration(Umu, header, argv[1]);
|
||||
config=argv[1];
|
||||
LLscale = 1.0;
|
||||
LCscale = 1.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
SU<Nc>::ColdConfiguration(Umu);
|
||||
config="ColdConfig";
|
||||
// RealD P=1.0; // Don't scale
|
||||
// RealD P=0.6153342; // 64I
|
||||
// RealD P=0.6388238 // 32Ifine
|
||||
// RealD P=0.5871119; // 48I
|
||||
// RealD u0 = sqrt(sqrt(P));
|
||||
// Umu = Umu * u0;
|
||||
RealD w0 = 1 - M5;
|
||||
LLscale = 1.0/(1-w0*w0)/(1-w0*w0);
|
||||
LCscale = 1.0/(1-w0*w0)/(1-w0*w0);
|
||||
std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
|
||||
std::cout<<GridLogMessage <<"M5 = "<<M5<<std::endl;
|
||||
std::cout<<GridLogMessage <<"mq = "<<mq<<std::endl;
|
||||
std::cout<<GridLogMessage <<"LLscale = "<<LLscale<<std::endl;
|
||||
std::cout<<GridLogMessage <<"LCscale = "<<LCscale<<std::endl;
|
||||
}
|
||||
|
||||
int nmass = masses.size();
|
||||
|
||||
std::vector<DomainWallFermionD *> FermActs;
|
||||
|
||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"DomainWallFermion action"<<std::endl;
|
||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
||||
|
||||
for(auto mass: masses) {
|
||||
|
||||
std::cout<<GridLogMessage <<"Making DomainWallFermion action"<<std::endl;
|
||||
FermActs.push_back(new DomainWallFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5));
|
||||
std::cout<<GridLogMessage <<"Made DomainWallFermion action"<<std::endl;
|
||||
|
||||
}
|
||||
|
||||
LatticePropagator point_source(UGrid);
|
||||
|
||||
Coordinate Origin({0,0,0,0});
|
||||
PointSource (Origin,point_source);
|
||||
|
||||
// std::vector<LatticePropagator> PointProps(nmass,UGrid);
|
||||
std::vector<LatticePropagator> FreeProps(nmass,UGrid);
|
||||
LatticePropagator delta(UGrid);
|
||||
|
||||
for(int m=0;m<nmass;m++) {
|
||||
// Solve(*FermActs[m],point_source ,PointProps[m]);
|
||||
MasslessFreePropagator(*FermActs[m],point_source ,FreeProps[m]);
|
||||
|
||||
// delta = PointProps[m] - FreeProps[m];
|
||||
// std::cout << " delta "<<norm2(delta) << " FFT "<<norm2(FreeProps[m])<< " CG " <<norm2(PointProps[m])<<std::endl;
|
||||
}
|
||||
|
||||
LatticeComplex phase(UGrid);
|
||||
Coordinate mom({0,0,0,0});
|
||||
MakePhase(mom,phase);
|
||||
|
||||
for(int m1=0 ;m1<nmass;m1++) {
|
||||
for(int m2=m1;m2<nmass;m2++) {
|
||||
std::stringstream ssp,ssg,ssz;
|
||||
|
||||
ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
|
||||
ssz<<config<< "_m" << m1 << "_m"<< m2 << "_free_meson.xml";
|
||||
|
||||
// std::cout << "CG determined VV correlation function"<<std::endl;
|
||||
// MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
|
||||
|
||||
std::cout << "FFT derived VV correlation function"<<std::endl;
|
||||
MesonTrace(ssz.str(),FreeProps[m1],FreeProps[m2],phase);
|
||||
}}
|
||||
|
||||
Grid_finalize();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
#PBS -q EarlyAppAccess
|
||||
##PBS -q EarlyAppAccess
|
||||
#PBS -q debug
|
||||
#PBS -l select=1
|
||||
#PBS -l walltime=00:20:00
|
||||
#PBS -A LatticeQCD_aesp_CNDA
|
||||
@@ -12,27 +13,24 @@ source ../sourceme.sh
|
||||
cp $PBS_NODEFILE nodefile
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||
unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||
export MPICH_OFI_NIC_POLICY=GPU
|
||||
|
||||
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||
export MPICH_OFI_NIC_POLICY=GPU
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||
|
||||
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||
./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.48 \
|
||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --debug-signals"
|
||||
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.96 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 8 "
|
||||
|
||||
#for f in 1 2 3 4 5 6 7 8
|
||||
for f in 1
|
||||
do
|
||||
echo $CMD
|
||||
$CMD | tee 1node.32.32.64.48.dwf.hbm.$f
|
||||
done
|
||||
$CMD
|
||||
|
||||
|
||||
74
systems/Aurora/benchmarks/bench16.pbs
Normal file
74
systems/Aurora/benchmarks/bench16.pbs
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/bin/bash

##PBS -q LatticeQCD_aesp_CNDA
#PBS -q debug-scaling
##PBS -q prod
#PBS -l select=16
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA

# 16-node Benchmark_dwf_fp32 job.
# Everything below uses paths relative to the submission directory
# (../sourceme.sh, ./gpu_tile.sh, ./Benchmark_dwf_fp32), so stop at once if
# that directory cannot be entered instead of running from the wrong place.
cd "$PBS_O_WORKDIR" || exit 1

source ../sourceme.sh

cp "$PBS_NODEFILE" nodefile

export OMP_NUM_THREADS=4
export MPICH_OFI_NIC_POLICY=GPU

#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16

#
# Local vol 16.16.16.32
#

LX=16
LY=16
LZ=16
LT=32

# Two-level rank decomposition. NOTE(review): N* looks like the node grid
# (product 16 = select=16) and G* the per-node rank grid (product 12 = -ppn 12);
# confirm before changing either.
NX=2
NY=2
NZ=4
NT=1

GX=2
GY=2
GZ=1
GT=3

# Total ranks per dimension
PX=$((NX * GX ))
PY=$((NY * GY ))
PZ=$((NZ * GZ ))
PT=$((NT * GT ))

# Global lattice extent per dimension = ranks x local extent
VX=$((PX * LX ))
VY=$((PY * LY ))
VZ=$((PZ * LZ ))
VT=$((PT * LT ))

NP=$((PX*PY*PZ*PT))
VOL=${VX}.${VY}.${VZ}.${VT}
AT=8
MPI=${PX}.${PY}.${PZ}.${PT}

CMD="mpiexec -np $NP -ppn 12 -envall \
	     ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi $MPI --grid $VOL \
	     --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "

echo VOL $VOL
echo MPI $MPI
echo NPROC $NP
echo $CMD
# Deliberately unquoted: the command line is stored as one string and relies
# on word splitting to become mpiexec's argument list.
$CMD
|
||||
|
||||
@@ -1,58 +1,48 @@
|
||||
#!/bin/bash
|
||||
|
||||
#PBS -q EarlyAppAccess
|
||||
##PBS -q EarlyAppAccess
|
||||
#PBS -q debug
|
||||
#PBS -l select=2
|
||||
#PBS -l walltime=00:20:00
|
||||
#PBS -A LatticeQCD_aesp_CNDA
|
||||
|
||||
#export OMP_PROC_BIND=spread
|
||||
#unset OMP_PLACES
|
||||
|
||||
cd $PBS_O_WORKDIR
|
||||
|
||||
source ../sourceme.sh
|
||||
#module load pti-gpu
|
||||
|
||||
|
||||
cp $PBS_NODEFILE nodefile
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||
export MPICH_OFI_NIC_POLICY=GPU
|
||||
|
||||
#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||
export MPICH_OFI_NIC_POLICY=GPU
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||
|
||||
# 12 ppn, 2 nodes, 24 ranks
|
||||
#
|
||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||
./gpu_tile.sh \
|
||||
./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||
#$CMD | tee 2node.comms.hbm
|
||||
# Local vol 16.16.16.32
|
||||
#
|
||||
|
||||
#VOL=32.64.64.96
|
||||
|
||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
|
||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap --debug-signals"
|
||||
|
||||
#for f in 1 2 3 4 5 6 7 8
|
||||
for f in 1
|
||||
for VOL in 32.32.32.96 32.64.64.96
|
||||
do
|
||||
for AT in 32
|
||||
do
|
||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||
./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid $VOL \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads $AT --comms-overlap "
|
||||
|
||||
echo $CMD
|
||||
$CMD | tee 2node.32.32.64.48.dwf.hbm.$f
|
||||
$CMD
|
||||
done
|
||||
done
|
||||
|
||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||
./gpu_tile.sh \
|
||||
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
|
||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||
#$CMD | tee 2node.64.64.64.96.dwf.hbm
|
||||
|
||||
|
||||
@@ -4,10 +4,12 @@
|
||||
#export NUMA_MAP=(0 0 1 1 0 0 1 1 0 0 1 1);
|
||||
#export GPU_MAP=(0.0 0.1 3.0 3.1 1.0 1.1 4.0 4.1 2.0 2.1 5.0 5.1)
|
||||
|
||||
export NUMA_MAP=(0 0 0 0 0 0 1 1 1 1 1 1 );
|
||||
export NUMA_PMAP=(0 0 0 1 1 1 0 0 0 1 1 1 );
|
||||
export NUMA_HMAP=(2 2 2 3 3 3 3 2 2 2 2 3 3 3 );
|
||||
export GPU_MAP=(0.0 1.0 2.0 3.0 4.0 5.0 0.1 1.1 2.1 3.1 4.1 5.1 )
|
||||
|
||||
export NUMA=${NUMA_MAP[$PALS_LOCAL_RANKID]}
|
||||
export NUMAP=${NUMA_PMAP[$PALS_LOCAL_RANKID]}
|
||||
export NUMAH=${NUMA_HMAP[$PALS_LOCAL_RANKID]}
|
||||
export gpu_id=${GPU_MAP[$PALS_LOCAL_RANKID]}
|
||||
|
||||
unset EnableWalkerPartition
|
||||
@@ -17,18 +19,19 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
|
||||
|
||||
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
|
||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:5
|
||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
|
||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
|
||||
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1
|
||||
|
||||
#export MPI_BUF_NUMA=$NUMAH
|
||||
|
||||
echo "rank $PALS_RANKID ; local rank $PALS_LOCAL_RANKID ; ZE_AFFINITY_MASK=$ZE_AFFINITY_MASK ; NUMA $NUMA "
|
||||
|
||||
if [ $PALS_RANKID = "0" ]
|
||||
then
|
||||
# numactl -m $NUMA -N $NUMA onetrace --chrome-device-timeline "$@"
|
||||
# numactl -m $NUMA -N $NUMA unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
|
||||
numactl -m $NUMA -N $NUMA "$@"
|
||||
# numactl -p $NUMAP -N $NUMAP unitrace --chrome-kernel-logging --chrome-mpi-logging --chrome-sycl-logging --demangle "$@"
|
||||
numactl -p $NUMAP -N $NUMAP "$@"
|
||||
else
|
||||
numactl -m $NUMA -N $NUMA "$@"
|
||||
numactl -p $NUMAP -N $NUMAP "$@"
|
||||
fi
|
||||
@@ -1,17 +1,25 @@
|
||||
#Ahead of time compile for PVC
|
||||
|
||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions "
|
||||
../../configure \
|
||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
|
||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
|
||||
|
||||
#JIT compile
|
||||
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
|
||||
|
||||
../configure \
|
||||
--enable-simd=GPU \
|
||||
--enable-reduction=grid \
|
||||
--enable-gen-simd-width=64 \
|
||||
--enable-comms=mpi-auto \
|
||||
--enable-debug \
|
||||
--prefix $HOME/gpt-install \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--with-lime=$CLIME \
|
||||
--enable-shm=nvlink \
|
||||
--enable-accelerator=sycl \
|
||||
--enable-accelerator-aware-mpi=yes\
|
||||
--enable-accelerator-aware-mpi=no\
|
||||
--enable-unified=no \
|
||||
MPICXX=mpicxx \
|
||||
CXX=icpx
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
module load oneapi/release/2023.12.15.001
|
||||
#module load oneapi/release/2023.12.15.001
|
||||
#module load mpich/icc-all-debug-pmix-gpu/52.2
|
||||
#module load mpich-config/mode/deterministic
|
||||
#module load intel_compute_runtime/release/821.35
|
||||
module load pti-gpu
|
||||
|
||||
source ~/spack/share/spack/setup-env.sh
|
||||
spack load c-lime
|
||||
spack load openssl
|
||||
|
||||
22
systems/Frontier-rocm631/config-command
Normal file
22
systems/Frontier-rocm631/config-command
Normal file
@@ -0,0 +1,22 @@
|
||||
# Configure Grid for the HIP accelerator target.
# Reads OLCF_GMP_ROOT, FFTW_DIR, ROCM_PATH, MPICH_DIR and CRAY_MPICH_ROOTDIR
# from the environment; CLIME is resolved from spack below.
# NOTE(review): the "cut -c 15-" offset presumably strips the leading
# "c-lime@2-3-9  " column of spack's output, leaving the install prefix; it
# breaks if the spec string length changes.
CLIME=$(spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-)
../../configure --enable-comms=mpi-auto \
--with-lime=$CLIME \
--enable-unified=no \
--enable-shm=nvlink \
--enable-tracing=none \
--enable-accelerator=hip \
--enable-gen-simd-width=64 \
--disable-gparity \
--disable-fermion-reps \
--enable-simd=GPU \
--with-gmp=$OLCF_GMP_ROOT \
--with-fftw=$FFTW_DIR/.. \
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
CXX=hipcc MPICXX=mpicxx \
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
|
||||
|
||||
|
||||
|
||||
|
||||
16
systems/Frontier-rocm631/sourceme631.sh
Normal file
16
systems/Frontier-rocm631/sourceme631.sh
Normal file
@@ -0,0 +1,16 @@
|
||||
|
||||
# Environment for the rocm/6.3.1 build: spack setup, compiler/FFTW/GPU-target
# modules, and the MPFR runtime library directory.
echo spack
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh

#module load cce/15.0.1

# Loaded in the same order as before, in a single invocation.
module load rocm/6.3.1 cray-fftw craype-accel-amd-gfx90a

# Prepend the MPFR 3.1.4 library directory to the runtime search path.
export LD_LIBRARY_PATH="/opt/gcc/mpfr/3.1.4/lib:${LD_LIBRARY_PATH}"

#Ugly hacks to get down level software working on current system
#export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
#ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
|
||||
|
||||
@@ -30,14 +30,10 @@ source ${root}/sourceme.sh
|
||||
|
||||
export OMP_NUM_THREADS=7
|
||||
export MPICH_GPU_SUPPORT_ENABLED=1
|
||||
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
|
||||
|
||||
for vol in 32.32.32.64
|
||||
#export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
|
||||
#64.64.32.96
|
||||
for vol in 64.64.32.64
|
||||
do
|
||||
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.ov.$vol
|
||||
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.ov.$vol
|
||||
|
||||
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 0 --grid $vol > log.shm0.seq.$vol
|
||||
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-sequential --shm 2048 --shm-mpi 1 --grid $vol > log.shm1.seq.$vol
|
||||
srun ./select_gpu ./Benchmark_dwf_fp32 --mpi 2.2.2.2 --accelerator-threads 8 --comms-overlap --shm 2048 --shm-mpi 0 --grid $vol -Ls 16
|
||||
done
|
||||
|
||||
|
||||
@@ -3,20 +3,19 @@ CLIME=`spack find --paths c-lime@2-3-9 | grep c-lime| cut -c 15-`
|
||||
--with-lime=$CLIME \
|
||||
--enable-unified=no \
|
||||
--enable-shm=nvlink \
|
||||
--enable-tracing=timer \
|
||||
--enable-tracing=none \
|
||||
--enable-accelerator=hip \
|
||||
--enable-gen-simd-width=64 \
|
||||
--disable-gparity \
|
||||
--disable-fermion-reps \
|
||||
--enable-simd=GPU \
|
||||
--enable-accelerator-cshift \
|
||||
--with-gmp=$OLCF_GMP_ROOT \
|
||||
--with-fftw=$FFTW_DIR/.. \
|
||||
--with-mpfr=/opt/cray/pe/gcc/mpfr/3.1.4/ \
|
||||
--disable-fermion-reps \
|
||||
CXX=hipcc MPICXX=mpicxx \
|
||||
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
|
||||
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas"
|
||||
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 " \
|
||||
LDFLAGS="-L/lib64 -L${ROCM_PATH}/lib -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lhipblas -lrocblas"
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,12 +1,25 @@
|
||||
|
||||
echo spack
|
||||
. /autofs/nccs-svm1_home1/paboyle/Crusher/Grid/spack/share/spack/setup-env.sh
|
||||
spack load c-lime
|
||||
module load emacs
|
||||
module load PrgEnv-gnu
|
||||
module load rocm
|
||||
module load cray-mpich
|
||||
module load gmp
|
||||
|
||||
module load cce/15.0.1
|
||||
module load rocm/5.3.0
|
||||
module load cray-fftw
|
||||
module load craype-accel-amd-gfx90a
|
||||
|
||||
#Ugly hacks to get down level software working on current system
|
||||
export LD_LIBRARY_PATH=/opt/cray/libfabric/1.20.1/lib64/:$LD_LIBRARY_PATH
|
||||
export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
|
||||
export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
|
||||
ln -s /opt/rocm-6.0.0/lib/libamdhip64.so.6 .
|
||||
|
||||
#echo spack load c-lime
|
||||
#spack load c-lime
|
||||
#module load emacs
|
||||
##module load PrgEnv-gnu
|
||||
##module load cray-mpich
|
||||
##module load cray-fftw
|
||||
##module load craype-accel-amd-gfx90a
|
||||
##export LD_LIBRARY_PATH=/opt/gcc/mpfr/3.1.4/lib:$LD_LIBRARY_PATH
|
||||
#Hack for lib
|
||||
#export LD_LIBRARY_PATH=`pwd`:$LD_LIBRARY_PATH
|
||||
##export LD_LIBRARY_PATH=`pwd`/:$LD_LIBRARY_PATH
|
||||
|
||||
18
systems/Linux-cuda/config-command
Normal file
18
systems/Linux-cuda/config-command
Normal file
@@ -0,0 +1,18 @@
|
||||
# Configure Grid for the CUDA accelerator target with MPI comms.
# Reads CLIME, HDF5, FFTW, GMP, MPFR and NVIDIALIB from the environment.
../../configure \
    --enable-accelerator=cuda \
    --enable-simd=GPU \
    --enable-gen-simd-width=64 \
    --enable-comms=mpi \
    --enable-shm=nvlink \
    --disable-unified \
    --disable-gparity \
    --disable-fermion-reps \
    --with-lime=${CLIME} \
    --with-hdf5=${HDF5} \
    --with-fftw=${FFTW} \
    --with-gmp=${GMP} \
    --with-mpfr=${MPFR} \
    CXX=nvcc \
    CXXFLAGS="-ccbin mpicxx -gencode arch=compute_80,code=sm_80 -std=c++17 -cudart shared" \
    LDFLAGS="-cudart shared -L${NVIDIALIB} -lcublas"
|
||||
|
||||
16
systems/Linux-cuda/sourceme.sh
Normal file
16
systems/Linux-cuda/sourceme.sh
Normal file
@@ -0,0 +1,16 @@
|
||||
# Environment for the Linux CUDA build: load the toolchain and libraries from
# spack, then export their install prefixes for use by config-command.
. /home/paboyle/spack/share/spack/setup-env.sh
spack load cuda@12.0.0
spack load c-lime
spack load gmp
spack load mpfr
spack load hdf5
spack load fftw
spack load openmpi

# Install prefixes, taken from "spack find --paths".
# NOTE(review): the cut offsets presumably skip the leading "name@version  "
# column; they depend on the length of each version string, so verify them
# after any package upgrade.
export FFTW=`spack find --paths fftw    | grep fftw   | cut -c 14-`
export HDF5=`spack find --paths hdf5    | grep hdf5   | cut -c 14-`
# Resolve the same CUDA release that is loaded above. This previously queried
# cuda@11.8.0, so NVIDIALIB could point at a different toolkit from the one
# loaded (or be empty if 11.8.0 is not installed).
# NOTE(review): confirm 12.0.0 is the intended version.
export CUDA=`spack find --paths cuda@12.0.0    | grep cuda   | cut -c 14-`
export CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
export GMP=`spack find --paths gmp      | grep gmp    | cut -c 12-`
export MPFR=`spack find --paths mpfr    | grep mpfr   | cut -c 13-`
export NVIDIALIB=$CUDA/targets/x86_64-linux/lib/
export LD_LIBRARY_PATH=$NVIDIALIB:$LD_LIBRARY_PATH:$HDF5/lib:$FFTW/lib:$CLIME/lib/:$MPFR/lib
|
||||
@@ -1,7 +1,7 @@
|
||||
spack load c-lime
|
||||
spack load gmp
|
||||
spack load mpfr
|
||||
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 15-`
|
||||
CLIME=`spack find --paths c-lime | grep c-lime| cut -c 13-`
|
||||
GMP=`spack find --paths gmp | grep gmp | cut -c 12-`
|
||||
MPFR=`spack find --paths mpfr | grep mpfr | cut -c 13-`
|
||||
echo clime X$CLIME
|
||||
|
||||
206
systems/WorkArounds.txt
Normal file
206
systems/WorkArounds.txt
Normal file
@@ -0,0 +1,206 @@
|
||||
The purpose of this file is to collate all non-obvious known magic shell variables
|
||||
and compiler flags required for either correctness or performance on various systems.
|
||||
|
||||
A repository of work-arounds.
|
||||
|
||||
Contents:
|
||||
1. Interconnect + MPI
|
||||
2. Compilation
|
||||
3. Profiling
|
||||
|
||||
************************
|
||||
* 1. INTERCONNECT + MPI
|
||||
************************
|
||||
|
||||
--------------------------------------------------------------------
|
||||
MPI2-IO correctness: force OpenMPI to use the MPICH romio implementation for parallel I/O
|
||||
--------------------------------------------------------------------
|
||||
export OMPI_MCA_io=romio321
|
||||
|
||||
--------------------------------------
|
||||
ROMIO fails with > 2GB per node read (32 bit issue)
|
||||
--------------------------------------
|
||||
|
||||
Use later MPICH
|
||||
|
||||
https://github.com/paboyle/Grid/issues/381
|
||||
|
||||
https://github.com/pmodels/mpich/commit/3a479ab0
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Slingshot: Frontier and Perlmutter libfabric slow down
|
||||
and physical memory fragmentation
|
||||
--------------------------------------------------------------------
|
||||
export FI_MR_CACHE_MONITOR=disabled
|
||||
or
|
||||
export FI_MR_CACHE_MONITOR=kdreg2
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Perlmutter
|
||||
--------------------------------------------------------------------
|
||||
|
||||
export MPICH_RDMA_ENABLED_CUDA=1
|
||||
export MPICH_GPU_IPC_ENABLED=1
|
||||
export MPICH_GPU_EAGER_REGISTER_HOST_MEM=0
|
||||
export MPICH_GPU_NO_ASYNC_MEMCPY=0
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Frontier/LumiG
|
||||
--------------------------------------------------------------------
|
||||
|
||||
Hiding ROCR_VISIBLE_DEVICES triggers SDMA engines to be used for GPU-GPU
|
||||
|
||||
cat << EOF > select_gpu
|
||||
#!/bin/bash
|
||||
export MPICH_GPU_SUPPORT_ENABLED=1
|
||||
export MPICH_SMP_SINGLE_COPY_MODE=XPMEM
|
||||
export GPU_MAP=(0 1 2 3 7 6 5 4)
|
||||
export NUMA_MAP=(3 3 1 1 2 2 0 0)
|
||||
export GPU=\${GPU_MAP[\$SLURM_LOCALID]}
|
||||
export NUMA=\${NUMA_MAP[\$SLURM_LOCALID]}
|
||||
export HIP_VISIBLE_DEVICES=\$GPU
|
||||
unset ROCR_VISIBLE_DEVICES
|
||||
echo RANK \$SLURM_LOCALID using GPU \$GPU
|
||||
exec numactl -m \$NUMA -N \$NUMA \$*
|
||||
EOF
|
||||
chmod +x ./select_gpu
|
||||
|
||||
srun ./select_gpu BINARY
|
||||
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Mellanox performance with A100 GPU (Tursa, Booster, Leonardo)
|
||||
--------------------------------------------------------------------
|
||||
export OMPI_MCA_btl=^uct,openib
|
||||
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
|
||||
export UCX_RNDV_SCHEME=put_zcopy
|
||||
export UCX_RNDV_THRESH=16384
|
||||
export UCX_IB_GPU_DIRECT_RDMA=yes
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Mellanox + A100 correctness (Tursa, Booster, Leonardo)
|
||||
--------------------------------------------------------------------
|
||||
export UCX_MEMTYPE_CACHE=n
|
||||
|
||||
--------------------------------------------------------------------
|
||||
MPICH/Aurora/PVC correctness and performance
|
||||
--------------------------------------------------------------------
|
||||
|
||||
https://github.com/pmodels/mpich/issues/7302
|
||||
|
||||
--enable-cuda-aware-mpi=no
|
||||
--enable-unified=no
|
||||
|
||||
Grid's internal D-H-H-D pipeline mode, avoid device memory in MPI
|
||||
Do not use SVM
|
||||
|
||||
Ideally use MPICH with fix to issue 7302:
|
||||
|
||||
https://github.com/pmodels/mpich/pull/7312
|
||||
|
||||
Ideally:
|
||||
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE=generic
|
||||
|
||||
Alternatives:
|
||||
export MPIR_CVAR_NOLOCAL=1
|
||||
export MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD=1000000000
|
||||
|
||||
--------------------------------------------------------------------
|
||||
MPICH/Aurora/PVC correctness and performance
|
||||
--------------------------------------------------------------------
|
||||
|
||||
Broken:
|
||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||
|
||||
This gives good performance without requiring
|
||||
--enable-cuda-aware-mpi=no
|
||||
|
||||
But is an open issue reported by James Osborn
|
||||
https://github.com/pmodels/mpich/issues/7139
|
||||
|
||||
Possibly resolved but unclear if in the installed software yet.
|
||||
|
||||
************************
|
||||
* 2. COMPILATION
|
||||
************************
|
||||
|
||||
--------------------------------------------------------------------
|
||||
G++ compiler breakage / graveyard
|
||||
--------------------------------------------------------------------
|
||||
|
||||
9.3.0, 10.3.1,
|
||||
https://github.com/paboyle/Grid/issues/290
|
||||
https://github.com/paboyle/Grid/issues/264
|
||||
|
||||
Working (-) Broken (X):
|
||||
|
||||
4.9.0 -
|
||||
4.9.1 -
|
||||
5.1.0 X
|
||||
5.2.0 X
|
||||
5.3.0 X
|
||||
5.4.0 X
|
||||
6.1.0 X
|
||||
6.2.0 X
|
||||
6.3.0 -
|
||||
7.1.0 -
|
||||
8.0.0 (HEAD) -
|
||||
|
||||
https://github.com/paboyle/Grid/issues/100
|
||||
|
||||
--------------------------------------------------------------------
|
||||
AMD GPU nodes :
|
||||
--------------------------------------------------------------------
|
||||
|
||||
multiple ROCM versions broken; use 5.3.0
|
||||
manifests itself as wrong results in fp32
|
||||
|
||||
https://github.com/paboyle/Grid/issues/464
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Aurora/PVC
|
||||
--------------------------------------------------------------------
|
||||
|
||||
SYCL ahead of time compilation (fixes rare runtime JIT errors and faster runtime, PB)
|
||||
SYCL slow link and relocatable code issues (Christoph Lehner)
|
||||
Opt large register file required for good performance in fp64
|
||||
|
||||
|
||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
|
||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fPIC"
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Aurora/PVC useful extra options
|
||||
--------------------------------------------------------------------
|
||||
|
||||
Host only sanitizer:
|
||||
-Xarch_host -fsanitize=leak
|
||||
-Xarch_host -fsanitize=address
|
||||
|
||||
Deterministic MPI reduction:
|
||||
export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
|
||||
export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
|
||||
export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
|
||||
unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
|
||||
unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
|
||||
unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
|
||||
|
||||
|
||||
|
||||
************************
|
||||
* 3. Visual profile tools
|
||||
************************
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Frontier/rocprof
|
||||
--------------------------------------------------------------------
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Aurora/unitrace
|
||||
--------------------------------------------------------------------
|
||||
|
||||
|
||||
--------------------------------------------------------------------
|
||||
Tursa/nsight-sys
|
||||
--------------------------------------------------------------------
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user